In [None]:
import os
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

# Load only starting pitchers' IDs
pitchers_df = pd.read_csv("Starting_Pitchers_IDs.csv")  # NEW (only starters)
pitchers = pitchers_df["Pitcher_ID"].tolist()

# List of seasons to scrape
seasons = [2020, 2021, 2022, 2023, 2024]

# Set up Chrome options (headless mode for faster scraping)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Runs Chrome in the background
chrome_options.add_argument("--disable-dev-shm-usage")  # Reduces memory usage
chrome_options.add_argument("--disable-gpu")  # Helps prevent crashes
chrome_options.add_argument("--no-sandbox")  # Improves stability
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Function to scrape game logs for a given pitcher and season
def scrape_pitcher_logs(pitcher_id, season):
    url = f"https://www.baseball-reference.com/players/gl.fcgi?id={pitcher_id}&t=p&year={season}"
    
    print(f"🔎 Fetching: {url}")

    # Start a new WebDriver session for each pitcher
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.set_page_load_timeout(300)  # Increased timeout to 5 minutes

    try:
        driver.get(url)
        time.sleep(random.uniform(3, 7))  # Reduced wait time

        # Check if the page returned a 404 error
        if "Page Not Found" in driver.page_source or "The page you are looking for" in driver.page_source:
            print(f"⚠️ 404 Error: No game logs found for {pitcher_id} in {season}")
            driver.quit()
            return None  # Skip this pitcher/season

    except Exception as e:
        print(f"⚠️ Page load failed for {pitcher_id} in {season}: {e}")
        driver.quit()
        return None

    # Get page source after loading
    html = driver.page_source
    driver.quit()

    if not html or len(html.strip()) == 0:
        print(f"⚠️ No HTML content found for {pitcher_id} in {season}")
        return None

    soup = BeautifulSoup(html, "html.parser")

    # Find game log table
    table = soup.find("table", {"id": "pitching_gamelogs"})
    if table is None:
        print(f"⚠️ No data table found for {pitcher_id} in {season}")
        return None

    # Convert table to DataFrame
    df = pd.read_html(str(table))[0]

    # Add metadata
    df["Pitcher_ID"] = pitcher_id
    df["Season"] = season

    return df

# Function to scrape multiple pitchers over multiple seasons (parallel processing)
def scrape_all_pitchers_parallel(pitchers, seasons):
    with ThreadPoolExecutor(max_workers=5) as executor:  # Run 10 scrapers at once
        futures = [executor.submit(scrape_pitcher_logs, pitcher, season) for pitcher in pitchers for season in seasons]

    # Collect results and save
    all_data = [future.result() for future in futures if future.result() is not None]

    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        final_df.to_csv("MLB_Pitcher_Game_Logs.csv", index=False)
        print("✅ Data saved to MLB_Pitcher_Game_Logs.csv")
    else:
        print("❌ No new data was collected.")

# Function to ensure all ChromeDriver processes are closed
def cleanup_selenium():
    """Ensures all ChromeDriver processes are stopped."""
    if os.name == "nt":  # Windows
        os.system("taskkill /F /IM chromedriver.exe /T")
    else:  # macOS/Linux
        os.system("pkill -f chromedriver")

# Run Scraper (Parallel Processing)
scrape_all_pitchers_parallel(pitchers, seasons)

# Run the cleanup function at the end of the script
cleanup_selenium()

print("🚀 Scraping complete. All ChromeDriver processes have been closed.")


🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=berrijo01&t=p&year=2020
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=berrijo01&t=p&year=2021
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=berrijo01&t=p&year=2022
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=berrijo01&t=p&year=2023
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=berrijo01&t=p&year=2024
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=gausmke01&t=p&year=2020
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=gausmke01&t=p&year=2021
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=gausmke01&t=p&year=2022
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=gausmke01&t=p&year=2023
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=gausmke01&t=p&year=2024
🔎 Fetching: https://www.baseball-reference.com/players/gl.fcgi?id=bassich01&t=p&year=2020
🔎 Fetching

WebDriverException: Message: Can not connect to the Service /Users/jjbogner/.wdm/drivers/chromedriver/mac64/134.0.6998.88/chromedriver-mac-arm64/chromedriver
