In [23]:
import os
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Abbreviations of the 30 MLB teams, in the order they will be processed
team_abbreviations = [
    'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW',
    'CIN', 'CLE', 'COL', 'DET', 'HOU', 'KCR',
    'LAA', 'LAD', 'MIA', 'MIL', 'MIN', 'NYM',
    'NYY', 'OAK', 'PHI', 'PIT', 'SDP', 'SEA',
    'SFG', 'STL', 'TBR', 'TEX', 'TOR', 'WSN',
]

# Season to scrape, and the directory that receives the per-team CSV files
year = 2022
output_folder = "season_win_percentage"
os.makedirs(output_folder, exist_ok=True)  # no error if it already exists

# Start Chrome without a visible window
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(
    options=options,
)

# One rendered schedule row; group 1 is the row's inner HTML. Matching row by
# row keeps the cell pattern below from pairing cells of different rows.
_SCHEDULE_ROW_RE = re.compile(r'<tr data-row="\d+">(.*?)</tr>', re.DOTALL)

# The five cells of interest inside a single row, in the order they must
# appear: date (taken from the cell's csk attribute), team, home/visitor
# marker, opponent and win/loss result.
_SCHEDULE_CELLS_RE = re.compile(
    r'data-stat="date_game"[^>]*?csk="([^"]+)"[^>]*?>.*?</td>.*?'
    r'data-stat="team_ID"[^>]*?>([^<]*)</td>.*?'
    r'data-stat="homeORvis"[^>]*?>([^<]*)</td>.*?'
    # The closing </a> is optional as a whole, so an opponent cell without a
    # link is matched as well.
    r'data-stat="opp_ID"[^>]*?>(?:<a[^>]*>)?([^<]*)(?:</a>)?.*?'
    r'data-stat="win_loss_result"[^>]*?>([^<]*)</td>',
    re.DOTALL,
)


def _parse_schedule_rows(html):
    """Return one dict per schedule row in *html* that has all five cells."""
    games = []
    for row_html in _SCHEDULE_ROW_RE.findall(html):
        cells = _SCHEDULE_CELLS_RE.search(row_html)
        if cells is None:
            # Row lacks one of the expected cells: skip it.
            continue
        date, team, home_or_vis, opp, result = cells.groups()
        games.append({
            "date": date,
            "team_ID": team.strip(),
            # A non-empty homeORvis cell is reported as "@", an empty one as "Home".
            "home_or_away": "@" if home_or_vis.strip() else "Home",
            "opponent": opp.strip(),
            "result": result.strip()
        })
    return games


def fetch_team_schedule(team_abbr: str, year: int) -> "pd.DataFrame | None":
    """Scrape a team's schedule page and compute its running win percentage.

    The page is loaded through the module-level Selenium ``driver``.

    Args:
        team_abbr: Team abbreviation used in the page URL (e.g. ``'ARI'``).
        year: Season year used in the page URL.

    Returns:
        A DataFrame with the columns ``date``, ``team_ID``, ``home_or_away``,
        ``opponent``, ``result`` and ``win_percentage`` (wins so far divided
        by rows so far, rounded to 3 decimals), one row per parsed game in
        page order; ``None`` when no game row could be parsed.
    """
    url = f"https://www.baseball-reference.com/teams/{team_abbr}/{year}-schedule-scores.shtml"
    driver.get(url)
    time.sleep(2)  # Wait in case content loads slowly
    html = driver.page_source
    # Drop HTML comment delimiters so markup inside comments is searched too.
    html = html.replace("<!--", "").replace("-->", "")

    games = _parse_schedule_rows(html)
    if not games:
        return None

    df = pd.DataFrame(games)
    # Any result starting with 'W' is a win; every other value (including an
    # empty result) counts as a played game that was not won.
    is_win = df['result'].str.startswith('W')
    games_played = pd.Series(range(1, len(df) + 1), index=df.index)
    df['win_percentage'] = (is_win.cumsum() / games_played).round(3)

    return df

# Loop through teams and save each to CSV
try:
    for team in team_abbreviations:
        print(f"Processing {team}...")
        try:
            df_team = fetch_team_schedule(team, year)
            if df_team is None:
                print(f"No data for {team}")
                continue
            filename = f"{year}_{team}.csv"
            filepath = os.path.join(output_folder, filename)
            df_team.to_csv(filepath, index=False)
            print(f"Saved {filename}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next team.
            print(f"Failed for {team}: {e}")
finally:
    # Close the browser even when the loop is aborted by something the
    # per-team handler does not catch (e.g. KeyboardInterrupt).
    driver.quit()
print("✅ All done.")


Processing ARI...
Saved 2022_ARI.csv
Processing ATL...
Saved 2022_ATL.csv
Processing BAL...
Saved 2022_BAL.csv
Processing BOS...
Saved 2022_BOS.csv
Processing CHC...
Saved 2022_CHC.csv
Processing CHW...
Saved 2022_CHW.csv
Processing CIN...
Saved 2022_CIN.csv
Processing CLE...
Saved 2022_CLE.csv
Processing COL...
Saved 2022_COL.csv
Processing DET...
Saved 2022_DET.csv
Processing HOU...
Saved 2022_HOU.csv
Processing KCR...
Saved 2022_KCR.csv
Processing LAA...
Saved 2022_LAA.csv
Processing LAD...
Saved 2022_LAD.csv
Processing MIA...
Saved 2022_MIA.csv
Processing MIL...
Saved 2022_MIL.csv
Processing MIN...
Saved 2022_MIN.csv
Processing NYM...
Saved 2022_NYM.csv
Processing NYY...
Saved 2022_NYY.csv
Processing OAK...
Saved 2022_OAK.csv
Processing PHI...
Saved 2022_PHI.csv
Processing PIT...
Saved 2022_PIT.csv
Processing SDP...
Saved 2022_SDP.csv
Processing SEA...
Saved 2022_SEA.csv
Processing SFG...
Saved 2022_SFG.csv
Processing STL...
Saved 2022_STL.csv
Processing TBR...
Saved 2022_TBR.csv
P