In [23]:
import os
import json
import re
import time
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [6]:
# === Prepare folder ===
output_folder = os.path.join("test_data", "daily_odds")
os.makedirs(output_folder, exist_ok=True)

# === Date range ===
start_date = datetime(2025, 3, 27)
end_date = datetime(2025, 5, 3)

# === Extract JSON helper ===
def extract_json_array(text, start_pattern='[{"league":"MLB"'):
    start_index = text.find(start_pattern)
    if start_index == -1:
        raise ValueError("Start pattern not found")
    bracket_count = 0
    for i in range(start_index, len(text)):
        char = text[i]
        if char == "[":
            bracket_count += 1
        elif char == "]":
            bracket_count -= 1
            if bracket_count == 0:
                return text[start_index:i+1]
    raise ValueError("Matching closing bracket not found")

# === Scraper ===
def scrape_day(date_str):
    url = f'https://www.sportsbookreview.com/betting-odds/mlb-baseball/?date={date_str}'
    
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(6)  # allow full render
    body_html = driver.execute_script("return document.body.innerHTML;")
    driver.quit()
    
    try:
        json_text = extract_json_array(body_html)
        data = json.loads(json_text)
    except Exception as e:
        print(f"Failed on {date_str}: {e}")
        return
    
    all_game_data = []

    for game in data[0]['oddsTableModel']['gameRows']:
        gv = game['gameView']
        base = {
            'gameId': gv['gameId'],
            'startDate': gv['startDate'],
            'homeTeam': gv['homeTeam']['displayName'],
            'awayTeam': gv['awayTeam']['displayName'],
            'homePitcher': f"{gv['homeStarter']['firstName']} {gv['homeStarter']['lastName']}",
            'awayPitcher': f"{gv['awayStarter']['firstName']} {gv['awayStarter']['lastName']}",
            'homeScore': gv.get('homeTeamScore'),
            'awayScore': gv.get('awayTeamScore'),
        }

        odds_views = game.get('oddsViews', [])
        if odds_views:
            for view in odds_views:
                if view is None:
                    continue
                sportsbook = view.get("sportsbook", "unknown")
                opening = view.get("openingLine", {})
                current = view.get("currentLine", {})
                base.update({
                    f"{sportsbook}_opening_homeOdds": opening.get("homeOdds"),
                    f"{sportsbook}_opening_awayOdds": opening.get("awayOdds"),
                    f"{sportsbook}_current_homeOdds": current.get("homeOdds"),
                    f"{sportsbook}_current_awayOdds": current.get("awayOdds"),
                })

        all_game_data.append(base)

    if not all_game_data:
        print(f"No games found on {date_str}")
        return

    df = pd.DataFrame(all_game_data)

    core_cols = ['gameId', 'startDate', 'homeTeam', 'awayTeam',
                 'homePitcher', 'awayPitcher', 'homeScore', 'awayScore']
    odds_cols = [col for col in df.columns if 'homeOdds' in col or 'awayOdds' in col]
    filtered_df = df[core_cols + odds_cols]

    output_path = os.path.join(output_folder, f"{date_str}.csv")
    filtered_df.to_csv(output_path, index=False)
    print(f"Saved {date_str}")

# === Run loop ===
current = start_date
while current <= end_date:
    date_str = current.strftime("%Y-%m-%d")
    output_path = os.path.join(output_folder, f"{date_str}.csv")
    
    if os.path.exists(output_path):
        print(f"Skipping {date_str} (already exists)")
    else:
        scrape_day(date_str)
    
    current += timedelta(days=1)


Saved 2025-03-27
Saved 2025-03-28
Saved 2025-03-29
Failed on 2025-03-30: Start pattern not found
Saved 2025-03-31
Saved 2025-04-01
Saved 2025-04-02
Saved 2025-04-03
Saved 2025-04-04
Saved 2025-04-05
Saved 2025-04-06
Saved 2025-04-07
Saved 2025-04-08
Saved 2025-04-09
Saved 2025-04-10
Saved 2025-04-11
Saved 2025-04-12
Saved 2025-04-13
Saved 2025-04-14
Saved 2025-04-15
Saved 2025-04-16
Saved 2025-04-17
Saved 2025-04-18
Saved 2025-04-19
Saved 2025-04-20
Saved 2025-04-21
Saved 2025-04-22
Saved 2025-04-23
Saved 2025-04-24
Saved 2025-04-25
Saved 2025-04-26
Saved 2025-04-27
Saved 2025-04-28
Saved 2025-04-29
Saved 2025-04-30
Saved 2025-05-01
Saved 2025-05-02
Saved 2025-05-03


In [None]:
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# All MLB team abbreviations
team_abbreviations = [
    'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL', 'DET',
    'HOU', 'KCR', 'LAA', 'LAD', 'MIA', 'MIL', 'MIN', 'NYM', 'NYY', 'ATH',
    'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBR', 'TEX', 'TOR', 'WSN'
]

#team_abbreviations = ['PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBR', 'TEX', 'TOR', 'WSN']

year = 2025
output_folder = os.path.join("test_data", "season_win_percentage")
os.makedirs(output_folder, exist_ok=True)

# Set up Chrome (visible or headless)
options = Options()
# options.add_argument("--headless")  # Uncomment to run headless
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

def fetch_team_schedule(team_abbr, year):
    url = f"https://www.baseball-reference.com/teams/{team_abbr}/{year}-schedule-scores.shtml"
    print(f"🌐 Loading {team_abbr} {year} schedule...")
    driver.get(url)
    time.sleep(2)

    html = driver.page_source
    html = html.replace("<!--", "").replace("-->", "")  # Un-comment HTML content
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.select("table#team_schedule tbody tr")

    data = []
    for row in rows:
        if not row.get("data-row"):
            continue

        date = row.find("td", {"data-stat": "date_game"})
        team = row.find("td", {"data-stat": "team_ID"})
        home_vis = row.find("td", {"data-stat": "homeORvis"})
        opponent = row.find("td", {"data-stat": "opp_ID"})
        result = row.find("td", {"data-stat": "win_loss_result"})
        result_text = result.text.strip() if result else None

        if not result_text or not result_text.startswith(("W", "L")):
            continue

        data.append({
            "date": date["csk"] if date else None,
            "team_ID": team.text.strip() if team else None,
            "home_or_away": "@" if home_vis and home_vis.text.strip() else "Home",
            "opponent": opponent.text.strip() if opponent else None,
            "result": result_text,
        })


    if not data:
        print(f"⚠️ No games found for {team_abbr}")
        return None

    df = pd.DataFrame(data)
    df['win'] = df['result'].apply(lambda x: 1 if x.startswith('W') else 0)
    df['games_played'] = range(1, len(df) + 1)
    df['cumulative_wins'] = df['win'].cumsum()
    df['win_percentage'] = (df['cumulative_wins'] / df['games_played']).round(3)
    df = df.drop(columns=['win', 'cumulative_wins', 'games_played'])

    return df

# Loop through teams and save each to CSV
for team in team_abbreviations:
    try:
        df_team = fetch_team_schedule(team, year)
        if df_team is not None:
            filename = f"{year}_{team}.csv"
            filepath = os.path.join(output_folder, filename)
            df_team.to_csv(filepath, index=False)
            print(f"✅ Saved {filename}")
        else:
            print(f"❌ Skipped {team}: No data")
    except Exception as e:
        print(f"❌ Failed for {team}: {e}")

driver.quit()
print("🎉 Done with all teams.")

🌐 Loading PIT 2025 schedule...
✅ Saved 2025_PIT.csv
🌐 Loading SDP 2025 schedule...
✅ Saved 2025_SDP.csv
🌐 Loading SEA 2025 schedule...
✅ Saved 2025_SEA.csv
🌐 Loading SFG 2025 schedule...
✅ Saved 2025_SFG.csv
🌐 Loading STL 2025 schedule...
✅ Saved 2025_STL.csv
🌐 Loading TBR 2025 schedule...
✅ Saved 2025_TBR.csv
🌐 Loading TEX 2025 schedule...
✅ Saved 2025_TEX.csv
🌐 Loading TOR 2025 schedule...
✅ Saved 2025_TOR.csv
🌐 Loading WSN 2025 schedule...
✅ Saved 2025_WSN.csv
🎉 Done with all teams.


In [10]:
import os
import re
import time
import pandas as pd
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Map team names to MLB abbreviations
team_name_to_abbr = {
    "Chi Cubs": "CHC", "NY Yankees": "NYY", "San Diego": "SDP", "St. Louis": "STL", "LA Dodgers": "LAD",
    "Boston": "BOS", "Miami": "MIA", "Detroit": "DET", "Sacramento": "OAK", "NY Mets": "NYM",
    "Philadelphia": "PHI", "Cincinnati": "CIN", "Seattle": "SEA", "Arizona": "ARI", "Tampa Bay": "TBR",
    "Milwaukee": "MIL", "Atlanta": "ATL", "Toronto": "TOR", "Houston": "HOU", "Washington": "WSN",
    "Minnesota": "MIN", "Cleveland": "CLE", "Kansas City": "KCR", "SF Giants": "SFG", "Baltimore": "BAL",
    "Texas": "TEX", "Pittsburgh": "PIT", "Chi Sox": "CHW", "LA Angels": "LAA", "Colorado": "COL"
}

# Create output folder
output_folder = "batting_data"
output_folder = os.path.join("test_data", "batting_data")
os.makedirs(output_folder, exist_ok=True)

# Set up Selenium browser
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
driver = webdriver.Chrome(options=options)

# Date range: May 20 to August 20, 2024
start_date = datetime(2025, 3, 30)
end_date = datetime(2025, 5, 3)
delta = timedelta(days=1)

current_date = start_date
while current_date <= end_date:
    date_str = current_date.strftime("%Y-%m-%d")
    print(f"Processing {date_str}...")

    url = f"https://www.teamrankings.com/mlb/stat/batting-average?date={date_str}"
    try:
        driver.get(url)
        time.sleep(3)

        html = driver.page_source

        # Extract team and first batting average value
        pattern = re.compile(
            r'<td class="text-left nowrap" data-sort="[^"]+">\s*<a[^>]*>([^<]+)</a>.*?'
            r'<td class="text-right" data-sort="([\d.]+)">',
            re.DOTALL
        )
        matches = pattern.findall(html)

        if matches:
            df = pd.DataFrame(matches, columns=["Team", "Batting Average"])
            df["Batting Average"] = df["Batting Average"].astype(float)
            df["Date"] = date_str
            df["Team Abbreviation"] = df["Team"].map(team_name_to_abbr)

            # Save to CSV
            df.to_csv(os.path.join(output_folder, f"{date_str}.csv"), index=False)
        else:
            print(f"⚠️ No data found for {date_str}")

    except Exception as e:
        print(f"❌ Error on {date_str}: {e}")

    current_date += delta

driver.quit()
print("✅ Done scraping all dates.")


Processing 2025-03-30...
Processing 2025-03-31...
Processing 2025-04-01...
Processing 2025-04-02...
Processing 2025-04-03...
Processing 2025-04-04...
Processing 2025-04-05...
Processing 2025-04-06...
Processing 2025-04-07...
Processing 2025-04-08...
Processing 2025-04-09...
Processing 2025-04-10...
Processing 2025-04-11...
Processing 2025-04-12...
Processing 2025-04-13...
Processing 2025-04-14...
Processing 2025-04-15...
Processing 2025-04-16...
Processing 2025-04-17...
Processing 2025-04-18...
Processing 2025-04-19...
Processing 2025-04-20...
Processing 2025-04-21...
Processing 2025-04-22...
Processing 2025-04-23...
Processing 2025-04-24...
Processing 2025-04-25...
Processing 2025-04-26...
Processing 2025-04-27...
Processing 2025-04-28...
Processing 2025-04-29...
Processing 2025-04-30...
Processing 2025-05-01...
Processing 2025-05-02...
Processing 2025-05-03...
✅ Done scraping all dates.


In [13]:
import os
import pandas as pd

# Folder containing daily odds CSV files
folder_path = 'test_data/daily_odds'

# Set to store unique pitcher names
unique_pitchers = set()

# Iterate through all CSV files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        try:
            df = pd.read_csv(file_path)
            if 'homePitcher' in df.columns and 'awayPitcher' in df.columns:
                unique_pitchers.update(df['homePitcher'].dropna().unique())
                unique_pitchers.update(df['awayPitcher'].dropna().unique())
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Convert to sorted list for presentation
unique_pitchers = sorted(unique_pitchers)
pitcher_df = pd.DataFrame(unique_pitchers, columns=["Pitcher Name"])
pitcher_df

Unnamed: 0,Pitcher Name
0,AJ Blubaugh
1,AJ Smith-Shawver
2,Aaron Nola
3,Andre Pallante
4,Andrew Abbott
...,...
195,Yusei Kikuchi
196,Zac Gallen
197,Zach Eflin
198,Zack Littell


In [28]:
import requests
import pandas as pd
from tqdm import tqdm

# Original pitcher names
names = list(pitcher_df['Pitcher Name'])
normalized_names = [name.lower().replace(" ", "") for name in names]

# Store results and failures
all_players = []
failed_lookups = []

# Loop with tqdm progress bar
for original_name, normalized_name in tqdm(zip(names, normalized_names), total=len(names), desc="Looking up pitchers"):
    url = f"https://statsapi.mlb.com/api/v1/people/search?names={normalized_name}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        players = data.get("people", [])
        
        if players:
            for player in players:
                player["input_name"] = original_name
                all_players.append(player)
        else:
            failed_lookups.append(original_name)
    except Exception as e:
        failed_lookups.append(original_name)

# Convert results to DataFrame
df_players = pd.DataFrame(all_players)

# Show relevant columns
if not df_players.empty:
    print(df_players[["id", "fullName", "input_name"]])

# Report failures
if failed_lookups:
    print("\n❌ Names that returned no results:")
    for name in failed_lookups:
        print(f" - {name}")

Looking up pitchers: 100%|██████████| 200/200 [02:02<00:00,  1.63it/s]

         id          fullName        input_name
0    805123       AJ Blubaugh       AJ Blubaugh
1    700363  AJ Smith-Shawver  AJ Smith-Shawver
2    605400        Aaron Nola        Aaron Nola
3    669467    Andre Pallante    Andre Pallante
4    671096     Andrew Abbott     Andrew Abbott
..      ...               ...               ...
206  579328     Yusei Kikuchi     Yusei Kikuchi
207  668678        Zac Gallen        Zac Gallen
208  621107        Zach Eflin        Zach Eflin
209  641793      Zack Littell      Zack Littell
210  554430      Zack Wheeler      Zack Wheeler

[211 rows x 3 columns]





In [32]:
# IDs to discard
ids_to_remove = [543941, 119424, 543639, 121350, 450210, 663531, 699127, 112116, 514630, 691910, 622066]

# Filter out those rows
df_players = df_players[~df_players['id'].isin(ids_to_remove)].reset_index(drop=True)

In [39]:
clean_df = df_players[['fullName', 'id']]
clean_df['Year'] = 2025
clean_df.rename(columns={'fullName': 'Pitcher Name'}, inplace=True)
clean_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Year'] = 2025
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.rename(columns={'fullName': 'Pitcher Name'}, inplace=True)


Unnamed: 0,Pitcher Name,id,Year
0,AJ Blubaugh,805123,2025
1,AJ Smith-Shawver,700363,2025
2,Aaron Nola,605400,2025
3,Andre Pallante,669467,2025
4,Andrew Abbott,671096,2025
...,...,...,...
195,Yusei Kikuchi,579328,2025
196,Zac Gallen,668678,2025
197,Zach Eflin,621107,2025
198,Zack Littell,641793,2025


In [40]:
import os
import time
import re
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Ensure output folder exists
output_dir = "test_data/raw_pitcher_data"
os.makedirs(output_dir, exist_ok=True)

# List to track errors
failed_downloads = []

# Setup Selenium headless browser
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
driver = webdriver.Chrome(options=options)

# Loop with progress bar
for idx, row in tqdm(clean_df.iterrows(), total=len(clean_df), desc="Scraping pitchers"):
    name = row["Pitcher Name"]
    player_id = row["id"]
    year = row["Year"]

    if pd.isna(name) or pd.isna(player_id) or pd.isna(year):
        continue

    # Format name: lowercase, spaces to hyphens
    url_name = re.sub(r"\s+", "-", name.strip().lower())

    url = f"https://www.mlb.com/player/{url_name}-{int(player_id)}?stats=gamelogs-r-pitching-mlb&year={int(year)}"

    try:
        driver.get(url)
        time.sleep(5)  # Let the page load
        html = driver.page_source

        filename = f"{url_name}_{int(year)}.html"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(html)

    except Exception as e:
        failed_downloads.append(f"{name} ({year})")
        continue

# Clean up
driver.quit()

# Report
if failed_downloads:
    print("\n⚠️ Failed Downloads:")
    for entry in failed_downloads:
        print(f" - {entry}")
else:
    print("\n✅ All pitcher HTML files successfully saved.")


Scraping pitchers: 100%|██████████| 200/200 [21:57<00:00,  6.59s/it]



✅ All pitcher HTML files successfully saved.


In [41]:
import os
import re
import pandas as pd
from tqdm import tqdm

def extract_pitcher_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    filename = os.path.basename(file_path)
    name, year = filename.replace(".html", "").rsplit("_", 1)
    player_name = name.replace("-", " ").title()

    row_pattern = re.compile(r'<tr data-index="\d+">(.*?)</tr>', re.DOTALL)
    rows = row_pattern.findall(html)

    def extract_col(row_html, col_index):
        if col_index == 0:
            match = re.search(rf'data-col="{col_index}".*?<a[^>]*>([^<]+)</a>', row_html, re.DOTALL)
        else:
            match = re.search(rf'data-col="{col_index}".*?<span>([^<]+)</span>', row_html, re.DOTALL)
        return match.group(1).strip() if match else None

    data = []
    for row in rows:
        date = extract_col(row, 0)
        team = extract_col(row, 1)
        opponent = extract_col(row, 2)
        era = extract_col(row, 5)
        if team and re.fullmatch(r"[A-Z]{3}", team):
            data.append((date, team, opponent, era, player_name, year))

    return pd.DataFrame(data, columns=["Date", "Team", "Opponent", "ERA", "Player", "Year"])

# Folder and file setup
folder = "test_data/raw_pitcher_data"
all_dfs = []

# Progress bar
for file in tqdm(sorted(os.listdir(folder))):
    if file.endswith(".html"):
        path = os.path.join(folder, file)
        try:
            df = extract_pitcher_data(path)
            all_dfs.append(df)
        except Exception as e:
            print(f"❌ Failed to process {file}: {e}")

# Combine and save
final_df = pd.concat(all_dfs, ignore_index=True)
final_df.to_csv("combined_pitcher_gamelogs.csv", index=False)
print("✅ Saved to combined_pitcher_gamelogs.csv")


100%|██████████| 200/200 [00:00<00:00, 413.18it/s]

✅ Saved to combined_pitcher_gamelogs.csv





MOVING ON TO BUILDING THE MASTER TEST

In [35]:
team_name_to_abbr = {
    "Chi. Cubs": "CHC",
    "Pittsburgh": "PIT",
    "Miami": "MIA",
    "Baltimore": "BAL",
    "Philadelphia": "PHI",
    "Toronto": "TOR",
    "Cleveland": "CLE",
    "Boston": "BOS",
    "Milwaukee": "MIL",
    "Kansas City": "KCR",
    "Houston": "HOU",
    "LA Angels": "LAA",
    "San Francisco": "SFG",
    "NY Yankees": "NYY",
    "Colorado": "COL",
    "Cincinnati": "CIN",
    "Washington": "WSN",
    "Atlanta": "ATL",
    "Minnesota": "MIN",
    "St. Louis": "STL",
    "San Diego": "SDP",
    "Seattle": "SEA",
    "Arizona": "ARI",
    "Tampa Bay": "TBR",
    "Chi. White Sox": "CHW",
    "Detroit": "DET",
    "Oakland": "OAK",
    "NY Mets": "NYM",
    "Texas": "TEX",
    "LA Dodgers": "LAD",
    "Athletics": "OAK"  # in case some entries use "Athletics"
}


In [36]:
import os
import pandas as pd

# Path to folder with CSVs
folder = "test_data/daily_odds"

# Store all DataFrames
dfs = []

# Loop through each CSV in the folder
for file in sorted(os.listdir(folder)):
    if file.endswith(".csv"):
        try:
            path = os.path.join(folder, file)
            df = pd.read_csv(path)
            dfs.append(df)
        except Exception as e:
            print(f"❌ Failed to read {file}: {e}")

# Combine all into one DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Save to disk
combined_df.to_csv("all_daily_odds.csv", index=False)
print("✅ Saved to all_daily_odds.csv")


✅ Saved to all_daily_odds.csv


In [37]:
master = pd.read_csv('all_daily_odds.csv')
master['homeTeam_abbr'] = master['homeTeam'].map(team_name_to_abbr)
master['awayTeam_abbr'] = master['awayTeam'].map(team_name_to_abbr)
master

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,bet365_opening_homeOdds,bet365_opening_awayOdds,bet365_current_homeOdds,bet365_current_awayOdds,draftkings_opening_homeOdds,draftkings_opening_awayOdds,draftkings_current_homeOdds,draftkings_current_awayOdds,homeTeam_abbr,awayTeam_abbr
0,341257,2025-03-27T19:05:00+00:00,NY Yankees,Milwaukee,Carlos Rodon,Freddy Peralta,4,2,-168,115,...,-160.0,135.0,-140.0,120.0,-170.0,142.0,-154.0,120.0,NYY,MIL
1,341254,2025-03-27T19:07:00+00:00,Toronto,Baltimore,Jose Berrios,Zach Eflin,2,12,-110,-110,...,-105.0,-115.0,-120.0,100.0,100.0,-120.0,-110.0,-110.0,TOR,BAL
2,341260,2025-03-27T20:05:00+00:00,Texas,Boston,Nathan Eovaldi,Garrett Crochet,2,5,-134,-130,...,-110.0,-110.0,105.0,-125.0,-115.0,-105.0,110.0,-130.0,TEX,BOS
3,341255,2025-03-27T20:05:00+00:00,Washington,Philadelphia,MacKenzie Gore,Zack Wheeler,3,7,150,-190,...,145.0,-170.0,130.0,-155.0,142.0,-170.0,136.0,-162.0,WSN,PHI
4,341262,2025-03-27T20:10:00+00:00,Chi. White Sox,LA Angels,Sean Burke,Yusei Kikuchi,8,1,136,-164,...,125.0,-150.0,135.0,-160.0,124.0,-148.0,175.0,-230.0,CHW,LAA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,341746,2025-05-03T23:05:00+00:00,Texas,Seattle,Patrick Corbin,Luis Castillo,1,2,119,-130,...,110.0,-130.0,110.0,-130.0,114.0,-135.0,100.0,-130.0,TEX,SEA
479,341748,2025-05-03T23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,-115.0,-105.0,100.0,-120.0,-112.0,-108.0,-120.0,-110.0,MIL,CHC
480,341736,2025-05-03T23:15:00+00:00,Baltimore,Kansas City,Tomoyuki Sugano,Kris Bubic,0,4,-107,-103,...,-120.0,100.0,-115.0,-105.0,-120.0,100.0,-115.0,-115.0,BAL,KCR
481,341742,2025-05-03T23:15:00+00:00,Atlanta,LA Dodgers,Spencer Schwellenbach,Roki Sasaki,3,10,-103,-107,...,-105.0,-115.0,-140.0,120.0,-105.0,-115.0,-130.0,100.0,ATL,LAD


In [38]:
import os
import pandas as pd
from datetime import datetime

# Load odds data
df_odds = master

# Create helper columns
df_odds['year'] = pd.to_datetime(df_odds['startDate']).dt.year
df_odds['game_date'] = pd.to_datetime(df_odds['startDate']).dt.strftime("%Y-%m-%d")

# Initialize empty columns
df_odds["homeTeam_win_pct"] = None
df_odds["awayTeam_win_pct"] = None

def get_previous_win_pct(team_abbr, opponent_abbr, date_str, year):
    file_path = f"test_data/season_win_percentage/{year}_{team_abbr}.csv"
    if not os.path.exists(file_path):
        return None
    df_team = pd.read_csv(file_path)
    df_team = df_team.sort_values("date")
    
    # Get the row where the team played that opponent on that date
    idx = df_team[
        (df_team["date"] == date_str) &
        (df_team["team_ID"] == team_abbr) &
        (df_team["opponent"] == opponent_abbr)
    ].index

    if len(idx) == 0 or idx[0] == 0:
        return None  # No match or no previous game
    prev_row = df_team.iloc[idx[0] - 1]
    return prev_row["win_percentage"]

# Loop through each game
for i, row in df_odds.iterrows():
    year = row["year"]
    date = row["game_date"]
    home = row["homeTeam_abbr"]
    away = row["awayTeam_abbr"]

    # Get win percentage from previous game
    home_pct = get_previous_win_pct(home, away, date, year)
    away_pct = get_previous_win_pct(away, home, date, year)

    df_odds.at[i, "homeTeam_win_pct"] = home_pct
    df_odds.at[i, "awayTeam_win_pct"] = away_pct

# Save updated DataFrame
df_odds

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,draftkings_opening_homeOdds,draftkings_opening_awayOdds,draftkings_current_homeOdds,draftkings_current_awayOdds,homeTeam_abbr,awayTeam_abbr,year,game_date,homeTeam_win_pct,awayTeam_win_pct
0,341257,2025-03-27T19:05:00+00:00,NY Yankees,Milwaukee,Carlos Rodon,Freddy Peralta,4,2,-168,115,...,-170.0,142.0,-154.0,120.0,NYY,MIL,2025,2025-03-27,,
1,341254,2025-03-27T19:07:00+00:00,Toronto,Baltimore,Jose Berrios,Zach Eflin,2,12,-110,-110,...,100.0,-120.0,-110.0,-110.0,TOR,BAL,2025,2025-03-27,,
2,341260,2025-03-27T20:05:00+00:00,Texas,Boston,Nathan Eovaldi,Garrett Crochet,2,5,-134,-130,...,-115.0,-105.0,110.0,-130.0,TEX,BOS,2025,2025-03-27,,
3,341255,2025-03-27T20:05:00+00:00,Washington,Philadelphia,MacKenzie Gore,Zack Wheeler,3,7,150,-190,...,142.0,-170.0,136.0,-162.0,WSN,PHI,2025,2025-03-27,,
4,341262,2025-03-27T20:10:00+00:00,Chi. White Sox,LA Angels,Sean Burke,Yusei Kikuchi,8,1,136,-164,...,124.0,-148.0,175.0,-230.0,CHW,LAA,2025,2025-03-27,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,341746,2025-05-03T23:05:00+00:00,Texas,Seattle,Patrick Corbin,Luis Castillo,1,2,119,-130,...,114.0,-135.0,100.0,-130.0,TEX,SEA,2025,2025-05-03,0.485,0.613
479,341748,2025-05-03T23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,-112.0,-108.0,-120.0,-110.0,MIL,CHC,2025,2025-05-03,0.485,0.606
480,341736,2025-05-03T23:15:00+00:00,Baltimore,Kansas City,Tomoyuki Sugano,Kris Bubic,0,4,-107,-103,...,-120.0,100.0,-115.0,-115.0,BAL,KCR,2025,2025-05-03,0.419,0.515
481,341742,2025-05-03T23:15:00+00:00,Atlanta,LA Dodgers,Spencer Schwellenbach,Roki Sasaki,3,10,-103,-107,...,-105.0,-115.0,-130.0,100.0,ATL,LAD,2025,2025-05-03,0.452,0.688


In [39]:
filtered_test_data = master[
    (master['homeTeam'] == 'LA Dodgers') | (master['awayTeam'] == 'LA Dodgers')
].copy()
filtered_test_data

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,draftkings_opening_homeOdds,draftkings_opening_awayOdds,draftkings_current_homeOdds,draftkings_current_awayOdds,homeTeam_abbr,awayTeam_abbr,year,game_date,homeTeam_win_pct,awayTeam_win_pct
11,341264,2025-03-27T23:10:00+00:00,LA Dodgers,Detroit,Blake Snell,Tarik Skubal,5,4,-176,110,...,-162.0,136.0,-175.0,135.0,LAD,DET,2025,2025-03-27,1.0,
22,341271,2025-03-29T02:10:00+00:00,LA Dodgers,Detroit,Yoshinobu Yamamoto,Jack Flaherty,8,5,-198,178,...,-198.0,164.0,-230.0,190.0,LAD,DET,2025,2025-03-29,1.0,0.0
36,341286,2025-03-30T01:10:00+00:00,LA Dodgers,Detroit,Roki Sasaki,Reese Olson,7,3,-200,180,...,-205.0,170.0,-205.0,170.0,LAD,DET,2025,2025-03-30,,
51,341312,2025-04-01T02:10:00+00:00,LA Dodgers,Atlanta,Tyler Glasnow,Grant Holmes,6,1,-210,186,...,-198.0,164.0,-225.0,185.0,LAD,ATL,2025,2025-04-01,1.0,0.0
64,341329,2025-04-02T02:10:00+00:00,LA Dodgers,Atlanta,Dustin May,Chris Sale,3,1,-130,118,...,-130.0,110.0,-130.0,110.0,LAD,ATL,2025,2025-04-02,1.0,0.0
79,341344,2025-04-03T00:38:00+00:00,LA Dodgers,Atlanta,Blake Snell,Bryce Elder,6,5,-230,120,...,-218.0,180.0,-250.0,190.0,LAD,ATL,2025,2025-04-03,,
92,341353,2025-04-04T22:45:00+00:00,Philadelphia,LA Dodgers,Jesus Luzardo,Yoshinobu Yamamoto,3,2,132,-140,...,120.0,-142.0,114.0,-135.0,PHI,LAD,2025,2025-04-04,0.833,1.0
102,341378,2025-04-05T20:05:00+00:00,Philadelphia,LA Dodgers,Aaron Nola,Roki Sasaki,1,3,-106,-120,...,-112.0,-108.0,-108.0,-112.0,PHI,LAD,2025,2025-04-05,0.857,0.889
114,341388,2025-04-06T17:35:00+00:00,Philadelphia,LA Dodgers,Cristopher Sanchez,Tyler Glasnow,8,7,109,-122,...,102.0,-122.0,200.0,-270.0,PHI,LAD,2025,2025-04-06,0.75,0.9
130,341396,2025-04-07T22:45:00+00:00,Washington,LA Dodgers,MacKenzie Gore,Dustin May,6,4,134,-150,...,130.0,-155.0,154.0,-185.0,WSN,LAD,2025,2025-04-07,0.333,0.818


In [40]:
import pandas as pd

# Ensure startDate is a timezone-aware datetime
df_odds['startDate'] = pd.to_datetime(df_odds['startDate'], utc=True)

# Create a timezone-aware timestamp for comparison
cutoff = pd.Timestamp('2025-03-28', tz='UTC')

# Filter out rows with startDate before the cutoff
df_odds = df_odds[df_odds['startDate'] >= cutoff]
df_odds = df_odds[df_odds['homeTeam_win_pct'].notna()]
df_odds = df_odds[df_odds['awayTeam_win_pct'].notna()]
df_odds

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,draftkings_opening_homeOdds,draftkings_opening_awayOdds,draftkings_current_homeOdds,draftkings_current_awayOdds,homeTeam_abbr,awayTeam_abbr,year,game_date,homeTeam_win_pct,awayTeam_win_pct
13,341263,2025-03-28 02:10:00+00:00,Arizona,Chi. Cubs,Zac Gallen,Justin Steele,6,10,-154,130,...,,,,,ARI,CHC,2025,2025-03-28,0.0,0.333
15,341267,2025-03-28 23:07:00+00:00,Toronto,Baltimore,Kevin Gausman,Charlie Morton,8,2,-125,113,...,-130.0,110.0,-130.0,110.0,TOR,BAL,2025,2025-03-28,0.0,1.0
16,341268,2025-03-28 23:10:00+00:00,Miami,Pittsburgh,Connor Gillispie,Mitch Keller,3,4,120,-132,...,110.0,-130.0,124.0,-148.0,MIA,PIT,2025,2025-03-28,1.0,0.0
17,341269,2025-03-29 00:05:00+00:00,Texas,Boston,Jack Leiter,Tanner Houck,4,1,105,-115,...,100.0,-120.0,-105.0,-115.0,TEX,BOS,2025,2025-03-29,0.5,0.5
18,341296,2025-03-29 00:10:00+00:00,Houston,NY Mets,Hunter Brown,Tylor Megill,1,3,-140,123,...,-135.0,114.0,-120.0,100.0,HOU,NYM,2025,2025-03-29,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,341746,2025-05-03 23:05:00+00:00,Texas,Seattle,Patrick Corbin,Luis Castillo,1,2,119,-130,...,114.0,-135.0,100.0,-130.0,TEX,SEA,2025,2025-05-03,0.485,0.613
479,341748,2025-05-03 23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,-112.0,-108.0,-120.0,-110.0,MIL,CHC,2025,2025-05-03,0.485,0.606
480,341736,2025-05-03 23:15:00+00:00,Baltimore,Kansas City,Tomoyuki Sugano,Kris Bubic,0,4,-107,-103,...,-120.0,100.0,-115.0,-115.0,BAL,KCR,2025,2025-05-03,0.419,0.515
481,341742,2025-05-03 23:15:00+00:00,Atlanta,LA Dodgers,Spencer Schwellenbach,Roki Sasaki,3,10,-103,-107,...,-105.0,-115.0,-130.0,100.0,ATL,LAD,2025,2025-05-03,0.452,0.688


In [41]:
import os
import pandas as pd
from datetime import datetime, timedelta

# Add empty columns for batting average
df_odds["homeTeam_batting_avg"] = None
df_odds["awayTeam_batting_avg"] = None

def get_batting_average(team_abbr, date_str):
    batting_file = os.path.join("test_data/batting_data", f"{date_str}.csv")
    if not os.path.exists(batting_file):
        return None
    df_bat = pd.read_csv(batting_file)
    row = df_bat[df_bat["Team Abbreviation"] == team_abbr]
    if row.empty:
        return None
    return float(row.iloc[0]["Batting Average"])

# Process each row
for i, row in df_odds.iterrows():
    try:
        game_date = pd.to_datetime(row["game_date"])
        next_day = (game_date - timedelta(days=1)).strftime("%Y-%m-%d")

        home_abbr = row["homeTeam_abbr"]
        away_abbr = row["awayTeam_abbr"]

        home_avg = get_batting_average(home_abbr, next_day)
        away_avg = get_batting_average(away_abbr, next_day)

        df_odds.at[i, "homeTeam_batting_avg"] = home_avg
        df_odds.at[i, "awayTeam_batting_avg"] = away_avg
    except Exception as e:
        print(f"⚠️ Error on row {i}: {e}")

df_odds = df_odds.dropna(subset=["awayTeam_batting_avg"])

In [42]:
# Normalize pitcher names from gamelog for matching
valid_pitchers = set(p.strip().title() for p in unique_pitchers)

# Normalize home/away pitcher names in df_odds
df_odds['homePitcher_normalized'] = df_odds['homePitcher'].fillna('').str.strip().str.title()
df_odds['awayPitcher_normalized'] = df_odds['awayPitcher'].fillna('').str.strip().str.title()

# Filter rows where both home and away pitcher names exist in gamelog
df_odds_filtered = df_odds[
    df_odds['homePitcher_normalized'].isin(valid_pitchers) &
    df_odds['awayPitcher_normalized'].isin(valid_pitchers)
].copy()

print(f"✅ Filtered odds data: {len(df_odds_filtered)} rows (from original {len(df_odds)})")

✅ Filtered odds data: 429 rows (from original 429)


In [45]:
import pandas as pd
from datetime import timedelta

df_pitchers = pd.read_csv("combined_pitcher_gamelogs.csv")

# Normalize pitcher names
df_odds_filtered['homePitcher_normalized'] = df_odds_filtered['homePitcher'].fillna('').str.strip().str.title()
df_odds_filtered['awayPitcher_normalized'] = df_odds_filtered['awayPitcher'].fillna('').str.strip().str.title()
df_odds_filtered['game_date'] = pd.to_datetime(df_odds_filtered['startDate']).dt.date
df_odds_filtered['game_year'] = pd.to_datetime(df_odds_filtered['startDate']).dt.year

# Normalize pitcher data
df_pitchers['Player'] = df_pitchers['Player'].str.strip().str.title()
df_pitchers['Date'] = pd.to_datetime(df_pitchers['Date'] + ' ' + df_pitchers['Year'].astype(str))

# For logging issues
missing_pitchers = set()
pitchers_with_no_prior_game = set()

# ERA lookup function with tracking
def get_latest_era(pitcher_name, game_date):
    if pitcher_name not in df_pitchers['Player'].values:
        missing_pitchers.add(pitcher_name)
        return None

    games = df_pitchers[df_pitchers['Player'] == pitcher_name]
    games = games[games['Date'].dt.date < (game_date - timedelta(days=0))]  # must be before game date
    if games.empty:
        pitchers_with_no_prior_game.add(pitcher_name)
        return None

    latest_game = games.sort_values('Date', ascending=False).iloc[0]
    return latest_game['ERA']

# Apply ERA lookup
df_odds_filtered['homePitcher_era'] = df_odds_filtered.apply(
    lambda row: get_latest_era(row['homePitcher_normalized'], row['game_date']), axis=1)

df_odds_filtered['awayPitcher_era'] = df_odds_filtered.apply(
    lambda row: get_latest_era(row['awayPitcher_normalized'], row['game_date']), axis=1)

# Show results
print("❌ Pitchers not found in dataset:", sorted(missing_pitchers))
print("⚠️ Pitchers found but no prior games:", sorted(pitchers_with_no_prior_game))


❌ Pitchers not found in dataset: ['Aj Smith-Shawver', 'Brandon Pfaadt', 'Carlos Rodon', 'Cole Ragans', 'Corbin Burnes', 'Cristopher Sanchez', 'Drew Rasmussen', 'Dylan Cease', 'Eduardo Rodriguez', 'German Marquez', 'Huascar Brazoban', 'Jesus Luzardo', 'Joe Boyle', 'Jordan Hicks', 'Jose Berrios', 'Jose Soriano', 'Justin Verlander', 'Kris Bubic', 'Kyle Hart', 'Landen Roupp', 'Logan Webb', 'Luis F. Castillo', 'Luis L. Ortiz', 'Martin Perez', 'Merrill Kelly', 'Michael King', 'Michael Lorenzen', 'Michael Wacha', 'Nick Pivetta', 'Noah Cameron', 'Pablo Lopez', 'Patrick Corbin', 'Paul Skenes', 'Quinn Priester', 'Randy Vasquez', 'Reynaldo Lopez', 'Robbie Ray', 'Roki Sasaki', 'Ronel Blanco', 'Ryan Feltner', 'Ryan Gusto', 'Ryan Pepiot', 'Ryan Yarbrough', 'Sandy Alcantara', 'Scott Blewett', 'Sean Burke', 'Sean Newcomb', 'Seth Lugo', 'Shane Baz', 'Shane Smith', 'Shota Imanaga', 'Simeon Woods Richardson', 'Sonny Gray', 'Spencer Arrighetti', 'Spencer Schwellenbach', 'Spencer Strider', 'Steven Matz', '

There are some missing pitchers that we need to include into the dataset

In [67]:
import os
import re
import time
import unicodedata
import requests
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# -----------------------------
# 1. Missing pitchers (from your earlier list)
pitcher_names = [
    'yariel rodriguez', 'martin perez', 'randy vasquez', 'cristopher sanchez',
    'jose soriano', 'jesus luzardo', 'pablo lopez', 'reynaldo lopez',
    'german marquez', 'jose berrios', 'carlos rodon', 'huascar brazoban'
]

# Normalize name (remove accents, lowercase, hyphenate)
def slugify_name(name):
    no_accents = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(c for c in no_accents if unicodedata.category(c) != 'Mn')
    return ascii_name.lower().replace(" ", "-")

# ----------------------------------------
# 2. Lookup MLB player IDs via API
print("🔍 Looking up player IDs...")
all_players = []
failed_lookups = []

for name in tqdm(pitcher_names, desc="Looking up pitchers"):
    search_name = name.lower().replace(" ", "")
    url = f"https://statsapi.mlb.com/api/v1/people/search?names={search_name}"

    try:
        response = requests.get(url)
        response.raise_for_status()
        players = response.json().get("people", [])
        if players:
            best_match = players[0]  # Grab first result
            best_match["input_name"] = name
            all_players.append(best_match)
        else:
            failed_lookups.append(name)
    except Exception:
        failed_lookups.append(name)

df_players = pd.DataFrame(all_players)

if not df_players.empty:
    print("\n✅ Found player IDs:")
    print(df_players[["id", "fullName", "input_name"]])

if failed_lookups:
    print("\n❌ Failed to find IDs for:")
    for name in failed_lookups:
        print(f" - {name}")

# ----------------------------------------
# 3. Download game log HTMLs with Selenium
output_dir = "test_data/raw_pitcher_data"
os.makedirs(output_dir, exist_ok=True)

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
driver = webdriver.Chrome(options=options)

failed_downloads = []
print("\n🌐 Downloading HTML game logs...")

for _, row in tqdm(df_players.iterrows(), total=len(df_players), desc="Downloading"):
    name = row["input_name"]
    player_id = row["id"]
    year = 2025
    slug = slugify_name(name)

    url = f"https://www.mlb.com/player/{slug}-{player_id}?stats=gamelogs-r-pitching-mlb&year={year}"
    try:
        driver.get(url)
        time.sleep(5)
        html = driver.page_source

        filepath = os.path.join(output_dir, f"{slug}_{year}.html")
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(html)

        print(f"✅ Saved {slug}_{year}.html")
    except Exception as e:
        failed_downloads.append((name, str(e)))
        print(f"❌ Failed for {name}")

driver.quit()

# ----------------------------------------
# 4. Final report
if failed_downloads:
    print("\n⚠️ Some downloads failed:")
    for name, reason in failed_downloads:
        print(f" - {name}: {reason}")
else:
    print("\n🎉 All missing pitchers downloaded successfully!")


🔍 Looking up player IDs...


Looking up pitchers: 100%|██████████| 12/12 [00:05<00:00,  2.27it/s]



✅ Found player IDs:
        id            fullName          input_name
0   684320    Yariel Rodríguez    yariel rodriguez
1   527048        Martín Pérez        martin perez
2   681190       Randy Vásquez       randy vasquez
3   650911  Cristopher Sánchez  cristopher sanchez
4   667755        José Soriano        jose soriano
5   666200       Jesús Luzardo       jesus luzardo
6   641154         Pablo López         pablo lopez
7   625643      Reynaldo López      reynaldo lopez
8   608566      Germán Márquez      german marquez
9   621244        José Berríos        jose berrios
10  607074        Carlos Rodón        carlos rodon
11  623211    Huascar Brazobán    huascar brazoban

🌐 Downloading HTML game logs...


Downloading:   8%|▊         | 1/12 [00:06<01:08,  6.26s/it]

✅ Saved yariel-rodriguez_2025.html


Downloading:  17%|█▋        | 2/12 [00:12<01:04,  6.46s/it]

✅ Saved martin-perez_2025.html


Downloading:  25%|██▌       | 3/12 [00:19<00:57,  6.44s/it]

✅ Saved randy-vasquez_2025.html


Downloading:  33%|███▎      | 4/12 [00:25<00:52,  6.54s/it]

✅ Saved cristopher-sanchez_2025.html


Downloading:  42%|████▏     | 5/12 [00:32<00:46,  6.61s/it]

✅ Saved jose-soriano_2025.html


Downloading:  50%|█████     | 6/12 [00:39<00:40,  6.74s/it]

✅ Saved jesus-luzardo_2025.html


Downloading:  58%|█████▊    | 7/12 [00:46<00:33,  6.60s/it]

✅ Saved pablo-lopez_2025.html


Downloading:  67%|██████▋   | 8/12 [00:52<00:26,  6.72s/it]

✅ Saved reynaldo-lopez_2025.html


Downloading:  75%|███████▌  | 9/12 [01:00<00:20,  6.87s/it]

✅ Saved german-marquez_2025.html


Downloading:  83%|████████▎ | 10/12 [01:07<00:13,  6.95s/it]

✅ Saved jose-berrios_2025.html


Downloading:  92%|█████████▏| 11/12 [01:15<00:07,  7.24s/it]

✅ Saved carlos-rodon_2025.html


Downloading: 100%|██████████| 12/12 [01:23<00:00,  6.97s/it]

✅ Saved huascar-brazoban_2025.html






🎉 All missing pitchers downloaded successfully!


In [65]:
import os
import re
import pandas as pd

def extract_pitcher_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    filename = os.path.basename(file_path)
    name, year = filename.replace(".html", "").rsplit("_", 1)
    player_name = name.replace("-", " ").title()

    # Only match rows with <a> in date (real games, not totals/headers)
    row_pattern = re.compile(
        r'(<tr data-index="\d+">\s*.*?<td.*?data-col="0".*?<a[^>]*>[^<]+</a>.*?</tr>)',
        re.DOTALL
    )
    rows = row_pattern.findall(html)

    def extract_col(row_html, col_index):
        pattern = rf'data-col="{col_index}".*?>(.*?)</td>'
        match = re.search(pattern, row_html, re.DOTALL)
        if not match:
            return None
        content = match.group(1)
        text_match = re.search(r"<a[^>]*>([^<]+)</a>", content) or re.search(r"<span[^>]*>([^<]+)</span>", content)
        return text_match.group(1).strip() if text_match else re.sub('<[^<]+?>', '', content).strip()

    data = []
    for row in rows:
        date = extract_col(row, 0)
        team = extract_col(row, 1)
        opponent = extract_col(row, 2)
        era = extract_col(row, 5)
        if team and re.fullmatch(r"[A-Z]{2,3}", team):
            data.append((date, team, opponent, era, player_name, year))

    return pd.DataFrame(data, columns=["Date", "Team", "Opponent", "ERA", "Player", "Year"])


# ----------------------
# 👤 Pitcher name to test
pitcher_name = "Brandon Pfaadt"
year = "2025"

# Convert to filename
file_name = pitcher_name.lower().replace(" ", "-") + f"_{year}.html"
file_path = os.path.join("test_data", "raw_pitcher_data", file_name)

# Run test
if not os.path.exists(file_path):
    print(f"❌ File not found: {file_path}")
else:
    df = extract_pitcher_data(file_path)
    print("✅ Extracted rows:")
    print(df)


✅ Extracted rows:
      Date Team Opponent   ERA          Player  Year
0    Apr 4   AZ    @ WSH  5.25  Brandon Pfaadt  2025
1    Apr 9   AZ   vs BAL  3.50  Brandon Pfaadt  2025
2   Apr 16   AZ    @ MIA  3.04  Brandon Pfaadt  2025
3   Apr 22   AZ    vs TB  2.73  Brandon Pfaadt  2025
4   Apr 27   AZ   vs ATL  2.78  Brandon Pfaadt  2025
5    May 3   AZ    @ PHI  3.79  Brandon Pfaadt  2025
6   Mar 29   AZ   vs CHC  4.50  Brandon Pfaadt  2025
7    Apr 4   AZ    @ WSH  5.25  Brandon Pfaadt  2025
8    Apr 9   AZ   vs BAL  3.50  Brandon Pfaadt  2025
9   Apr 16   AZ    @ MIA  3.04  Brandon Pfaadt  2025
10  Apr 22   AZ    vs TB  2.73  Brandon Pfaadt  2025
11  Apr 27   AZ   vs ATL  2.78  Brandon Pfaadt  2025
12   May 3   AZ    @ PHI  3.79  Brandon Pfaadt  2025


In [71]:
import os
import re
import pandas as pd
from tqdm import tqdm

def extract_pitcher_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    filename = os.path.basename(file_path)
    name, year = filename.replace(".html", "").rsplit("_", 1)
    player_name = name.replace("-", " ").title()

    # Only match rows with <a> in date (real games, not totals/headers)
    row_pattern = re.compile(
        r'(<tr data-index="\d+">\s*.*?<td.*?data-col="0".*?<a[^>]*>[^<]+</a>.*?</tr>)',
        re.DOTALL
    )
    rows = row_pattern.findall(html)

    def extract_col(row_html, col_index):
        pattern = rf'data-col="{col_index}".*?>(.*?)</td>'
        match = re.search(pattern, row_html, re.DOTALL)
        if not match:
            return None
        content = match.group(1)
        text_match = re.search(r"<a[^>]*>([^<]+)</a>", content) or re.search(r"<span[^>]*>([^<]+)</span>", content)
        return text_match.group(1).strip() if text_match else re.sub('<[^<]+?>', '', content).strip()

    data = []
    for row in rows:
        date = extract_col(row, 0)
        team = extract_col(row, 1)
        opponent = extract_col(row, 2)
        era = extract_col(row, 5)
        if team and re.fullmatch(r"[A-Z]{2,3}", team):
            data.append((date, team, opponent, era, player_name, year))

    return pd.DataFrame(data, columns=["Date", "Team", "Opponent", "ERA", "Player", "Year"])

# ----------------------------
# Your list of pitcher names
pitcher_names = list(missing_pitchers)

# Convert to file names
files_to_use = [name.lower().replace(" ", "-") + "_2025.html" for name in pitcher_names]
# ----------------------------

folder = "test_data/raw_pitcher_data"
all_dfs = []

# Process only specified files
for file in tqdm(files_to_use, desc="Processing pitchers"):
    path = os.path.join(folder, file)
    if not os.path.exists(path):
        print(f"⚠️ File not found: {file}")
        continue
    try:
        df = extract_pitcher_data(path)
        all_dfs.append(df)
    except Exception as e:
        print(f"❌ Failed to process {file}: {e}")

# Combine and save
final_df = pd.concat(all_dfs, ignore_index=True)
final_df.to_csv("combined_pitcher_gamelogs_testing.csv", index=False)
print("✅ Saved to combined_pitcher_gamelogs_testing.csv")


Processing pitchers: 100%|██████████| 82/82 [00:00<00:00, 349.17it/s]

✅ Saved to combined_pitcher_gamelogs_testing.csv





In [99]:
import os
import re
import pandas as pd
from tqdm import tqdm

def extract_pitcher_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    filename = os.path.basename(file_path)
    name, year = filename.replace(".html", "").rsplit("_", 1)
    player_name = name.replace("-", " ").title()

    row_pattern = re.compile(r'<tr data-index="\d+">(.*?)</tr>', re.DOTALL)
    rows = row_pattern.findall(html)

    def extract_col(row_html, col_index):
        if col_index == 0:
            match = re.search(rf'data-col="{col_index}".*?<a[^>]*>([^<]+)</a>', row_html, re.DOTALL)
        else:
            match = re.search(rf'data-col="{col_index}".*?<span>([^<]+)</span>', row_html, re.DOTALL)
        return match.group(1).strip() if match else None

    data = []
    for row in rows:
        date = extract_col(row, 0)
        team = extract_col(row, 1)
        opponent = extract_col(row, 2)
        era = extract_col(row, 5)
        if team and re.fullmatch(r"[A-Z]{3}", team):
            data.append((date, team, opponent, era, player_name, year))

    return pd.DataFrame(data, columns=["Date", "Team", "Opponent", "ERA", "Player", "Year"])

# Folder and file setup
folder = "test_data/raw_pitcher_data"
all_dfs = []

# Progress bar
for file in tqdm(sorted(os.listdir(folder))):
    if file.endswith(".html"):
        path = os.path.join(folder, file)
        try:
            df = extract_pitcher_data(path)
            all_dfs.append(df)
        except Exception as e:
            print(f"❌ Failed to process {file}: {e}")

# Combine and save
final_df = pd.concat(all_dfs, ignore_index=True)
final_df.to_csv("combined_pitcher_gamelogs_main.csv", index=False)
print("✅ Saved to combined_pitcher_gamelogs_main.csv")


100%|██████████| 212/212 [00:00<00:00, 391.94it/s]

✅ Saved to combined_pitcher_gamelogs_main.csv





Maybe this isn't malleable for one each. Let's try both

In [100]:
import os
import re
import pandas as pd
from tqdm import tqdm

def extract_pitcher_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    filename = os.path.basename(file_path)
    name, year = filename.replace(".html", "").rsplit("_", 1)
    player_name = name.replace("-", " ").title()

    # ✅ Match only real game rows: <tr data-index="..."> (ignore class="total", etc.)
    row_pattern = re.compile(
        r'<tr data-index="\d+">(.*?)</tr>',
        re.DOTALL
    )
    rows = row_pattern.findall(html)

    def extract_col(row_html, col_index):
        pattern = rf'data-col="{col_index}".*?>(.*?)</td>'
        match = re.search(pattern, row_html, re.DOTALL)
        if not match:
            return None
        content = match.group(1)
        # Extract content from <span> or fallback to raw text
        text_match = (
            re.search(r"<span[^>]*>([^<]+)</span>", content) or
            re.search(r"<a[^>]*>([^<]+)</a>", content)
        )
        return text_match.group(1).strip() if text_match else re.sub('<[^<]+?>', '', content).strip()

    data = []
    for row in rows:
        date = extract_col(row, 0)
        team = extract_col(row, 1)
        opponent = extract_col(row, 2)
        era = extract_col(row, 5)

        # ✅ Only accept real team codes and valid date
        if team and re.fullmatch(r"[A-Z]{2,3}", team) and date:
            data.append((date, team, opponent, era, player_name, year))

    return pd.DataFrame(data, columns=["Date", "Team", "Opponent", "ERA", "Player", "Year"])

# ----------------------------
# List of pitcher names to process (e.g., from `missing_pitchers`)
pitcher_names = list(missing_pitchers)
files_to_use = [name.lower().replace(" ", "-") + "_2025.html" for name in pitcher_names]
# ----------------------------

folder = "test_data/raw_pitcher_data"
all_dfs = []

for file in tqdm(files_to_use, desc="Processing pitchers"):
    path = os.path.join(folder, file)
    if not os.path.exists(path):
        print(f"⚠️ File not found: {file}")
        continue
    try:
        df = extract_pitcher_data(path)
        all_dfs.append(df)
    except Exception as e:
        print(f"❌ Failed to process {file}: {e}")

final_df = pd.concat(all_dfs, ignore_index=True)
final_df.to_csv("combined_pitcher_gamelogs_additional.csv", index=False)
print("✅ Saved to combined_pitcher_gamelogs_additional.csv")


Processing pitchers: 100%|██████████| 82/82 [00:00<00:00, 521.36it/s]

✅ Saved to combined_pitcher_gamelogs_additional.csv





In [101]:
# Load both CSVs
df1 = pd.read_csv("combined_pitcher_gamelogs_additional.csv")
df2 = pd.read_csv("combined_pitcher_gamelogs_main.csv")

# Combine them (append rows)
combined_df = pd.concat([df1, df2], ignore_index=True)

# Optionally, save to a new file
combined_df.to_csv("combined_pitcher_gamelogs.csv", index=False)


In [84]:
import requests
import pandas as pd
from tqdm import tqdm

# Original pitcher names
names = list(missing_pitchers)
normalized_names = [name.lower().replace(" ", "") for name in names]

# Store results and failures
all_players = []
failed_lookups = []

# Loop with tqdm progress bar
for original_name, normalized_name in tqdm(zip(names, normalized_names), total=len(names), desc="Looking up pitchers"):
    url = f"https://statsapi.mlb.com/api/v1/people/search?names={normalized_name}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        players = data.get("people", [])
        
        if players:
            for player in players:
                player["input_name"] = original_name
                all_players.append(player)
        else:
            failed_lookups.append(original_name)
    except Exception as e:
        failed_lookups.append(original_name)

# Convert results to DataFrame
df_players = pd.DataFrame(all_players)

# Show relevant columns
if not df_players.empty:
    print(df_players[["id", "fullName", "input_name"]])

# Report failures
if failed_lookups:
    print("\n❌ Names that returned no results:")
    for name in failed_lookups:
        print(f" - {name}")

Looking up pitchers: 100%|██████████| 82/82 [00:30<00:00,  2.71it/s]

        id        fullName      input_name
0   608379   Michael Wacha   Michael Wacha
1   641816     Tyler Mahle     Tyler Mahle
2   641793    Zack Littell    Zack Littell
3   671737     Taj Bradley     Taj Bradley
4   682847   Luis L. Ortiz   Luis L. Ortiz
..     ...             ...             ...
80  671212       Joe Boyle       Joe Boyle
81  701542     Will Warren     Will Warren
82  808963     Roki Sasaki     Roki Sasaki
83  642232  Ryan Yarbrough  Ryan Yarbrough
84  656302     Dylan Cease     Dylan Cease

[85 rows x 3 columns]





In [83]:
df_pitchers = pd.read_csv("combined_pitcher_gamelogs.csv")

# Normalize
df_odds_filtered['homePitcher_normalized'] = df_odds_filtered['homePitcher'].fillna('').str.strip().str.title()
df_odds_filtered['awayPitcher_normalized'] = df_odds_filtered['awayPitcher'].fillna('').str.strip().str.title()
df_odds_filtered['game_date'] = pd.to_datetime(df_odds_filtered['startDate']).dt.date
df_odds_filtered['game_year'] = pd.to_datetime(df_odds_filtered['startDate']).dt.year

# Ensure pitcher data is clean
df_pitchers['Player'] = df_pitchers['Player'].str.strip().str.title()
df_pitchers['Date'] = pd.to_datetime(df_pitchers['Date'] + ' ' + df_pitchers['Year'].astype(str))

# ERA lookup function
def get_latest_era(pitcher_name, game_date):
    games = df_pitchers[df_pitchers['Player'] == pitcher_name]
    games = games[games['Date'].dt.date < game_date]
    if games.empty:
        return None
    latest_game = games.sort_values('Date', ascending=False).iloc[0]
    return latest_game['ERA']

# Add ERA columns
df_odds_filtered['homePitcher_era'] = df_odds_filtered.apply(
    lambda row: get_latest_era(row['homePitcher_normalized'], row['game_date']), axis=1)

df_odds_filtered['awayPitcher_era'] = df_odds_filtered.apply(
    lambda row: get_latest_era(row['awayPitcher_normalized'], row['game_date']), axis=1)

df_odds_filtered

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,game_date,homeTeam_win_pct,awayTeam_win_pct,homeTeam_batting_avg,awayTeam_batting_avg,homePitcher_normalized,awayPitcher_normalized,game_year,homePitcher_era,awayPitcher_era
13,341263,2025-03-28 02:10:00+00:00,Arizona,Chi. Cubs,Zac Gallen,Justin Steele,6,10,-154,130,...,2025-03-28,0.0,0.333,0.0,0.171875,Zac Gallen,Justin Steele,2025,,8.00
15,341267,2025-03-28 23:07:00+00:00,Toronto,Baltimore,Kevin Gausman,Charlie Morton,8,2,-125,113,...,2025-03-28,0.0,1.0,0.0,0.0,Kevin Gausman,Charlie Morton,2025,,
16,341268,2025-03-28 23:10:00+00:00,Miami,Pittsburgh,Connor Gillispie,Mitch Keller,3,4,120,-132,...,2025-03-28,1.0,0.0,0.0,0.0,Connor Gillispie,Mitch Keller,2025,,
17,341269,2025-03-29 00:05:00+00:00,Texas,Boston,Jack Leiter,Tanner Houck,4,1,105,-115,...,2025-03-29,0.5,0.5,0.212121,0.181818,Jack Leiter,Tanner Houck,2025,1.80,
18,341296,2025-03-29 00:10:00+00:00,Houston,NY Mets,Hunter Brown,Tylor Megill,1,3,-140,123,...,2025-03-29,0.5,0.5,0.241379,0.2,Hunter Brown,Tylor Megill,2025,3.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,341746,2025-05-03 23:05:00+00:00,Texas,Seattle,Patrick Corbin,Luis Castillo,1,2,119,-130,...,2025-05-03,0.485,0.613,0.230097,0.240557,Patrick Corbin,Luis Castillo,2025,,3.62
479,341748,2025-05-03 23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,2025-05-03,0.485,0.606,0.244403,0.264184,Jose Quintana,Jameson Taillon,2025,1.14,4.01
480,341736,2025-05-03 23:15:00+00:00,Baltimore,Kansas City,Tomoyuki Sugano,Kris Bubic,0,4,-107,-103,...,2025-05-03,0.419,0.515,0.225309,0.234568,Tomoyuki Sugano,Kris Bubic,2025,,2.25
481,341742,2025-05-03 23:15:00+00:00,Atlanta,LA Dodgers,Spencer Schwellenbach,Roki Sasaki,3,10,-103,-107,...,2025-05-03,0.452,0.688,0.24532,0.257722,Spencer Schwellenbach,Roki Sasaki,2025,,


In [None]:
import pandas as pd

# Get all rows with missing ERA
missing_era_rows = df_odds_filtered[
    df_odds_filtered['homePitcher_era'].isna() |
    df_odds_filtered['awayPitcher_era'].isna()
]

# Combine home and away pitcher names from those rows
pitcher_names = pd.concat([
    missing_era_rows[['homePitcher_normalized']].rename(columns={'homePitcher_normalized': 'Pitcher'}),
    missing_era_rows[['awayPitcher_normalized']].rename(columns={'awayPitcher_normalized': 'Pitcher'})
])

# Count occurrences of each pitcher
pitcher_counts = pitcher_names['Pitcher'].value_counts()

# Filter for those with more than 1 row
multiple_missing = pitcher_counts[pitcher_counts > 1]

print(f"\n🔁 Pitchers with more than 1 missing ERA row: {len(multiple_missing)}")
for name, count in multiple_missing.items():
    print(f" - {name}: {count} rows")



🔁 Pitchers with more than 1 missing ERA row: 117
 - Zac Gallen: 7 rows
 - Tanner Houck: 7 rows
 - Trevor Williams: 7 rows
 - Seth Lugo: 7 rows
 - Zack Littell: 7 rows
 - Spencer Schwellenbach: 7 rows
 - Shane Smith: 6 rows
 - Ryan Pepiot: 6 rows
 - Tylor Megill: 6 rows
 - Zack Wheeler: 6 rows
 - Will Warren: 6 rows
 - Taijuan Walker: 6 rows
 - Ronel Blanco: 6 rows
 - Shane Baz: 6 rows
 - Walker Buehler: 6 rows
 - Shota Imanaga: 6 rows
 - Tomoyuki Sugano: 6 rows
 - Robbie Ray: 6 rows
 - Tyler Mahle: 6 rows
 - Tarik Skubal: 6 rows
 - Paul Skenes: 6 rows
 - Luis L. Ortiz: 6 rows
 - Yusei Kikuchi: 6 rows
 - Sandy Alcantara: 5 rows
 - Sean Newcomb: 5 rows
 - Sonny Gray: 5 rows
 - Simeon Woods Richardson: 5 rows
 - Roki Sasaki: 5 rows
 - Tanner Bibee: 5 rows
 - Taj Bradley: 5 rows
 - Nick Pivetta: 5 rows
 - Tyler Glasnow: 5 rows
 - Cal Quantrill: 5 rows
 - Kris Bubic: 5 rows
 - Ben Lively: 4 rows
 - Quinn Priester: 4 rows
 - Jose Soriano: 4 rows
 - Tyler Anderson: 4 rows
 - Tyler Alexander:

In [98]:
import os
import re
import time
import unicodedata
import requests
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# -----------------------------
# 1. Get pitchers with >1 missing ERA row
missing_era_rows = df_odds_filtered[
    df_odds_filtered['homePitcher_era'].isna() |
    df_odds_filtered['awayPitcher_era'].isna()
]

pitcher_names = pd.concat([
    missing_era_rows[['homePitcher_normalized']].rename(columns={'homePitcher_normalized': 'Pitcher'}),
    missing_era_rows[['awayPitcher_normalized']].rename(columns={'awayPitcher_normalized': 'Pitcher'})
])

pitcher_counts = pitcher_names['Pitcher'].value_counts()
multiple_missing = pitcher_counts[pitcher_counts > 2]
print(f"\n🔁 Pitchers with more than 1 missing ERA row: {len(multiple_missing)}")

# Normalize name (remove accents, lowercase, hyphenate)
def slugify_name(name):
    no_accents = unicodedata.normalize('NFD', name)
    ascii_name = ''.join(c for c in no_accents if unicodedata.category(c) != 'Mn')
    return ascii_name.lower().replace(" ", "-")

# ----------------------------------------
# 2. Lookup MLB player IDs via API
print("\n🔍 Looking up player IDs...")
all_players = []
failed_lookups = []

for name in tqdm(multiple_missing.index, desc="Looking up pitchers"):
    search_name = name.lower().replace(" ", "")
    url = f"https://statsapi.mlb.com/api/v1/people/search?names={search_name}"

    try:
        response = requests.get(url)
        response.raise_for_status()
        players = response.json().get("people", [])
        if players:
            best_match = players[0]
            best_match["input_name"] = name
            all_players.append(best_match)
        else:
            failed_lookups.append(name)
    except Exception:
        failed_lookups.append(name)

df_players = pd.DataFrame(all_players)

if not df_players.empty:
    print("\n✅ Found player IDs:")
    print(df_players[["id", "fullName", "input_name"]])

if failed_lookups:
    print("\n❌ Failed to find IDs for:")
    for name in failed_lookups:
        print(f" - {name}")

# ----------------------------------------
# 3. Download game log HTMLs with Selenium
output_dir = "test_data/raw_pitcher_data"
os.makedirs(output_dir, exist_ok=True)

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
driver = webdriver.Chrome(options=options)

failed_downloads = []
print("\n🌐 Downloading HTML game logs...")

for _, row in tqdm(df_players.iterrows(), total=len(df_players), desc="Downloading"):
    name = row["input_name"]
    player_id = row["id"]
    year = 2025
    slug = slugify_name(name)

    url = f"https://www.mlb.com/player/{slug}-{player_id}?stats=gamelogs-r-pitching-mlb&year={year}"
    try:
        driver.get(url)
        time.sleep(5)
        html = driver.page_source

        filepath = os.path.join(output_dir, f"{slug}_{year}.html")
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(html)

        print(f"✅ Saved {slug}_{year}.html")
    except Exception as e:
        failed_downloads.append((name, str(e)))
        print(f"❌ Failed for {name}")

driver.quit()

# ----------------------------------------
# 4. Final report
if failed_downloads:
    print("\n⚠️ Some downloads failed:")
    for name, reason in failed_downloads:
        print(f" - {name}: {reason}")
else:
    print("\n🎉 All missing pitchers downloaded successfully!")



🔁 Pitchers with more than 1 missing ERA row: 72

🔍 Looking up player IDs...


Looking up pitchers: 100%|██████████| 72/72 [00:22<00:00,  3.15it/s]



✅ Found player IDs:
        id         fullName       input_name
0   668678       Zac Gallen       Zac Gallen
1   656557     Tanner Houck     Tanner Houck
2   592866  Trevor Williams  Trevor Williams
3   607625        Seth Lugo        Seth Lugo
4   641793     Zack Littell     Zack Littell
..     ...              ...              ...
67  656876   Drew Rasmussen   Drew Rasmussen
68  694297   Brandon Pfaadt   Brandon Pfaadt
69  670102   Bowden Francis   Bowden Francis
70  676979  Garrett Crochet  Garrett Crochet
71  663554       Casey Mize       Casey Mize

[72 rows x 3 columns]

🌐 Downloading HTML game logs...


Downloading:   1%|▏         | 1/72 [00:07<08:58,  7.59s/it]

✅ Saved zac-gallen_2025.html


Downloading:   3%|▎         | 2/72 [00:14<08:10,  7.01s/it]

✅ Saved tanner-houck_2025.html


Downloading:   4%|▍         | 3/72 [00:21<08:15,  7.18s/it]

✅ Saved trevor-williams_2025.html


Downloading:   6%|▌         | 4/72 [00:28<07:48,  6.89s/it]

✅ Saved seth-lugo_2025.html


Downloading:   7%|▋         | 5/72 [00:35<07:47,  6.98s/it]

✅ Saved zack-littell_2025.html


Downloading:   8%|▊         | 6/72 [00:42<07:42,  7.01s/it]

✅ Saved spencer-schwellenbach_2025.html


Downloading:  10%|▉         | 7/72 [00:48<07:19,  6.76s/it]

✅ Saved shane-smith_2025.html


Downloading:  11%|█         | 8/72 [00:55<07:13,  6.78s/it]

✅ Saved ryan-pepiot_2025.html


Downloading:  12%|█▎        | 9/72 [01:01<07:02,  6.70s/it]

✅ Saved tylor-megill_2025.html


Downloading:  14%|█▍        | 10/72 [01:08<06:51,  6.63s/it]

✅ Saved zack-wheeler_2025.html


Downloading:  15%|█▌        | 11/72 [01:14<06:44,  6.64s/it]

✅ Saved will-warren_2025.html


Downloading:  17%|█▋        | 12/72 [01:22<06:48,  6.81s/it]

✅ Saved taijuan-walker_2025.html


Downloading:  18%|█▊        | 13/72 [01:28<06:27,  6.56s/it]

✅ Saved ronel-blanco_2025.html


Downloading:  19%|█▉        | 14/72 [01:34<06:24,  6.63s/it]

✅ Saved shane-baz_2025.html


Downloading:  21%|██        | 15/72 [01:41<06:13,  6.56s/it]

✅ Saved walker-buehler_2025.html


Downloading:  22%|██▏       | 16/72 [01:47<06:00,  6.44s/it]

✅ Saved shota-imanaga_2025.html


Downloading:  24%|██▎       | 17/72 [01:53<05:53,  6.43s/it]

✅ Saved tomoyuki-sugano_2025.html


Downloading:  25%|██▌       | 18/72 [02:01<06:04,  6.75s/it]

✅ Saved robbie-ray_2025.html


Downloading:  26%|██▋       | 19/72 [02:09<06:13,  7.05s/it]

✅ Saved tyler-mahle_2025.html


Downloading:  28%|██▊       | 20/72 [02:16<06:04,  7.02s/it]

✅ Saved tarik-skubal_2025.html


Downloading:  29%|██▉       | 21/72 [02:22<05:51,  6.90s/it]

✅ Saved paul-skenes_2025.html


Downloading:  31%|███       | 22/72 [02:28<05:22,  6.45s/it]

✅ Saved luis-l.-ortiz_2025.html


Downloading:  32%|███▏      | 23/72 [02:34<05:21,  6.56s/it]

✅ Saved yusei-kikuchi_2025.html


Downloading:  33%|███▎      | 24/72 [02:41<05:13,  6.53s/it]

✅ Saved sandy-alcantara_2025.html


Downloading:  35%|███▍      | 25/72 [02:48<05:13,  6.67s/it]

✅ Saved sean-newcomb_2025.html


Downloading:  36%|███▌      | 26/72 [02:55<05:13,  6.81s/it]

✅ Saved sonny-gray_2025.html


Downloading:  38%|███▊      | 27/72 [03:02<05:07,  6.84s/it]

✅ Saved simeon-woods-richardson_2025.html


Downloading:  39%|███▉      | 28/72 [03:08<04:53,  6.67s/it]

✅ Saved roki-sasaki_2025.html


Downloading:  40%|████      | 29/72 [03:15<04:44,  6.62s/it]

✅ Saved tanner-bibee_2025.html


Downloading:  42%|████▏     | 30/72 [03:22<04:45,  6.79s/it]

✅ Saved taj-bradley_2025.html


Downloading:  43%|████▎     | 31/72 [03:28<04:34,  6.70s/it]

✅ Saved nick-pivetta_2025.html


Downloading:  44%|████▍     | 32/72 [03:35<04:25,  6.63s/it]

✅ Saved tyler-glasnow_2025.html


Downloading:  46%|████▌     | 33/72 [03:42<04:22,  6.73s/it]

✅ Saved cal-quantrill_2025.html


Downloading:  47%|████▋     | 34/72 [03:48<04:12,  6.63s/it]

✅ Saved kris-bubic_2025.html


Downloading:  49%|████▊     | 35/72 [03:55<04:08,  6.73s/it]

✅ Saved ben-lively_2025.html


Downloading:  50%|█████     | 36/72 [04:02<04:01,  6.72s/it]

✅ Saved quinn-priester_2025.html


Downloading:  51%|█████▏    | 37/72 [04:09<03:55,  6.74s/it]

✅ Saved jose-soriano_2025.html


Downloading:  53%|█████▎    | 38/72 [04:15<03:47,  6.70s/it]

✅ Saved tyler-anderson_2025.html


Downloading:  54%|█████▍    | 39/72 [04:23<03:49,  6.96s/it]

✅ Saved tyler-alexander_2025.html


Downloading:  56%|█████▌    | 40/72 [04:29<03:37,  6.80s/it]

✅ Saved chris-paddack_2025.html


Downloading:  57%|█████▋    | 41/72 [04:36<03:30,  6.80s/it]

✅ Saved sean-burke_2025.html


Downloading:  58%|█████▊    | 42/72 [04:43<03:21,  6.70s/it]

✅ Saved ryan-gusto_2025.html


Downloading:  60%|█████▉    | 43/72 [04:49<03:14,  6.69s/it]

✅ Saved antonio-senzatela_2025.html


Downloading:  61%|██████    | 44/72 [04:56<03:09,  6.77s/it]

✅ Saved ryan-feltner_2025.html


Downloading:  62%|██████▎   | 45/72 [05:03<03:05,  6.88s/it]

✅ Saved andrew-heaney_2025.html


Downloading:  64%|██████▍   | 46/72 [05:10<02:55,  6.75s/it]

✅ Saved randy-vasquez_2025.html


Downloading:  65%|██████▌   | 47/72 [05:17<02:54,  6.97s/it]

✅ Saved kevin-gausman_2025.html


Downloading:  67%|██████▋   | 48/72 [05:24<02:45,  6.92s/it]

✅ Saved michael-lorenzen_2025.html


Downloading:  68%|██████▊   | 49/72 [05:30<02:34,  6.73s/it]

✅ Saved kodai-senga_2025.html


Downloading:  69%|██████▉   | 50/72 [05:38<02:34,  7.01s/it]

✅ Saved carlos-carrasco_2025.html


Downloading:  71%|███████   | 51/72 [05:45<02:28,  7.06s/it]

✅ Saved dean-kremer_2025.html


Downloading:  72%|███████▏  | 52/72 [05:53<02:23,  7.17s/it]

✅ Saved brad-lord_2025.html


Downloading:  74%|███████▎  | 53/72 [06:00<02:17,  7.23s/it]

✅ Saved chris-sale_2025.html


Downloading:  75%|███████▌  | 54/72 [06:08<02:13,  7.42s/it]

✅ Saved chris-bassitt_2025.html


Downloading:  76%|███████▋  | 55/72 [06:14<02:02,  7.20s/it]

✅ Saved erick-fedde_2025.html


Downloading:  78%|███████▊  | 56/72 [06:22<01:56,  7.26s/it]

✅ Saved max-fried_2025.html


Downloading:  79%|███████▉  | 57/72 [06:30<01:50,  7.37s/it]

✅ Saved german-marquez_2025.html


Downloading:  81%|████████  | 58/72 [06:37<01:44,  7.47s/it]

✅ Saved max-meyer_2025.html


Downloading:  82%|████████▏ | 59/72 [06:44<01:33,  7.18s/it]

✅ Saved jack-flaherty_2025.html


Downloading:  83%|████████▎ | 60/72 [06:50<01:24,  7.03s/it]

✅ Saved aj-smith-shawver_2025.html


Downloading:  85%|████████▍ | 61/72 [06:57<01:15,  6.89s/it]

✅ Saved patrick-corbin_2025.html


Downloading:  86%|████████▌ | 62/72 [07:04<01:08,  6.83s/it]

✅ Saved justin-verlander_2025.html


Downloading:  88%|████████▊ | 63/72 [07:10<01:01,  6.82s/it]

✅ Saved ben-brown_2025.html


Downloading:  89%|████████▉ | 64/72 [07:17<00:53,  6.72s/it]

✅ Saved carlos-rodon_2025.html


Downloading:  90%|█████████ | 65/72 [07:24<00:47,  6.85s/it]

✅ Saved aaron-nola_2025.html


Downloading:  92%|█████████▏| 66/72 [07:31<00:40,  6.73s/it]

✅ Saved landen-roupp_2025.html


Downloading:  93%|█████████▎| 67/72 [07:37<00:33,  6.77s/it]

✅ Saved colin-rea_2025.html


Downloading:  94%|█████████▍| 68/72 [07:44<00:27,  6.86s/it]

✅ Saved drew-rasmussen_2025.html


Downloading:  96%|█████████▌| 69/72 [07:51<00:20,  6.80s/it]

✅ Saved brandon-pfaadt_2025.html


Downloading:  97%|█████████▋| 70/72 [07:58<00:13,  6.78s/it]

✅ Saved bowden-francis_2025.html


Downloading:  99%|█████████▊| 71/72 [08:05<00:06,  6.86s/it]

✅ Saved garrett-crochet_2025.html


Downloading: 100%|██████████| 72/72 [08:12<00:00,  6.83s/it]

✅ Saved casey-mize_2025.html






🎉 All missing pitchers downloaded successfully!


In [18]:
df_odds_filtered = df_odds_filtered.dropna(subset=['homePitcher_era', 'awayPitcher_era'])
df_odds_filtered

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,game_date,homeTeam_win_pct,awayTeam_win_pct,homeTeam_batting_avg,awayTeam_batting_avg,homePitcher_normalized,awayPitcher_normalized,game_year,homePitcher_era,awayPitcher_era
22,341271,2025-03-29 02:10:00+00:00,LA Dodgers,Detroit,Yoshinobu Yamamoto,Jack Flaherty,8,5,-198,178,...,2025-03-29,1.0,0.0,0.221053,0.264706,Yoshinobu Yamamoto,Jack Flaherty,2025,2.70,3.18
49,341313,2025-04-01 01:40:00+00:00,Seattle,Detroit,Emerson Hancock,Jackson Jobe,6,9,-117,106,...,2025-04-01,0.4,0.333,0.175439,0.238095,Emerson Hancock,Jackson Jobe,2025,81.00,6.75
61,341328,2025-04-02 01:40:00+00:00,Seattle,Detroit,Logan Gilbert,Casey Mize,1,4,-158,142,...,2025-04-02,0.333,0.5,0.187919,0.292517,Logan Gilbert,Casey Mize,2025,3.00,0.00
65,341332,2025-04-02 16:40:00+00:00,Cincinnati,Texas,Hunter Greene,Jack Leiter,0,1,-124,111,...,2025-04-02,0.4,0.667,0.263158,0.222222,Hunter Greene,Jack Leiter,2025,3.60,1.80
75,341335,2025-04-02 20:40:00+00:00,Miami,NY Mets,Connor Gillispie,Clay Holmes,5,6,170,-194,...,2025-04-02,0.333,0.4,0.252941,0.185484,Connor Gillispie,Clay Holmes,2025,5.40,3.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,341747,2025-05-03 18:10:00+00:00,Chi. White Sox,Houston,Davis Martin,Hunter Brown,3,8,205,-235,...,2025-05-03,0.296,0.533,0.213294,0.235294,Davis Martin,Hunter Brown,2025,3.24,1.22
471,341739,2025-05-03 19:07:00+00:00,Toronto,Cleveland,Kevin Gausman,Gavin Williams,3,5,-129,117,...,2025-05-03,0.517,0.517,0.237864,0.232767,Kevin Gausman,Gavin Williams,2025,4.50,5.14
475,341740,2025-05-03 20:10:00+00:00,Boston,Minnesota,Hunter Dobbins,Bailey Ober,3,4,-112,101,...,2025-05-03,0.517,0.4,0.250219,0.23535,Hunter Dobbins,Bailey Ober,2025,2.45,4.13
479,341748,2025-05-03 23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,2025-05-03,0.5,0.594,0.244403,0.264184,Jose Quintana,Jameson Taillon,2025,1.14,4.01


In [19]:
import numpy as np
import pandas as pd

# Convert relevant columns to numeric
df_odds_filtered["homeTeam_win_pct"] = pd.to_numeric(df_odds_filtered["homeTeam_win_pct"], errors="coerce")
df_odds_filtered["awayTeam_win_pct"] = pd.to_numeric(df_odds_filtered["awayTeam_win_pct"], errors="coerce")
df_odds_filtered["homeTeam_batting_avg"] = pd.to_numeric(df_odds_filtered["homeTeam_batting_avg"], errors="coerce")
df_odds_filtered["awayTeam_batting_avg"] = pd.to_numeric(df_odds_filtered["awayTeam_batting_avg"], errors="coerce")
df_odds_filtered["homePitcher_era"] = pd.to_numeric(df_odds_filtered["homePitcher_era"], errors="coerce")
df_odds_filtered["awayPitcher_era"] = pd.to_numeric(df_odds_filtered["awayPitcher_era"], errors="coerce")

# Compute alpha, beta, gamma
df_odds_filtered["alpha"] = df_odds_filtered["homeTeam_win_pct"] / df_odds_filtered["awayTeam_win_pct"]
df_odds_filtered["beta"] = df_odds_filtered["homeTeam_batting_avg"] / df_odds_filtered["awayTeam_batting_avg"]
df_odds_filtered["gamma"] = df_odds_filtered["awayPitcher_era"] / df_odds_filtered["homePitcher_era"]

df_odds_filtered


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds_filtered["homeTeam_win_pct"] = pd.to_numeric(df_odds_filtered["homeTeam_win_pct"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds_filtered["awayTeam_win_pct"] = pd.to_numeric(df_odds_filtered["awayTeam_win_pct"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,homeTeam_batting_avg,awayTeam_batting_avg,homePitcher_normalized,awayPitcher_normalized,game_year,homePitcher_era,awayPitcher_era,alpha,beta,gamma
22,341271,2025-03-29 02:10:00+00:00,LA Dodgers,Detroit,Yoshinobu Yamamoto,Jack Flaherty,8,5,-198,178,...,0.221053,0.264706,Yoshinobu Yamamoto,Jack Flaherty,2025,2.70,3.18,inf,0.835089,1.177778
49,341313,2025-04-01 01:40:00+00:00,Seattle,Detroit,Emerson Hancock,Jackson Jobe,6,9,-117,106,...,0.175439,0.238095,Emerson Hancock,Jackson Jobe,2025,81.00,6.75,1.201201,0.736845,0.083333
61,341328,2025-04-02 01:40:00+00:00,Seattle,Detroit,Logan Gilbert,Casey Mize,1,4,-158,142,...,0.187919,0.292517,Logan Gilbert,Casey Mize,2025,3.00,0.00,0.666000,0.642421,0.000000
65,341332,2025-04-02 16:40:00+00:00,Cincinnati,Texas,Hunter Greene,Jack Leiter,0,1,-124,111,...,0.263158,0.222222,Hunter Greene,Jack Leiter,2025,3.60,1.80,0.599700,1.184212,0.500000
75,341335,2025-04-02 20:40:00+00:00,Miami,NY Mets,Connor Gillispie,Clay Holmes,5,6,170,-194,...,0.252941,0.185484,Connor Gillispie,Clay Holmes,2025,5.40,3.86,0.832500,1.363681,0.714815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,341747,2025-05-03 18:10:00+00:00,Chi. White Sox,Houston,Davis Martin,Hunter Brown,3,8,205,-235,...,0.213294,0.235294,Davis Martin,Hunter Brown,2025,3.24,1.22,0.555347,0.906500,0.376543
471,341739,2025-05-03 19:07:00+00:00,Toronto,Cleveland,Kevin Gausman,Gavin Williams,3,5,-129,117,...,0.237864,0.232767,Kevin Gausman,Gavin Williams,2025,4.50,5.14,1.000000,1.021897,1.142222
475,341740,2025-05-03 20:10:00+00:00,Boston,Minnesota,Hunter Dobbins,Bailey Ober,3,4,-112,101,...,0.250219,0.235350,Hunter Dobbins,Bailey Ober,2025,2.45,4.13,1.292500,1.063178,1.685714
479,341748,2025-05-03 23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,0.244403,0.264184,Jose Quintana,Jameson Taillon,2025,1.14,4.01,0.841751,0.925124,3.517544


In [20]:
# Ensure numeric
df_odds_filtered["homeScore"] = pd.to_numeric(df_odds_filtered["homeScore"], errors="coerce")
df_odds_filtered["awayScore"] = pd.to_numeric(df_odds_filtered["awayScore"], errors="coerce")

# Compute binary outcome
df_odds_filtered["X"] = (df_odds_filtered["homeScore"] > df_odds_filtered["awayScore"]).astype(int)
df_odds_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds_filtered["homeScore"] = pd.to_numeric(df_odds_filtered["homeScore"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds_filtered["awayScore"] = pd.to_numeric(df_odds_filtered["awayScore"], errors="coerce")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds_filtered[

Unnamed: 0,gameId,startDate,homeTeam,awayTeam,homePitcher,awayPitcher,homeScore,awayScore,prophetx_opening_homeOdds,prophetx_opening_awayOdds,...,awayTeam_batting_avg,homePitcher_normalized,awayPitcher_normalized,game_year,homePitcher_era,awayPitcher_era,alpha,beta,gamma,X
22,341271,2025-03-29 02:10:00+00:00,LA Dodgers,Detroit,Yoshinobu Yamamoto,Jack Flaherty,8,5,-198,178,...,0.264706,Yoshinobu Yamamoto,Jack Flaherty,2025,2.70,3.18,inf,0.835089,1.177778,1
49,341313,2025-04-01 01:40:00+00:00,Seattle,Detroit,Emerson Hancock,Jackson Jobe,6,9,-117,106,...,0.238095,Emerson Hancock,Jackson Jobe,2025,81.00,6.75,1.201201,0.736845,0.083333,0
61,341328,2025-04-02 01:40:00+00:00,Seattle,Detroit,Logan Gilbert,Casey Mize,1,4,-158,142,...,0.292517,Logan Gilbert,Casey Mize,2025,3.00,0.00,0.666000,0.642421,0.000000,0
65,341332,2025-04-02 16:40:00+00:00,Cincinnati,Texas,Hunter Greene,Jack Leiter,0,1,-124,111,...,0.222222,Hunter Greene,Jack Leiter,2025,3.60,1.80,0.599700,1.184212,0.500000,0
75,341335,2025-04-02 20:40:00+00:00,Miami,NY Mets,Connor Gillispie,Clay Holmes,5,6,170,-194,...,0.185484,Connor Gillispie,Clay Holmes,2025,5.40,3.86,0.832500,1.363681,0.714815,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,341747,2025-05-03 18:10:00+00:00,Chi. White Sox,Houston,Davis Martin,Hunter Brown,3,8,205,-235,...,0.235294,Davis Martin,Hunter Brown,2025,3.24,1.22,0.555347,0.906500,0.376543,0
471,341739,2025-05-03 19:07:00+00:00,Toronto,Cleveland,Kevin Gausman,Gavin Williams,3,5,-129,117,...,0.232767,Kevin Gausman,Gavin Williams,2025,4.50,5.14,1.000000,1.021897,1.142222,0
475,341740,2025-05-03 20:10:00+00:00,Boston,Minnesota,Hunter Dobbins,Bailey Ober,3,4,-112,101,...,0.235350,Hunter Dobbins,Bailey Ober,2025,2.45,4.13,1.292500,1.063178,1.685714,0
479,341748,2025-05-03 23:10:00+00:00,Milwaukee,Chi. Cubs,Jose Quintana,Jameson Taillon,2,6,-105,-105,...,0.264184,Jose Quintana,Jameson Taillon,2025,1.14,4.01,0.841751,0.925124,3.517544,0


In [21]:
df_odds_filtered.to_csv("master_test.csv", index=False)
