In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import os

In [9]:
# API key for the ScrapeOps proxy endpoint, read from the environment so it never lands in the notebook.
proxy_service_key = os.environ.get('SCRAPE_PROXY_KEY')

# Without a key the value is None and the scrape loop would only reveal the problem
# one failed request at a time, so surface it up front (the value is left unchanged).
if not proxy_service_key:
    print("Warning: SCRAPE_PROXY_KEY is not set; proxied requests are expected to fail.")

In [10]:
# Configuration: team code used in the Baseball-Reference URL -> (first season, last season).
# Both ends are inclusive; the scrape loop iterates range(start, end + 1).
teams_year_ranges = dict(
    NYY=(1920, 2024),  # Yankees, 1920 through 2024
    LAD=(1958, 2024),  # Los Angeles Dodgers, 1958 onwards
    BRO=(1920, 1957),  # Brooklyn Dodgers, up to and including 1957
)

In [11]:
# Request headers sent with every proxied fetch; only a desktop Chrome user-agent is set.
HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36'
    ),
}

In [12]:
# Upper bound (seconds) on a single proxied request. Without it one stalled
# connection would block the whole multi-hundred-season run indefinitely.
# NOTE(review): 120s is a generous guess for a proxied fetch — tune as needed.
REQUEST_TIMEOUT_SECONDS = 120

# Names applied positionally to the schedule table once the unused columns are
# dropped and the season/team columns are appended.
SCHEDULE_COLUMNS = [
    "gm", "date", "tm", "home_away", "opp", "result", "r", "ra", "inn",
    "record", "rank", "gb", "win", "loss", "save", "time", "day_night",
    "attendance", "cli", "year", "team"
]

# Columns kept in each per-season frame, in output order.
OUTPUT_COLUMNS = [
    "gm", "game_date", "home_away", "opp", "result", "r", "ra", "record",
    "rank", "gb", "time", "time_minutes", "day_night", "attendance",
    "year", "wins", "losses", "win_pct", "game_day", "team"
]


def _fetch_schedule_table(team, year):
    """Download one team-season schedule page through the proxy and parse its first table.

    Args:
        team: Team code inserted into the Baseball-Reference URL (e.g. 'NYY').
        year: Season inserted into the URL.

    Returns:
        The first table on the page as a DataFrame, or None when the proxy
        answers with a non-200 status or the page contains no table (a message
        is printed in both cases).

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    url = f"https://www.baseball-reference.com/teams/{team}/{year}-schedule-scores.shtml"
    response = requests.get(
        'https://proxy.scrapeops.io/v1/',
        params={
            'api_key': proxy_service_key,
            'url': url,
            'premium': 'true'
        },
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )

    if response.status_code != 200:
        print(f"Failed to retrieve data for {team} in {year} (Status Code: {response.status_code})")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        return pd.read_html(StringIO(str(soup)))[0]
    except ValueError as e:
        # pd.read_html raises ValueError when the document has no <table>.
        print(f"No table found for {team} in {year}. Error: {e}")
        return None


def _clean_schedule_table(raw, team, year):
    """Turn a raw schedule table into the tidy per-game frame used downstream.

    Args:
        raw: DataFrame as returned by pd.read_html for one team-season page.
        team: Team code stored in the 'team' column.
        year: Season stored (as a string) in the 'year' column and used to
            build 'game_date'.

    Returns:
        A new DataFrame containing OUTPUT_COLUMNS.

    Raises:
        ValueError: if the table does not have the expected number of columns,
            so the positional rename cannot be applied safely.
    """
    # Drop repeated header rows and not-yet-played games, then discard unused columns.
    src = (raw.query("Tm != 'Tm' and Inn != 'Game Preview, and Matchups'")
              .drop(["Unnamed: 2", "Streak", "Orig. Scheduled"], axis=1, errors='ignore')
              .rename(columns={"Unnamed: 4": "home_away"})
              .assign(season=year, team=team))

    # The rename below is positional, so fail with a readable message when the
    # page layout differs instead of a bare length-mismatch error.
    if len(src.columns) != len(SCHEDULE_COLUMNS):
        raise ValueError(
            f"expected {len(SCHEDULE_COLUMNS)} columns, found {len(src.columns)}: {list(src.columns)}"
        )
    src.columns = list(SCHEDULE_COLUMNS)

    src["gm"] = pd.to_numeric(src["gm"], errors='coerce')
    src["year"] = src["year"].astype(str)

    # The date cell holds "<weekday>, <Mon D>" with an optional " (1)"/" (2)" doubleheader tag.
    src[["weekday", "date"]] = src["date"].str.split(", ", expand=True)
    src["date"] = src["date"].str.replace(" (1)", "", regex=False).str.replace(" (2)", "", regex=False)
    src["game_date"] = pd.to_datetime(src["date"] + ", " + src["year"], format="%b %d, %Y", errors='coerce')

    # "@" marks a road game; an empty cell is treated as a home game.
    src["home_away"] = src["home_away"].replace("@", "away").fillna("home")

    # Cast to str first: when every value in the column is numeric, read_html
    # yields a float column and the .str accessor would raise, losing the season.
    # NOTE(review): a lead ("up 1.5" -> "+1.5") and a deficit ("1.5") both end up
    # as +1.5 here, so the sign of the gap is not preserved — confirm this is intended.
    gb_text = (src["gb"].astype(str)
                        .str.replace("up ", "up", regex=False)
                        .str.replace("up", "+", regex=False)
                        .str.replace("Tied", "0", regex=False))
    src["gb"] = pd.to_numeric(gb_text, errors='coerce').fillna(0)

    src["attendance"] = pd.to_numeric(src["attendance"], errors='coerce').fillna(0).astype(int)
    src[["r", "ra"]] = src[["r", "ra"]].apply(pd.to_numeric, errors='coerce')

    # "H:MM" -> "H:MM:00" so it parses as a duration; unparseable values count as 0 minutes.
    src["time"] = src["time"].fillna("0:00") + ":00"
    src["time_minutes"] = pd.to_timedelta(src["time"], errors='coerce').dt.total_seconds() / 60
    src["time_minutes"] = src["time_minutes"].fillna(0).astype(int)

    src[['wins', 'losses']] = src['record'].str.split('-', expand=True).fillna(0).astype(int)
    src['win_pct'] = (src['wins'] / src['gm']).round(2)
    src['game_day'] = src['game_date'].dt.day_name()
    # Keep only the part before any "-" suffix in the result cell.
    src["result"] = src["result"].str.split("-", expand=True)[0]

    return src[OUTPUT_COLUMNS].copy()


# One cleaned DataFrame per successfully scraped team-season.
team_dfs_list = []

for team, (start_year, end_year) in teams_year_ranges.items():
    for year in range(start_year, end_year + 1):
        print(f"Processing {team} - {year}")
        try:
            raw_table = _fetch_schedule_table(team, year)
            if raw_table is None:
                continue

            src_df = _clean_schedule_table(raw_table, team, year)

            # Only add if valid data is processed
            if not src_df.empty:
                team_dfs_list.append(src_df)
            else:
                print(f"Empty dataframe for {team} in {year}. Skipping.")

        except Exception as e:
            # Best effort: one bad season (network error, layout change) must not abort the run.
            print(f"Error processing {team} in {year}: {e}")

Processing NYY - 1920
Processing NYY - 1921
Processing NYY - 1922
Processing NYY - 1923
Processing NYY - 1924
Processing NYY - 1925
Processing NYY - 1926
Processing NYY - 1927
Processing NYY - 1928
Processing NYY - 1929
Processing NYY - 1930
Processing NYY - 1931
Processing NYY - 1932
Processing NYY - 1933
Processing NYY - 1934
Processing NYY - 1935
Processing NYY - 1936
Processing NYY - 1937
Processing NYY - 1938
Processing NYY - 1939
Processing NYY - 1940
Processing NYY - 1941
Processing NYY - 1942
Processing NYY - 1943
Processing NYY - 1944
Processing NYY - 1945
Processing NYY - 1946
Processing NYY - 1947
Processing NYY - 1948
Processing NYY - 1949
Processing NYY - 1950
Processing NYY - 1951
Processing NYY - 1952
Processing NYY - 1953
Processing NYY - 1954
Processing NYY - 1955
Processing NYY - 1956
Processing NYY - 1957
Processing NYY - 1958
Processing NYY - 1959
Processing NYY - 1960
Processing NYY - 1961
Processing NYY - 1962
Processing NYY - 1963
Processing NYY - 1964
Processing

In [13]:
# Stack every per-season frame into one table with a fresh 0..n-1 index.
# When nothing was scraped, all_games_df is deliberately left unassigned.
if not team_dfs_list:
    print("No data collected.")
else:
    all_games_df = pd.concat(team_dfs_list, ignore_index=True)
    print("Data collection complete!")

Data collection complete!


In [14]:
# Write the combined table as JSON, CSV and Parquet under one shared file stem.
# NOTE(review): the stem says 1924 while teams_year_ranges starts at 1920 — confirm which
# is intended; the paths are left unchanged so existing readers keep working.
output_stem = '../data/standings/dodgers_yankees_history_1924_2024'

# all_games_df only exists when at least one season was collected; look it up
# defensively so an empty run reports that instead of raising NameError.
games_to_save = globals().get('all_games_df')

if games_to_save is None:
    print("No data to save.")
else:
    # The writers do not create missing parent directories.
    os.makedirs(os.path.dirname(output_stem), exist_ok=True)
    games_to_save.to_json(f'{output_stem}.json', indent=4, orient='records')
    games_to_save.to_csv(f'{output_stem}.csv', index=False)
    games_to_save.to_parquet(f'{output_stem}.parquet', index=False)