In [None]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats, pitching_stats, team_batting, team_pitching, playerid_lookup, schedule_and_record, standings
from google.cloud import storage

# Set up GCP credentials.
# setdefault keeps any GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the machine-specific path below only acts
# as a fallback instead of overriding every other setup.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/suhholee/baseball-ml-gcp-credentials.json",
)

In [None]:
# Initialize the Cloud Storage client used by the cells below.
# No explicit credentials are passed here, so this presumably relies on the
# GOOGLE_APPLICATION_CREDENTIALS variable set in the first cell - confirm.
storage_client = storage.Client()

In [11]:
# Get the GCS bucket, creating it only when it does not exist yet.
# create_bucket() fails with a 409 Conflict once the bucket exists, which made
# this cell fail on every re-run; looking it up first keeps the cell idempotent.
bucket_name = "baseball-ml-data"
bucket = storage_client.lookup_bucket(bucket_name)  # None when the bucket is missing
if bucket is None:
    bucket = storage_client.create_bucket(bucket_name)

In [None]:
def fetch_and_save_data(data_function, years, filename_prefix,
                        local_dir="../data/raw", gcs_bucket=None):
    """Fetch one dataset per season, combine them, save to CSV and upload to GCS.

    Args:
        data_function: Callable invoked as ``data_function(year)``; expected to
            return a pandas DataFrame for that season.
        years: Iterable of seasons to fetch. It is materialized into a list,
            so one-shot iterators (e.g. generators) are accepted.
        filename_prefix: Prefix of the CSV file name and of the GCS object
            name; also used in the progress messages.
        local_dir: Directory the combined CSV is written to. It is created if
            missing. Defaults to ``"../data/raw"``.
        gcs_bucket: Bucket object the CSV is uploaded to. When ``None`` the
            notebook-level ``bucket`` variable is used.

    Returns:
        The combined DataFrame with an added ``season`` column, or ``None``
        when no season could be fetched.

    Notes:
        A season whose fetch raises is reported and skipped (best effort).
        The file name uses the min/max of the *requested* years, even if some
        of them failed to download.
    """
    # Materialize once: `years` is iterated below and then passed to
    # min()/max(), which would fail on an already-consumed iterator.
    years = list(years)
    all_data = []

    for year in years:
        print(f"Fetching {filename_prefix} data for {year}...")
        try:
            # assign() returns a new frame, so the object handed back by
            # data_function (possibly shared or cached) is never mutated.
            year_data = data_function(year).assign(season=year)
            all_data.append(year_data)
            print(f"Retrieved {len(year_data)} records")
        except Exception as e:
            # Deliberate best effort: report the failing season and move on.
            print(f"Error fetching data for {year}: {e}")

    if not all_data:
        print("No data collected.")
        return None

    # Combine all years
    combined_data = pd.concat(all_data, ignore_index=True)

    # Save locally
    file_name = f"{filename_prefix}_{min(years)}-{max(years)}.csv"
    local_path = os.path.join(local_dir, file_name)
    if local_dir:
        # An empty local_dir means "current directory"; nothing to create.
        os.makedirs(local_dir, exist_ok=True)
    combined_data.to_csv(local_path, index=False)
    print(f"Saved combined data to {local_path}")

    # Upload to GCS
    if gcs_bucket is None:
        gcs_bucket = bucket  # notebook-level bucket created in an earlier cell
    blob = gcs_bucket.blob(f"raw/{file_name}")
    blob.upload_from_filename(local_path)
    print(f"Uploaded to GCS: gs://{gcs_bucket.name}/{blob.name}")

    return combined_data

In [None]:
# Download, save and upload each dataset for the 2016-2024 seasons
# (the end of the range is exclusive).
years = [*range(2016, 2025)]
batting_data = fetch_and_save_data(
    data_function=batting_stats, years=years, filename_prefix="batting"
)
pitching_data = fetch_and_save_data(
    data_function=pitching_stats, years=years, filename_prefix="pitching"
)
team_batting_data = fetch_and_save_data(
    data_function=team_batting, years=years, filename_prefix="team_batting"
)
team_pitching_data = fetch_and_save_data(
    data_function=team_pitching, years=years, filename_prefix="team_pitching"
)

Fetching batting data for 2016...
Retrieved 146 records
Fetching batting data for 2017...
Retrieved 144 records
Fetching batting data for 2018...
Retrieved 141 records
Fetching batting data for 2019...
Retrieved 135 records
Fetching batting data for 2020...
Retrieved 142 records
Fetching batting data for 2021...
Retrieved 132 records
Fetching batting data for 2022...
Retrieved 130 records
Fetching batting data for 2023...
Retrieved 134 records
Fetching batting data for 2024...
Retrieved 129 records
Saved combined data to data/raw/batting_2016-2024.csv
Uploaded to GCS: gs://baseball-ml-data/raw/batting_2016-2024.csv
Fetching pitching data for 2016...
Retrieved 74 records
Fetching pitching data for 2017...
Retrieved 58 records
Fetching pitching data for 2018...
Retrieved 58 records
Fetching pitching data for 2019...
Retrieved 61 records
Fetching pitching data for 2020...
Retrieved 40 records
Fetching pitching data for 2021...
Retrieved 39 records
Fetching pitching data for 2022...
Retrie

In [53]:
if batting_data is not None:
    print("\nTop 10 batting averages (2024):")

    # Keep only 2024 rows, then only rows with AB >= 300.
    batting_2024 = batting_data.loc[batting_data['season'] == 2024]
    qualified = batting_2024.loc[batting_2024['AB'] >= 300]

    # Highest AVG first; show a few columns for the first ten rows.
    leaderboard_columns = ['Name', 'Team', 'AVG', 'HR', 'RBI']
    ranked_by_avg = qualified.sort_values('AVG', ascending=False)
    print(ranked_by_avg[leaderboard_columns].head(10))


Top 10 batting averages (2024):
                       Name   Team    AVG  HR  RBI
1105         Bobby Witt Jr.    KCR  0.332  32  109
1115  Vladimir Guerrero Jr.    TOR  0.323  30  103
1104            Aaron Judge    NYY  0.322  58  144
1213            Luis Arraez  - - -  0.314   4   46
1106          Shohei Ohtani    LAD  0.310  54  130
1118         Yordan Alvarez    HOU  0.308  35   86
1123          Marcell Ozuna    ATL  0.302  39  104
1167            Yainer Diaz    HOU  0.299  16   84
1140            Jose Altuve    HOU  0.295  20   65
1138            Trea Turner    PHI  0.295  21   62
