# LA Dodgers cumulative pitching statistics by season, 1958-2024
> This notebook visualizes current and past game-by-game and cumulative totals for strikeouts, walks, ERA, etc. using data from [Baseball Reference](https://www.baseball-reference.com/teams/tgl.cgi?team=LAD&t=p&year=2024).

---

In [8]:
#!/usr/bin/env python
# coding: utf-8

import os
import requests
import datetime
import pandas as pd
import jupyter_black
from io import BytesIO
import boto3
import logging
from tqdm.notebook import tqdm
from random import randint
from time import sleep
from IPython.display import Image

In [15]:
# Function to fetch data through proxy
def fetch_data_via_proxy(url, timeout=120):
    """Fetch ``url`` through the ScrapeOps proxy endpoint and return the raw body.

    Relies on the notebook-level ``my_api_key`` and ``headers`` globals, which
    must be defined before this function is called.

    Args:
        url: Address of the page to retrieve via the proxy.
        timeout: Seconds to wait on the proxy before giving up. Without a
            timeout ``requests`` can block indefinitely, which would stall
            a multi-season scrape on a single hung request.

    Returns:
        The response body as bytes, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status). Failures are
        printed rather than raised so a calling loop can keep going.
    """
    try:
        response = requests.get(
            'https://proxy.scrapeops.io/v1/',
            params={
                'api_key': my_api_key,
                'url': url,
            },
            headers=headers,
            timeout=timeout,
        )
        # Turn 4xx/5xx responses into exceptions so they follow the failure path
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

In [None]:
# Key for the scraping proxy, read from the environment (None when unset)
my_api_key = os.getenv('SCRAPE_PROXY_KEY')

# Browser-like request headers, listed in the order they are sent
headers = dict(
    [
        (
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
        ),
        (
            "Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        ),
        ("Accept-Language", "en-US,en;q=0.9"),
        ("Accept-Encoding", "gzip, deflate, br"),
    ]
)

In [17]:
# DataFrame list to store each year's data
dfs = []

# Loop through each season to scrape its game log. range() stops before 2024,
# so the archive covers the 1958 through 2023 seasons.
for year in tqdm(range(1958, 2024)):
    url = f"https://www.baseball-reference.com/teams/tgl.cgi?team=LAD&t=p&year={year}"
    html_content = fetch_data_via_proxy(url)
    # fetch_data_via_proxy returns None on failure; those seasons are skipped
    if html_content:
        # Hand read_html a file-like object instead of the raw payload:
        # pandas asks for literal string/bytes input to be wrapped in
        # StringIO/BytesIO. The game log is the second table parsed.
        log = pd.read_html(BytesIO(html_content))[1].assign(year=year)
        log.columns = log.columns.str.lower()
        dfs.append(log)
        sleep(randint(1, 5))  # Random sleep to avoid detection

  0%|          | 0/66 [00:00<?, ?it/s]

In [39]:
# Combine all the dataframes into one and drop rows whose "so" cell is the
# literal text "SO" (presumably header rows repeated inside the table body).
# .copy() makes the filtered result an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
archive_src = pd.concat(dfs, ignore_index=True).query('so != "SO"').copy()

In [41]:
# Build an ISO-formatted game_date for the archive from "date" plus "year".
# NOTE(review): Baseball Reference appears to tag doubleheader dates with a
# game-number suffix such as "Jul 4 (1)" -- confirm against the scraped data.
# Any such suffix is stripped first; otherwise errors="coerce" would silently
# turn those rows into missing dates.
archive_src["game_date"] = pd.to_datetime(
    archive_src["date"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
    + " "
    + archive_src["year"].astype(str),
    format="%b %d %Y",
    errors="coerce",
).dt.strftime("%Y-%m-%d")

In [64]:
# Trim the scraped table to the fields used from here on; .copy() detaches
# the result from archive_src so it can be modified independently.
keep_cols = "gtm year game_date h hr er so era".split()
archive_df = archive_src.loc[:, keep_cols].copy()

In [65]:
# Preview the last rows of the trimmed archive (end of the final scraped season).
archive_df.tail()

Unnamed: 0,gtm,year,game_date,h,hr,er,so,era
10760,158,2023,2023-09-27,7,2,2,13,4.03
10761,159,2023,2023-09-28,18,3,14,5,4.09
10762,160,2023,2023-09-29,2,1,2,7,4.08
10763,161,2023,2023-09-30,5,1,2,8,4.07
10764,162,2023,2023-10-01,3,2,2,9,4.06


In [71]:
# Counting stats that should be stored as integers
int_cols = ["gtm", "h", "hr", "er", "so"]

# Cast every numeric column in one pass: counting stats to int, ERA to float
archive_df = archive_df.astype({**dict.fromkeys(int_cols, int), "era": float})

# era_cum simply mirrors the per-row era value
archive_df["era_cum"] = archive_df["era"]

In [72]:
# List the column labels currently on archive_df.
archive_df.columns

Index(['gtm', 'year', 'game_date', 'h', 'hr', 'er', 'so', 'era', 'era_cum',
       'h_cum', 'hr_cum', 'er_cum', 'so_cum'],
      dtype='object')

In [73]:
# Season-to-date running totals: cumulative sums restart with each "year" group
running_totals = archive_df.groupby("year")[['h', 'hr', 'er', 'so']].cumsum()
for col in running_totals.columns:
    archive_df[f"{col}_cum"] = running_totals[col]

In [74]:
# Preview the first rows, including the newly added *_cum columns.
archive_df.head()

Unnamed: 0,gtm,year,game_date,h,hr,er,so,era,era_cum,h_cum,hr_cum,er_cum,so_cum
0,1,1958,1958-04-15,11,2,8,2,9.0,9.0,11,2,8,2
1,2,1958,1958-04-16,5,0,1,11,4.76,4.76,16,2,9,13
2,3,1958,1958-04-17,10,1,4,5,4.68,4.68,26,3,13,18
3,4,1958,1958-04-18,12,2,5,7,4.76,4.76,38,5,18,25
4,5,1958,1958-04-19,15,2,10,4,5.86,5.86,53,7,28,29


In [75]:
# Check each column's dtype after the numeric conversions.
archive_df.dtypes

gtm            int64
year           int64
game_date     object
h              int64
hr             int64
er             int64
so             int64
era          float64
era_cum      float64
h_cum          int64
hr_cum         int64
er_cum         int64
so_cum         int64
dtype: object

In [None]:
# TO DO: 

# Add archive to S3
# Write code to collect current year
# Combine archive and current
# Output combined to S3
# Chart hits, strikeouts side by side
# Chart wide era_cum this year vs. past

In [78]:
# archive_df.to_csv('../data/pitching/dodgers_historic_pitching_gamelogs.csv', index=False)
# archive_df.to_parquet('../data/pitching/dodgers_historic_pitching_gamelogs.parquet', index=False)
# archive_df.to_json('../data/pitching/dodgers_historic_pitching_gamelogs.json', indent=4, lines=False, orient='records')

In [None]:
# Combine current and archive data.
# NOTE(review): no earlier cell defines current_df; this assumes it holds the
# current season's pitching log prepared with the same columns as archive_df.
# Duplicates are dropped before the index is rebuilt so the final index has
# no gaps.
df = (
    pd.concat([current_df, archive_df])
    .sort_values(["year", "gtm"], ascending=[False, True])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Optimize DataFrame for output: game/season keys plus the cumulative pitching
# columns built for archive_df. The batting-only columns r_cum, 2b_cum and
# bb_cum do not exist in the pitching data and would raise a KeyError.
optimized_df = df[
    ["gtm", "year", "h_cum", "hr_cum", "er_cum", "so_cum", "era_cum"]
].copy()

Unnamed: 0,gtm,year,date,h,hr,er,so
0,1,1958,Apr 15,11,2,8,2
1,2,1958,Apr 16,5,0,1,11
2,3,1958,Apr 17,10,1,4,5
3,4,1958,Apr 18,12,2,5,7
4,5,1958,Apr 19,15,2,10,4
...,...,...,...,...,...,...,...
10760,158,2023,Sep 27,7,2,2,13
10761,159,2023,Sep 28,18,3,14,5
10762,160,2023,Sep 29,2,1,2,7
10763,161,2023,Sep 30,5,1,2,8


In [None]:
# Save to CSV or any other format if needed.
# Writes `df`, the combined current + archive frame; the name `combined_df`
# is never defined in this notebook and would raise a NameError.
df.to_csv('dodgers_historic_pitching_gamelogs.csv', index=False)

In [None]:
# Root logger: INFO and above, each line stamped with time and level
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# True only when the GITHUB_ACTIONS environment variable is exactly 'true'
is_github_actions = os.environ.get('GITHUB_ACTIONS') == 'true'

# AWS credentials (None when unset) and the region used for the session
aws_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = "us-west-1"

# Choose how the session authenticates: explicit keys from the environment
# on GitHub Actions, a named local profile everywhere else
if is_github_actions:
    session_options = {
        "aws_access_key_id": aws_key_id,
        "aws_secret_access_key": aws_secret_key,
    }
else:
    session_options = {"profile_name": "haekeo"}
session = boto3.Session(region_name=aws_region, **session_options)

s3_resource = session.resource("s3")

# Local data directory under the current working directory
# (it is only computed here; the directory is not created)
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'data', 'batting')

# NOTE(review): profile_name is read but not used in this cell; the local
# session above uses a hard-coded profile -- confirm which is intended.
profile_name = os.getenv("AWS_PERSONAL_PROFILE")
today = datetime.date.today()
year = today.year

# Replaces any earlier `headers` with a User-Agent-only set
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
}

# Fetch Archive game logs
# NOTE(review): this loads the *batting* archive and rebinds archive_df,
# replacing the pitching archive built earlier in the notebook -- confirm
# that is intended.
archive_url = "https://stilesdata.com/dodgers/data/batting/archive/dodgers_team_cumulative_batting_logs_1958_2023.parquet"
archive_df = pd.read_parquet(archive_url)

# Fetch Current game logs (t=b requests the batting log). Rows whose HR cell
# is the literal "HR" are dropped, and .copy() detaches the filtered result so
# the column assignments below do not trigger SettingWithCopyWarning.
current_url = f"https://www.baseball-reference.com/teams/tgl.cgi?team=LAD&t=b&year={year}"
current_df = (
    pd.read_html(current_url)[0].assign(year=year).query('HR != "HR"').copy()
)
current_df.columns = current_df.columns.str.lower()

# Process current game logs: build an ISO-formatted game_date.
# NOTE(review): Baseball Reference appears to tag doubleheader dates with a
# game-number suffix such as "Jul 4 (1)" -- confirm against the live table.
# Any such suffix is stripped first; otherwise errors="coerce" would silently
# turn those rows into missing dates.
current_df["game_date"] = pd.to_datetime(
    current_df["date"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
    + " "
    + current_df["year"].astype(str),
    format="%b %d %Y",
    errors="coerce",
).dt.strftime("%Y-%m-%d")

# Drop unnecessary columns
drop_cols = [
    "rk", "date", "unnamed: 3", "opp", "rslt", "ba", "obp", "slg", "ops", "lob", "#", "thr", "opp. starter (gmesc)"
]
current_df = current_df.drop(drop_cols, axis=1).copy()

# Define value columns
val_cols = [
    "gtm", "pa", "ab", "r", "h", "2b", "3b", "hr", "rbi", "bb", "ibb", "so", "hbp", "sh", "sf", "roe", "gdp", "sb", "cs"
]

# Convert value columns to integers
current_df[val_cols] = current_df[val_cols].astype(int)

# Calculate cumulative columns (running totals within the season)
for col in val_cols:
    current_df[f"{col}_cum"] = current_df.groupby("year")[col].cumsum()
# A running total of game numbers is meaningless, so discard it
current_df = current_df.drop("gtm_cum", axis=1)

# Stack the current season on top of the archive, order by newest season
# first and game number ascending, then renumber rows and drop exact duplicates
stacked = pd.concat([current_df, archive_df])
ordered = stacked.sort_values(by=["year", "gtm"], ascending=[False, True])
df = ordered.reset_index(drop=True).drop_duplicates()

# Slim frame for publishing: game/season keys plus selected cumulative totals
output_columns = [
    "gtm",
    "year",
    "r_cum",
    "h_cum",
    "2b_cum",
    "bb_cum",
    "so_cum",
    "hr_cum",
]
optimized_df = df.loc[:, output_columns].copy()

# Function to save DataFrame to S3
def save_to_s3(df, base_path, s3_bucket, formats):
    """Serialize ``df`` once per requested format and upload each copy to S3.

    Uses the module-level ``s3_resource`` for the upload. Each object is
    stored under the key ``"{base_path}.{fmt}"``.

    Args:
        df: DataFrame to serialize.
        base_path: Object key without the file extension.
        s3_bucket: Name of the destination bucket.
        formats: Iterable of format names; "csv", "json" and "parquet" are
            supported. Any other name is logged and skipped.

    Returns:
        None. Failures are logged per format rather than raised, so one bad
        format or upload does not stop the remaining ones.
    """
    for fmt in formats:
        try:
            buffer = BytesIO()
            if fmt == "csv":
                df.to_csv(buffer, index=False)
                content_type = "text/csv"
            elif fmt == "json":
                df.to_json(buffer, indent=4, orient="records", lines=False)
                content_type = "application/json"
            elif fmt == "parquet":
                df.to_parquet(buffer, index=False)
                content_type = "application/octet-stream"
            else:
                # Without this branch an unknown format would upload an empty
                # buffer using the previous iteration's content type (or fail
                # on an unbound content_type on the first iteration).
                logging.error(f"Unsupported format {fmt!r}; skipping upload")
                continue
            # Rewind so the upload reads the buffer from the beginning
            buffer.seek(0)
            s3_resource.Bucket(s3_bucket).put_object(Key=f"{base_path}.{fmt}", Body=buffer, ContentType=content_type)
            logging.info(f"Uploaded {fmt} to {s3_bucket}/{base_path}.{fmt}")
        except Exception as e:
            # Deliberately broad: uploads are best-effort, one per format
            logging.error(f"Failed to upload {fmt} to S3: {e}")

# Upload the optimized frame to S3 in every listed format. The local save is
# disabled, so file_path is computed but not used here.
# NOTE(review): the object key sits under a "batting" prefix but carries a
# "pitching" file name -- confirm the intended destination.
formats = ["csv", "json", "parquet"]
file_path = os.path.join(data_dir, 'dodgers_historic_pitching_gamelogs')
save_to_s3(
    df=optimized_df,
    base_path="dodgers/data/batting/archive/dodgers_historic_pitching_gamelogs",
    s3_bucket="stilesdata.com",
    formats=formats,
)