# LA Dodgers pitching
> This notebook downloads the Los Angeles Dodgers' current-season pitching table from [Baseball Reference](https://www.baseball-reference.com/teams/LAD/2024-pitching.shtml#all_team_pitching), extracts the team-totals and league-rank rows, and saves them in CSV, JSON and Parquet formats (locally and to S3) for later analysis and visualization.

---

#### Import Python tools and Jupyter config

In [1]:
import os
import boto3
import pandas as pd
import jupyter_black
from io import BytesIO
from io import StringIO
from tqdm.notebook import tqdm

In [2]:
# Auto-format code cells with Black as they are executed.
jupyter_black.load()

# Loosen pandas display limits so wide tables are shown without truncation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

# AWS credentials come from the environment; each is None when the variable is unset.
aws_key_id = os.getenv("HAEKEO_AWS_KEY")
aws_secret_key = os.getenv("HAEKEO_AWS_SECRET")

In [3]:
# Build a boto3 session from the credentials read from the environment above.
# NOTE(review): the session object is not bound to a name, so later cells cannot
# reuse it; save_to_s3 creates its own session from a named AWS profile instead.
# Confirm whether this cell is still needed.
boto3.Session(
    aws_access_key_id=aws_key_id,
    aws_secret_access_key=aws_secret_key,
    region_name="us-west-1",
)

Session(region_name='us-west-1')

---

## Fetch

#### Pitching for the current season

In [4]:
# Four-digit current year as a string; it is interpolated into the page URL
# and stored in the `season` column.
year = str(pd.to_datetime("now").year)

In [5]:
# Baseball Reference team pitching page for the season computed above.
url = "https://www.baseball-reference.com/teams/LAD/{}-pitching.shtml#all_team_pitching".format(
    year
)

---

## Team stats

In [6]:
# Read the first HTML table on the page and keep only the rows with no rank
# value that still have at least seven populated cells, then tag them with the
# season and lower-case every column name.
summary_df = (
    pd.read_html(url)[0]
    .query("Rk.isna() and Rk != 'Rk'")
    .dropna(thresh=7)
    .assign(season=year)
    .rename(columns=str.lower)
)

#### Ranks

In [7]:
# League-rank row only, with the columns that are empty for that row removed.
ranks = (
    summary_df.loc[summary_df["name"] == "Rank in 15 NL teams"]
    .dropna(axis=1)
    .reset_index(drop=True)
    .copy()
)

#### Totals

In [8]:
# Team-totals row only, with the columns that are empty for that row removed.
totals = (
    summary_df.loc[summary_df["name"] == "Team Totals"]
    .dropna(axis=1)
    .reset_index(drop=True)
    .copy()
)

In [9]:
# Spot-check: the team's ERA rank from the first (only) row.
ranks["era"].iat[0]

'1'

In [10]:
# Spot-check: the team's ERA from the first (only) row.
totals["era"].iat[0]

'3.16'

In [11]:
# Preview the filtered summary rows; the recorded output shows the
# "Team Totals" and "Rank in 15 NL teams" rows.
summary_df.head()

Unnamed: 0,rk,pos,name,age,w,l,w-l%,era,g,gs,gf,cg,sho,sv,ip,h,r,er,hr,bb,ibb,so,hbp,bk,wp,bf,era+,fip,whip,h9,hr9,bb9,so9,so/w,season
25,,,Team Totals,29.5,29,15,0.659,3.16,44.0,44.0,44.0,0,0,14,396.1,307,152,139,44,131,4.0,368,18.0,1.0,18.0,1609.0,131.0,3.85,1.105,7.0,1.0,3.0,8.4,2.81,2024
26,,,Rank in 15 NL teams,,2,13,,1.0,,,,4,4,2,2.0,3,2,3,7,5,,6,,,,,,,,,,,,,2024


---

## Export

#### Function to save dataframes with different formats and file extensions

In [12]:
def save_dataframe(df, path_without_extension, formats):
    """
    Write a DataFrame to disk once per requested format.

    Each output file is named ``<path_without_extension>.<format>``.

    :param df: DataFrame to write.
    :param path_without_extension: Destination path without a file extension.
    :param formats: Iterable of format names -- 'csv', 'json', 'parquet'.
        Any other name is reported on stdout and skipped.
    """
    for extension in formats:
        destination = f"{path_without_extension}.{extension}"

        if extension == "csv":
            df.to_csv(destination, index=False)
            continue

        if extension == "json":
            # Records orientation, pretty-printed with a four-space indent.
            df.to_json(destination, indent=4, orient="records")
            continue

        if extension == "parquet":
            df.to_parquet(destination, index=False)
            continue

        print(f"Unsupported format: {extension}")

#### Save local files

In [13]:
formats = ["csv", "json", "parquet"]

# Destination path (without extension) -> frame to write; totals first, then ranks.
local_exports = {
    "../data/pitching/dodgers_pitching_totals_current": totals,
    "../data/pitching/dodgers_pitching_ranks_current": ranks,
}
for export_path, export_df in local_exports.items():
    save_dataframe(export_df, export_path, formats)

In [14]:
def save_to_s3(
    df, base_path, s3_bucket, formats=["csv", "json", "parquet"], profile_name="default"
):
    """
    Serialize a Pandas DataFrame in memory and upload one object per format to an
    S3 bucket, using a specified AWS profile.

    Unsupported format names are reported on stdout and skipped, so they neither
    raise nor cause a previous format's bytes to be uploaded under the wrong key.

    :param df: DataFrame to save.
    :param base_path: S3 object key without the format extension.
    :param s3_bucket: S3 bucket name.
    :param formats: Iterable of formats to save -- 'csv', 'json', 'parquet'.
        The default list is only read, never mutated.
    :param profile_name: AWS CLI profile name to use for credentials.
    :return: None. Prints one confirmation line per uploaded object.
    """
    session = boto3.Session(profile_name=profile_name)
    bucket = session.resource("s3").Bucket(s3_bucket)

    for fmt in formats:
        # A fresh buffer per format so no bytes can leak between iterations.
        buffer = BytesIO()

        if fmt == "csv":
            df.to_csv(buffer, index=False)
            content_type = "text/csv"
        elif fmt == "json":
            # NOTE(review): this writes JSON Lines (one record per line), whereas
            # save_dataframe writes an indented JSON array -- confirm which shape
            # downstream readers of the S3 copy expect.
            df.to_json(buffer, orient="records", lines=True)
            content_type = "application/json"
        elif fmt == "parquet":
            df.to_parquet(buffer, index=False)
            content_type = "application/octet-stream"
        else:
            # Same message save_dataframe prints for an unknown format.
            print(f"Unsupported format: {fmt}")
            continue

        file_path = f"{base_path}.{fmt}"
        # Rewind so the upload reads the buffer from the start.
        buffer.seek(0)
        bucket.put_object(Key=file_path, Body=buffer, ContentType=content_type)
        print(f"Uploaded {fmt} to {s3_bucket}/{file_path}")

In [15]:
# Save to S3: object key (without extension) -> frame to upload; totals first, then ranks.
s3_exports = {
    "dodgers/data/pitching/dodgers_pitching_totals_current": totals,
    "dodgers/data/pitching/dodgers_pitching_ranks_current": ranks,
}
for s3_key, s3_df in s3_exports.items():
    save_to_s3(s3_df, s3_key, "stilesdata.com", profile_name="haekeo")

Uploaded csv to stilesdata.com/dodgers/data/pitching/dodgers_pitching_totals_current.csv
Uploaded json to stilesdata.com/dodgers/data/pitching/dodgers_pitching_totals_current.json
Uploaded parquet to stilesdata.com/dodgers/data/pitching/dodgers_pitching_totals_current.parquet
Uploaded csv to stilesdata.com/dodgers/data/pitching/dodgers_pitching_ranks_current.csv
Uploaded json to stilesdata.com/dodgers/data/pitching/dodgers_pitching_ranks_current.json
Uploaded parquet to stilesdata.com/dodgers/data/pitching/dodgers_pitching_ranks_current.parquet


In [16]:
# Save a copy of notebook as python script
# NOTE(review): the output script is numbered 05 while the source notebook is
# numbered 08 -- confirm the mismatch is intentional.
!jupyter nbconvert --to script --no-prompt --output ../scripts/05_fetch_process_pitching 08_fetch_process_pitching.ipynb

[NbConvertApp] Converting notebook 08_fetch_process_pitching.ipynb to script
[NbConvertApp] Writing 4387 bytes to ../scripts/05_fetch_process_pitching.py
