# Import Packages

In [1]:
import datetime
import io
import pandas as pd
import requests

# Extract MiLB Data

In [2]:
def get_single_day_data(date_str, timeout=60):
    """
    Retrieve MiLB Savant data for a specific day.

    Parameters:
        date_str (str): Date in "YYYY-MM-DD" format.
        timeout (float | tuple | None): Timeout in seconds forwarded to
            ``requests.get``. Defaults to 60 so a stalled connection cannot
            block a multi-day extraction forever. Pass ``None`` to wait
            indefinitely.

    Returns:
        pd.DataFrame: Data for the specified day. Returns an empty DataFrame
        if the response body contains no CSV content.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Construct the URL with the date parameter to fetch the data.
    # The same date is used for both bounds so exactly one day is requested.
    url = (
        "https://baseballsavant.mlb.com/statcast-search-minors/csv?"
        "all=true&player_type=pitcher&game_date_gt={date}&game_date_lt={date}"
        "&type=details&minors=true&"
    ).format(date=date_str)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as CSV
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    try:
        data = pd.read_csv(io.StringIO(csv_text))
    except pd.errors.EmptyDataError:
        # An empty body has no header row to parse; honour the documented
        # contract by returning an empty DataFrame rather than raising.
        return pd.DataFrame()
    return data

def extract_milb_data(start_date, end_date):
    """
    Collect MiLB Savant data day by day and stack it into one DataFrame.

    Parameters:
        start_date (datetime.date): First day to download (inclusive).
        end_date (datetime.date): Last day to download (inclusive).

    Returns:
        pd.DataFrame: All non-empty daily results concatenated with a fresh
        index. An empty DataFrame is returned when no day produced any data,
        including when start_date is later than end_date.
    """
    one_day = datetime.timedelta(days=1)
    # Number of days in the inclusive range; zero or negative means no iterations
    day_count = (end_date - start_date).days + 1

    daily_frames = []
    for offset in range(day_count):
        day = start_date + offset * one_day
        # The per-day fetch expects the date as "YYYY-MM-DD"
        frame = get_single_day_data(day.strftime("%Y-%m-%d"))
        if frame.empty:
            # Days without data contribute nothing to the result
            continue
        daily_frames.append(frame)

    # pd.concat rejects an empty list, so fall back to an empty DataFrame
    return pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

In [4]:
# Date range to download, both ends inclusive (here 2024-05-01 through 2024-05-10)
start_date, end_date = datetime.date(2024, 5, 1), datetime.date(2024, 5, 10)

# Download every day in the range and combine the results
df = extract_milb_data(start_date=start_date, end_date=end_date)

# Show the first 5 rows of the combined dataframe
df.head(5)

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,FF,2024-05-01,92.5,1.15,6.07,"Povich, Cade",657562,700249,strikeout,called_strike,...,3,2,,,,,1.16,0.79,0.79,
1,ST,2024-05-01,80.0,1.24,5.93,"Povich, Cade",657562,700249,,foul,...,3,2,,,,,3.22,-1.12,-1.12,
2,FF,2024-05-01,90.7,1.1,6.05,"Povich, Cade",657562,700249,,ball,...,3,2,,,,,1.1,0.59,0.59,
3,FC,2024-05-01,85.7,1.18,5.96,"Povich, Cade",657562,700249,,swinging_strike,...,3,2,,,,,2.35,-0.38,-0.38,
4,FF,2024-05-01,91.9,1.11,6.12,"Povich, Cade",657562,700249,,ball,...,3,2,,,,,1.07,0.64,0.64,
