# Import Packages

In [1]:
import datetime
import io
import pandas as pd
import requests

# Extract MLB Data

In [2]:
def get_single_day_data(date_str, timeout=(10, 120)):
    """
    Retrieve MLB Savant data for a specific day without pybaseball.

    Parameters:
        date_str (str): Date in "YYYY-MM-DD" format.
        timeout (float or tuple): Timeout in seconds handed to ``requests.get``.
            Either a single number or a ``(connect, read)`` pair. Defaults to
            ``(10, 120)`` so that a stalled connection cannot block forever.

    Returns:
        pd.DataFrame: Data for the specified day. Returns an empty DataFrame if
        the response body contains no CSV data.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Construct the URL with the date parameter to fetch the data.
    # The same date is used for both bounds so that exactly one day is requested.
    url = (
        "https://baseballsavant.mlb.com/statcast_search/csv?"
        "all=true&player_type=pitcher&game_date_gt={date}&game_date_lt={date}"
        "&type=details&"
    ).format(date=date_str)

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
    response.raise_for_status()

    csv_text = response.content.decode('utf-8')
    try:
        return pd.read_csv(io.StringIO(csv_text))
    except pd.errors.EmptyDataError:
        # An empty body has no header row to parse; honour the documented
        # contract by returning an empty DataFrame rather than raising.
        return pd.DataFrame()

def extract_mlb_data(start_date, end_date):
    """
    Collect MLB Savant data for every day from start_date through end_date.

    Both endpoints are included. Each day is fetched with
    get_single_day_data, and days that come back empty are dropped before
    the remaining frames are stacked.

    Parameters:
        start_date (datetime.date): First day to retrieve.
        end_date (datetime.date): Last day to retrieve.

    Returns:
        pd.DataFrame: All retrieved rows with a fresh 0..n-1 index, or an empty
        DataFrame when no day produced data (including when start_date is
        later than end_date).
    """
    # Number of calendar days in the inclusive range; zero or negative means
    # there is nothing to fetch and the loop below does not run.
    day_count = (end_date - start_date).days + 1

    daily_frames = []
    for offset in range(day_count):
        day = start_date + datetime.timedelta(days=offset)
        # The single-day fetcher expects the date as "YYYY-MM-DD"
        frame = get_single_day_data(day.strftime("%Y-%m-%d"))
        if frame.empty:
            continue
        daily_frames.append(frame)

    # Nothing was collected, so hand back an empty DataFrame
    if not daily_frames:
        return pd.DataFrame()

    return pd.concat(daily_frames, ignore_index=True)

In [3]:
# Date window to download, both ends included (here 2024-05-01 through 2024-05-10)
start_date = datetime.date(year=2024, month=5, day=1)
end_date = datetime.date(year=2024, month=5, day=10)

# Fetch every day in the window and combine the results into one DataFrame
df = extract_mlb_data(start_date=start_date, end_date=end_date)

# Preview the first 5 rows of the combined DataFrame
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,CU,2024-05-01,81.8,-2.25,5.29,"Wheeler, Zack",650859,554430,field_out,hit_into_play,...,3,2,6.0,1.0,5.0,2.0,4.13,-0.65,0.65,26.5
1,FC,2024-05-01,91.8,-2.3,5.1,"Wheeler, Zack",650859,554430,,foul,...,3,2,6.0,1.0,5.0,2.0,1.79,0.2,-0.2,24.1
2,FC,2024-05-01,91.7,-2.25,5.08,"Wheeler, Zack",650859,554430,,ball,...,3,2,6.0,1.0,5.0,2.0,2.19,0.05,-0.05,22.7
3,FF,2024-05-01,95.1,-2.23,5.15,"Wheeler, Zack",650859,554430,,ball,...,3,2,6.0,1.0,5.0,2.0,1.34,0.93,-0.93,25.6
4,SL,2024-05-01,89.7,1.06,6.01,"Sandoval, Patrick",547180,663776,strikeout,called_strike,...,3,2,5.0,1.0,6.0,2.0,2.71,-0.77,-0.77,56.3
