# Import Packages

In [7]:
import numpy as np
import pandas as pd
import requests
import math
import time
import datetime
import io

# Extracting Statcast data (e.g. MiLB Data)

In [8]:
def get_single_day_data(date_str, timeout=None):
    """
    Retrieve MiLB Savant data for a specific day.

    Parameters:
        date_str (str): Date in "YYYY-MM-DD" format.
        timeout (float | tuple | None): Passed straight to ``requests.get``.
            The default ``None`` waits indefinitely, as the search endpoint
            can be slow; pass a number of seconds to bound the wait.

    Returns:
        pd.DataFrame: Data for the specified day. Returns an empty DataFrame
        if the response body is empty.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # The same date is used for both bounds so the query covers one day only
    url = (
        "https://baseballsavant.mlb.com/statcast-search-minors/csv?"
        "all=true&player_type=pitcher&game_date_gt={date}&game_date_lt={date}"
        "&type=details&minors=true&"
    ).format(date=date_str)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page as CSV
    response.raise_for_status()

    text = response.content.decode('utf-8')
    if not text.strip():
        # Nothing to parse: honour the documented "empty DataFrame" contract
        return pd.DataFrame()
    try:
        return pd.read_csv(io.StringIO(text))
    except pd.errors.EmptyDataError:
        # Body had no parseable columns at all
        return pd.DataFrame()

def extract_milb_data(start_date, end_date):
    """
    Collect MiLB Savant data one day at a time and stack it into one DataFrame.

    Parameters:
        start_date (datetime.date): First day to fetch (inclusive).
        end_date (datetime.date): Last day to fetch (inclusive).

    Returns:
        pd.DataFrame: Rows from every day that returned data, re-indexed
        from 0. An empty DataFrame when no day produced any rows, which
        includes the case where start_date is after end_date.
    """
    one_day = datetime.timedelta(days=1)
    daily_frames = []

    day = start_date
    while day <= end_date:
        # The per-day fetcher takes the date as a "YYYY-MM-DD" string
        frame = get_single_day_data(day.strftime("%Y-%m-%d"))
        # Days without any rows are skipped rather than concatenated
        if not frame.empty:
            daily_frames.append(frame)
        day += one_day

    # Nothing collected: hand back a bare DataFrame
    if not daily_frames:
        return pd.DataFrame()
    return pd.concat(daily_frames, ignore_index=True)

In [9]:
# Date window to download, inclusive on both ends: 2024-05-01 through 2024-05-10
start_date, end_date = datetime.date(2024, 5, 1), datetime.date(2024, 5, 10)

# Fetch every day in the window and combine the results
df = extract_milb_data(start_date=start_date, end_date=end_date)

# Preview the first 5 rows of the combined data
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,FF,2024-05-01,92.5,1.15,6.07,"Povich, Cade",657562,700249,strikeout,called_strike,...,3,2,,,,,1.16,0.79,0.79,
1,ST,2024-05-01,80.0,1.24,5.93,"Povich, Cade",657562,700249,,foul,...,3,2,,,,,3.22,-1.12,-1.12,
2,FF,2024-05-01,90.7,1.1,6.05,"Povich, Cade",657562,700249,,ball,...,3,2,,,,,1.1,0.59,0.59,
3,FC,2024-05-01,85.7,1.18,5.96,"Povich, Cade",657562,700249,,swinging_strike,...,3,2,,,,,2.35,-0.38,-0.38,
4,FF,2024-05-01,91.9,1.11,6.12,"Povich, Cade",657562,700249,,ball,...,3,2,,,,,1.07,0.64,0.64,


# Estimating arm angle, approximating shoulder height as 70% of pitcher height

In [12]:
class EstimateArmAngle:
    """
    A class for estimating pitcher arm angles by leveraging height data
    fetched from MLB's API and combining it with pitch tracking metrics.

    The estimate is the angle, in degrees, of the line from an approximated
    shoulder position (a fixed fraction of the pitcher's height) to the
    release point, measured from the horizontal.
    """

    def __init__(self, request_timeout=10, request_delay=0.1):
        """
        Args:
            request_timeout (float | tuple | None): Timeout handed to
                ``requests.get`` for each height lookup.
            request_delay (float): Seconds to pause after each lookup that
                actually hit the network (cache hits do not pause).
        """
        # Base URL for MLB's Stats API
        self.base_url = "https://statsapi.mlb.com/api/v1"
        # Initialize cache to store pitcher heights and reduce API calls
        self.heights_cache = {}
        self.request_timeout = request_timeout
        self.request_delay = request_delay

    def convert_height_to_inches(self, height_str):
        """
        Convert a feet-and-inches height string (e.g. 6' 2") to total inches.

        Args:
            height_str (str): Height with an apostrophe after the feet and an
                optional inch mark after the inches.

        Returns:
            int | None: Total inches, or None when the value is not a string
            or cannot be parsed.
        """
        try:
            # Drop the inch mark, then split feet from inches on the apostrophe
            parts = height_str.replace('"', '').split("'")
            feet = int(parts[0].strip())
            # A bare feet value such as 6' has an empty inches part
            inches = int(parts[1].strip()) if parts[1].strip() else 0
            return feet * 12 + inches
        except (AttributeError, TypeError, ValueError, IndexError):
            return None

    def get_pitcher_height(self, pitcher_id):
        """
        Retrieve pitcher height from MLB API with caching for efficiency.

        Args:
            pitcher_id: Player ID used in the ``/people/{id}`` endpoint.

        Returns:
            int | None: Height in inches, or None when the request fails, the
            response carries no height, or the height cannot be parsed.
            Request errors are printed, not raised.
        """
        # Return cached height if available
        if pitcher_id in self.heights_cache:
            return self.heights_cache[pitcher_id]

        # A float-typed ID (e.g. from a column that held NaN) would otherwise
        # be formatted as ".../people/700249.0"
        url_id = pitcher_id
        if isinstance(pitcher_id, float) and pitcher_id.is_integer():
            url_id = int(pitcher_id)
        url = f"{self.base_url}/people/{url_id}"

        try:
            response = requests.get(url, timeout=self.request_timeout)
            if response.status_code == 200:
                data = response.json()
                if 'people' in data and len(data['people']) > 0:
                    height_str = data['people'][0].get('height')
                    if height_str:
                        height = self.convert_height_to_inches(height_str)
                        # Store in cache for future lookups
                        self.heights_cache[pitcher_id] = height
                        return height
            return None
        except Exception as e:
            # Best effort: one failed lookup must not abort the whole batch
            print(f"Error fetching data for pitcher ID {pitcher_id}: {e}")
            return None
        finally:
            # Throttle after every real request, successful or not, to
            # prevent API rate limiting
            time.sleep(self.request_delay)

    def add_pitcher_heights(self, df, pitcher_id_column='pitcher'):
        """
        Enrich baseball dataset with pitcher height information from MLB API.

        Args:
            df (pd.DataFrame): Pitch data containing ``pitcher_id_column``.
            pitcher_id_column (str): Column name for pitcher IDs.

        Returns:
            pd.DataFrame: Left merge of ``df`` with a ``height`` column
            (inches); pitchers without a height get NaN.
        """
        # Identify all unique pitchers in the dataset; missing IDs cannot be
        # looked up, so they are skipped
        pitcher_ids = df[pitcher_id_column].dropna().unique()
        print(f"Found {len(pitcher_ids)} unique pitchers")

        # Retrieve heights one pitcher at a time (cached IDs cost no request)
        print("Fetching height data from MLB API...")
        heights = {}
        for pitcher_id in pitcher_ids:
            height = self.get_pitcher_height(pitcher_id)
            if height:
                heights[pitcher_id] = height

        # Create lookup table for heights
        pitcher_heights_df = pd.DataFrame({
            pitcher_id_column: list(heights.keys()),
            'height': list(heights.values())
        })

        # Report success statistics
        print(f"Added height data for {len(heights)} pitchers")
        print(f"Missing height data for {len(pitcher_ids) - len(heights)} pitchers")

        # Join height data with original dataset, keeping every pitch row
        return pd.merge(df, pitcher_heights_df, on=pitcher_id_column, how='left')

    def calculate_arm_angle(self, df, shoulder_height_ratio=0.7):
        """
        Calculate estimated arm angle based on release point and pitcher height.

        Args:
            df (pd.DataFrame): Pitch data with ``release_pos_x`` and
                ``release_pos_z`` (multiplied by 12 here to get inches) and
                ``height`` (used as inches).
            shoulder_height_ratio (float): Fraction of ``height`` used as the
                estimated shoulder height.

        Returns:
            pd.DataFrame: Copy of ``df``. When at least one row has all three
            inputs, the columns ``release_pos_x_inches``,
            ``release_pos_z_inches``, ``sld_z`` and ``arm_angle_estimate``
            (degrees) are added; rows with incomplete inputs hold NaN there.
            If a required column is missing, an error is printed and the
            copy is returned unchanged.
        """
        # Make a copy to avoid modifying the original dataframe
        result_df = df.copy()

        # Check if required columns exist
        required_cols = ['release_pos_x', 'release_pos_z', 'height']
        missing_cols = [col for col in required_cols if col not in result_df.columns]
        if missing_cols:
            print(f"Error: Missing required columns: {missing_cols}")
            return result_df

        # Positional boolean mask of rows with all inputs present; selecting
        # by position keeps this correct for frames with repeated index labels
        valid = result_df[required_cols].notna().all(axis=1).to_numpy()
        n_valid = int(valid.sum())
        print(f"Found {n_valid} valid rows for arm angle calculation")

        # Only process rows with valid data
        if n_valid == 0:
            return result_df

        inputs = result_df.loc[valid, required_cols].astype(float)

        # Convert units to inches; the horizontal offset is taken as a
        # magnitude so either side of the rubber gives the same angle
        x_inches = inputs['release_pos_x'].abs().to_numpy() * 12
        z_inches = inputs['release_pos_z'].to_numpy() * 12

        # Estimated shoulder height as a fraction of player height
        shoulder_z = inputs['height'].to_numpy() * shoulder_height_ratio

        # Arm angle via arctangent (z/x); a zero horizontal offset yields
        # +/-90 degrees (or NaN for 0/0) without emitting warnings
        with np.errstate(divide='ignore', invalid='ignore'):
            angles = np.degrees(np.arctan((z_inches - shoulder_z) / x_inches))

        derived = {
            'release_pos_x_inches': x_inches,
            'release_pos_z_inches': z_inches,
            'sld_z': shoulder_z,
            'arm_angle_estimate': angles,
        }
        for name, values in derived.items():
            # Full-length column: computed values on valid rows, NaN elsewhere
            column = np.full(len(result_df), np.nan)
            column[valid] = values
            result_df[name] = column

        return result_df

    def process_pitch_data(self, df, pitcher_id_column='pitcher', shoulder_height_ratio=0.7):
        """
        Complete workflow to process pitch data: add heights and calculate arm angles.

        Args:
            df (pd.DataFrame): Raw pitch data
            pitcher_id_column (str): Column name for pitcher IDs
            shoulder_height_ratio (float): Fraction of pitcher height used as
                the estimated shoulder height

        Returns:
            pd.DataFrame: Processed data with added height and arm angle metrics
        """
        # Step 1: Add pitcher heights from MLB API
        df_with_heights = self.add_pitcher_heights(df, pitcher_id_column)

        # Step 2: Calculate arm angles
        return self.calculate_arm_angle(df_with_heights, shoulder_height_ratio)

In [14]:
# Create the estimator; it starts with an empty pitcher-height cache
arm_angle = EstimateArmAngle()

# Look up each pitcher's height and add the height and estimated
# arm-angle columns to the pitch dataframe
df_arm_angle = arm_angle.process_pitch_data(df)

# Display the first 5 records of the enriched dataframe
df_arm_angle.head()

Found 695 unique pitchers
Fetching height data from MLB API...
Added height data for 695 pitchers
Missing height data for 0 pitchers
Found 54294 valid rows for arm angle calculation


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,height,release_pos_x_inches,release_pos_z_inches,sld_z,arm_angle_estimate
0,FF,2024-05-01,92.5,1.15,6.07,"Povich, Cade",657562,700249,strikeout,called_strike,...,,1.16,0.79,0.79,,75,13.8,72.84,52.5,55.844439
1,ST,2024-05-01,80.0,1.24,5.93,"Povich, Cade",657562,700249,,foul,...,,3.22,-1.12,-1.12,,75,14.88,71.16,52.5,51.430173
2,FF,2024-05-01,90.7,1.1,6.05,"Povich, Cade",657562,700249,,ball,...,,1.1,0.59,0.59,,75,13.2,72.6,52.5,56.706437
3,FC,2024-05-01,85.7,1.18,5.96,"Povich, Cade",657562,700249,,swinging_strike,...,,2.35,-0.38,-0.38,,75,14.16,71.52,52.5,53.333072
4,FF,2024-05-01,91.9,1.11,6.12,"Povich, Cade",657562,700249,,ball,...,,1.07,0.64,0.64,,75,13.32,73.44,52.5,57.539432
