<a href="https://colab.research.google.com/github/sharinkan/K_StuffPlus/blob/main/load_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.4.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting Deprecated (from pygithub>=1.51->pybaseball)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyGithub-2.4.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.6/362.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━

In [None]:
import os
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import timedelta
from pybaseball import statcast, playerid_lookup, player_search_list, playerid_reverse_lookup, pitching_stats, batting_stats

# Statcast Data Scraping

In [None]:
# Scrape Statcast data by season
def get_statcast_by_season(year, start_date='03-15', end_date="10-01"):
    """Download Statcast pitch data for one season window and normalise its columns.

    Parameters
    ----------
    year : int or str
        Season; it is combined with the month-day bounds as "{year}-{MM-DD}".
    start_date, end_date : str
        Month-day strings ("MM-DD") for the start and end of the query window.

    Returns
    -------
    pd.DataFrame
        The frame returned by pybaseball's `statcast`, with a handful of
        columns renamed (see the mapping below) and all columns ordered
        alphabetically.
    """
    # Construct the start_date and end_date for the given year
    full_start_date = f"{year}-{start_date}"
    full_end_date = f"{year}-{end_date}"

    # Fetch Statcast data for the specified period.  Only one query is made, so
    # the result is used directly rather than concatenated onto an empty frame.
    season = statcast(start_dt=full_start_date, end_dt=full_end_date)

    # rename() returns a new frame and silently skips names that are absent.
    season = season.rename(
        columns={
            "away_team": "vis_team",
            "batter": "batterid",
            "pitcher": "pitcherid",
            "events": "event_type",
            "des": "event_description",
            "description": "pitch_description",
            "outs_when_up": "outs_before",
            "hit_distance_sc": "hit_distance",
            "at_bat_number": "pa_number",
            "bat_score": "bat_score_before",
            "game_pk": "gameid",
            "fld_score": "field_score",
            "release_spin_rate": "release_spin",
        }
    )

    # Order the columns alphabetically
    return season.reindex(sorted(season.columns), axis=1)


# Save statcast data to your personal directory
def df_to_csv(df, directory, filename):
    """Write `df` to `<directory>/<filename>` as CSV (without the index).

    Parameters
    ----------
    df : pd.DataFrame
        Frame to save.
    directory : str
        Target directory; created (including parents) when missing.  An empty
        string means the current working directory.
    filename : str
        Name of the CSV file inside `directory`.

    Returns
    -------
    str
        The path the file was written to, i.e. `os.path.join(directory, filename)`.
    """
    # exist_ok avoids the check-then-create race; the guard skips
    # os.makedirs(''), which raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Full path for the CSV file
    file_path = os.path.join(directory, filename)

    # Save the DataFrame as a CSV file
    df.to_csv(file_path, index=False)

    print(f"DataFrame saved to {file_path}")
    return file_path

# Data Manipulation

In [None]:
# Function to add batter, pitcher and fielder names
def id_to_name(df):
    """Attach `batter_name`, `pitcher_name` and `fielder_name` columns to `df`.

    For each of the id columns `batterid`, `pitcherid` and `fielderid`, the
    distinct non-null ids are passed to pybaseball's `playerid_reverse_lookup`
    and the resulting "name_first name_last" string is left-joined back on
    `key_mlbam`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain `batterid`, `pitcherid` and `fielderid`.

    Returns
    -------
    pd.DataFrame
        A new frame (result of `DataFrame.merge`) with the three name columns
        appended; ids without a lookup match get NaN names.
    """
    role_columns = (
        ('batterid', 'batter_name'),
        ('pitcherid', 'pitcher_name'),
        ('fielderid', 'fielder_name'),
    )

    for id_column, name_column in role_columns:
        # Missing ids (e.g. pitches with no fielder) cannot match a player,
        # so they are not sent to the lookup.
        ids = df[id_column].dropna().unique()
        lookup = playerid_reverse_lookup(ids)

        # Build the name column on a fresh frame (assign) rather than writing
        # into a column slice, and keep one row per id so the left merge can
        # never multiply pitch rows.
        names = (
            lookup[['key_mlbam', 'name_first', 'name_last']]
            .drop_duplicates(subset='key_mlbam')
            .assign(**{name_column: lambda t: t['name_first'] + ' ' + t['name_last']})
            .drop(columns=['name_first', 'name_last'])
        )

        df = df.merge(names, left_on=id_column, right_on='key_mlbam', how='left').drop(columns=['key_mlbam'])

    return df



# Function to format the Statcast data
def format_statcast(df):
    """Clean a raw (renamed) Statcast frame and derive per-pitch helper columns.

    Reads, among others: pitch_type, game_type, stand, p_throws, hit_location,
    pitcherid, fielder_2..fielder_9, inning_topbot, hc_x, hc_y, home_team,
    vis_team, on_1b/on_2b/on_3b, type, release_speed, pfx_x, pfx_z.

    Adds: is_lhb, is_lhp, fielderid, is_bottom, spray_angle, run_on_1b/2b/3b,
    is_bip, is_stk, velocity, pitch_bucket, IVB, HB, and (via `id_to_name`)
    batter_name, pitcher_name, fielder_name.

    Rows whose pitch type is not in the Fastball/Breaking/Offspeed buckets
    (including missing pitch types, which are first filled with 'UN') are
    dropped and the index is reset.

    Parameters
    ----------
    df : pd.DataFrame
        Statcast data with the column names produced by `get_statcast_by_season`.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is not modified.
    """
    # Work on a copy so re-running the cell on the same raw frame is idempotent.
    df = df.copy()

    # Fill missing pitch type with 'UN' for unknown.  Assign the result back:
    # fillna(inplace=True) on a column selection is chained assignment.
    df['pitch_type'] = df['pitch_type'].fillna('UN')

    # Expand game_type codes for clarity; codes not listed keep their raw value
    # instead of turning into NaN.
    game_type_names = {
        'E': 'Exhibition',
        'S': 'Spring Training',
        'R': 'Regular Season',
        'F': 'Wild Card',
        'D': 'Divisional Series',
        'L': 'League Championship Series',
        'W': 'World Series'
    }
    df['game_type'] = df['game_type'].map(game_type_names).fillna(df['game_type'])

    # Create binary handedness indicators
    df['is_lhb'] = np.where(df['stand'] == 'L', 1, 0)
    df['is_lhp'] = np.where(df['p_throws'] == 'L', 1, 0)

    # Create fielder ID to accompany hit_location
    fielderid_map = {
        1: 'pitcherid', 2: 'fielder_2', 3: 'fielder_3',
        4: 'fielder_4', 5: 'fielder_5', 6: 'fielder_6',
        7: 'fielder_7', 8: 'fielder_8', 9: 'fielder_9'
    }

    # Fill positionally with numpy arrays.  A label-aligned .loc assignment from
    # a re-indexed Series pairs rows with the wrong fielder whenever df's index
    # is not exactly 0..n-1.
    fielder_ids = np.full(len(df), np.nan)
    for position, id_column in fielderid_map.items():
        at_position = (df['hit_location'] == position).fillna(False).astype(bool).to_numpy()
        position_ids = pd.to_numeric(df[id_column], errors='coerce').astype('float64').to_numpy()
        fielder_ids[at_position] = position_ids[at_position]
    df['fielderid'] = fielder_ids

    # Create binary inning half indicator
    df['is_bottom'] = np.where(df['inning_topbot'] == 'Bot', 1, 0)

    # Add spray angle (degrees, one decimal).  125.42 / 198.27 are the origin
    # offsets applied to hc_x / hc_y -- presumably home plate in hit-coordinate
    # units; TODO confirm.
    df['spray_angle'] = np.round(np.degrees(np.arctan((df['hc_x'] - 125.42) / (198.27 - df['hc_y'])) * 0.75), 1)

    # Standardize team abbreviations (some deprecated)
    team_map = {
        'FLA': 'MIA', 'KC': 'KCR', 'SD': 'SDP',
        'SF': 'SFG', 'TB': 'TBR'
    }
    df['home_team'] = df['home_team'].replace(team_map)
    df['vis_team'] = df['vis_team'].replace(team_map)

    # Runner status: a 0 in on_Xb is treated as "no runner" (NaN)
    df['run_on_1b'] = df['on_1b'].replace(0, np.nan)
    df['run_on_2b'] = df['on_2b'].replace(0, np.nan)
    df['run_on_3b'] = df['on_3b'].replace(0, np.nan)

    # Pitch information
    df['is_bip'] = np.where(df['type'] == 'X', 1, 0)
    df['is_stk'] = np.where(df['type'] == 'S', 1, 0)

    # Velocity rounded half-up to the nearest whole number
    speed_floor = np.floor(df['release_speed'])
    df['velocity'] = np.where(df['release_speed'] >= speed_floor + 0.5, np.ceil(df['release_speed']), speed_floor)

    # Create Pitch Bucket Column
    conditions = [
        df['pitch_type'].isin(['FF', 'SI']),  # Fastballs (4-seam, sinker)
        df['pitch_type'].isin(['FC', 'SL', 'CU', 'ST']),  # Breaking (cutter, slider, curve, sweeper)
        df['pitch_type'].isin(['CH', 'FS'])  # Offspeed (changeup, splitter)
    ]
    choices = ['Fastball', 'Breaking', 'Offspeed']
    df['pitch_bucket'] = np.select(conditions, choices, default='Other')

    # Drop all non-relevant pitch types and renumber the rows
    df = df[df['pitch_bucket'] != 'Other'].reset_index(drop=True)

    # Convert ft to inches
    df['IVB'] = df['pfx_z'] * 12
    df['HB'] = df['pfx_x'] * 12

    df = id_to_name(df)

    return df


def format_statcast2(df):
    """Add pitch sequencing, base-out state and count-state columns.

    The frame is sorted by (gameid, pa_number, pitch_number) and the following
    columns are added or overwritten:

    - pitch_count: 1-based pitch index within (gameid, pa_number, pitcherid)
    - total_pitch: 1-based pitch index within (gameid, pitcherid)
    - pitch_number: renumbered 1..k within (gameid, pa_number, batterid)
    - is_last_pitch: True on the final pitch of each (gameid, pa_number, batterid)
    - basecode_before / basecode_after: three-character '0'/'1' string for
      runners on 1b/2b/3b; "after" is the next pitch's value within the same
      half-inning, '000' on the half-inning's last pitch
    - outs_after: next pitch's outs_before within the half-inning, 3 on the last
    - baseout_state_before / baseout_state_after: "<basecode> <outs>"
    - bat_score_after: next pitch's batting-team score within the half-inning
    - b_str, s_str, bs_count_before, bs_count_after: "<balls>-<strikes>" strings
    - full_state_before / full_state_after: "<basecode> <outs> <balls>-<strikes>"

    NOTE(review): on the last pitch of a half-inning bat_score_after falls back
    to bat_score_before, so runs scoring on that pitch are not reflected.

    Parameters
    ----------
    df : pd.DataFrame
        Output of `format_statcast` (needs run_on_1b/2b/3b, is_bottom, etc.).

    Returns
    -------
    pd.DataFrame
        A new, sorted frame (the original row labels are kept).
    """
    half_inning = ['gameid', 'inning', 'is_bottom']
    plate_appearance = ['gameid', 'pa_number', 'batterid']

    # sort_values returns a new frame, so the caller's frame is not modified.
    df = df.sort_values(by=['gameid', 'pa_number', 'pitch_number'])

    # Pitch count by each pitcher
    df['pitch_count'] = df.groupby(['gameid', 'pa_number', 'pitcherid']).cumcount() + 1
    df['total_pitch'] = df.groupby(['gameid', 'pitcherid']).cumcount() + 1
    # Correct pitch numbering
    df['pitch_number'] = df.groupby(plate_appearance).cumcount() + 1
    # Indicator for the last pitch of each plate appearance
    df['is_last_pitch'] = df.groupby(plate_appearance)['pitch_number'].transform('max') == df['pitch_number']

    # Base state before the pitch: '1' where a runner id is present, else '0'.
    # Built column-wise; a row-wise apply(axis=1) is far slower on pitch-level data.
    occupied = [
        df[base].notna().astype(int).astype(str)
        for base in ('run_on_1b', 'run_on_2b', 'run_on_3b')
    ]
    df['basecode_before'] = occupied[0] + occupied[1] + occupied[2]
    df['basecode_after'] = df.groupby(half_inning)['basecode_before'].shift(-1).fillna('000')

    # Add post-pitch information (outs, base state)
    df['outs_after'] = df.groupby(half_inning)['outs_before'].shift(-1).fillna(3).astype(int)

    df['baseout_state_before'] = df['basecode_before'] + ' ' + df['outs_before'].astype(str)
    df['baseout_state_after'] = df['basecode_after'] + ' ' + df['outs_after'].astype(str)

    # Batting team's score: home_score in the bottom half, away_score in the top;
    # the "after" value is the score shown on the next pitch of the half-inning.
    df['bat_score_after'] = np.where(df['is_bottom'] == 1, df['home_score'], df['away_score'])
    df['bat_score_after'] = df.groupby(half_inning)['bat_score_after'].shift(-1).fillna(df['bat_score_before']).astype(int)

    df['b_str'] = df['balls'].astype(str)
    df['s_str'] = df['strikes'].astype(str)

    df['bs_count_before'] = df['b_str'] + '-' + df['s_str']
    # The count resets to 0-0 once the plate appearance ends.
    next_count = df.groupby(['gameid', 'pa_number'])['bs_count_before'].shift(-1).fillna(df['bs_count_before'])
    df['bs_count_after'] = np.where(df['is_last_pitch'] == 1, '0-0', next_count)

    df['full_state_before'] = df['baseout_state_before'] + ' ' + df['bs_count_before']
    df['full_state_after'] = df['baseout_state_after'] + ' ' + df['bs_count_after']

    return df


def format_re24(df):
    """Add RE24-style run-expectancy and run-value columns.

    Columns added:

    - runs_roi: runs scored in the remainder of the half-inning, i.e. the
      maximum `bat_score_after` of the (gameid, inning, is_bottom) group minus
      `bat_score_before`
    - re24_before / re24_after: mean `runs_roi` of the row's
      `baseout_state_before` / `baseout_state_after` (the end-of-inning state
      "000 3" is defined as 0); NaN when the state was never observed
    - dre_24: re24_after - re24_before
    - rv_24: bat_score_after - bat_score_before + dre_24

    Parameters
    ----------
    df : pd.DataFrame
        Needs gameid, inning, is_bottom, bat_score_before, bat_score_after,
        baseout_state_before and baseout_state_after.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh RangeIndex; the input is not modified and no
        helper lookup columns are left behind.
    """
    # reset_index returns a new frame, so the caller's DataFrame stays untouched;
    # the fresh RangeIndex is the same index a DataFrame.merge lookup would give.
    df = df.reset_index(drop=True)

    # runs_roi = runs (total in inning) - runs (so far in inning)
    inning_final_score = df.groupby(['gameid', 'inning', 'is_bottom'])['bat_score_after'].transform('max')
    df['runs_roi'] = inning_final_score - df['bat_score_before']

    # Run expectancy per base-out state, averaged over every row in df
    run_expectancy = df.groupby('baseout_state_before')['runs_roi'].mean()
    # Three outs never appears as a "before" state; its expectancy is 0 by definition.
    if '000 3' not in run_expectancy.index:
        run_expectancy.loc['000 3'] = 0.0

    # Look the values up with map() instead of merging: a merge leaves `state` /
    # `state_after` key columns in the result, which collide with the columns of
    # a later, identically shaped lookup.
    df['re24_before'] = df['baseout_state_before'].map(run_expectancy)
    df['re24_after'] = df['baseout_state_after'].map(run_expectancy)

    df['dre_24'] = df['re24_after'] - df['re24_before']

    # Delta run value: runs scored on the pitch plus the change in run expectancy
    df['rv_24'] = df['bat_score_after'] - df['bat_score_before'] + df['dre_24']

    return df






def format_re288(df):
    """Add count-aware (RE288-style) run-expectancy and run-value columns.

    Same construction as the base-out version, but keyed on the full state
    strings `full_state_before` / `full_state_after`.

    Columns added (or overwritten):

    - runs_roi: maximum `bat_score_after` of the (gameid, inning, is_bottom)
      group minus `bat_score_before`
    - re288_before / re288_after: mean `runs_roi` of the row's full state
      (the end-of-inning state "000 3 0-0" is defined as 0); NaN when the
      state was never observed as a "before" state
    - dre_288: re288_after - re288_before
    - rv_288: bat_score_after - bat_score_before + dre_288

    Parameters
    ----------
    df : pd.DataFrame
        Needs gameid, inning, is_bottom, bat_score_before, bat_score_after,
        full_state_before and full_state_after.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh RangeIndex; the input is not modified and no
        helper lookup columns are added.
    """
    # reset_index returns a new frame, so the caller's DataFrame stays untouched;
    # the fresh RangeIndex is the same index a DataFrame.merge lookup would give.
    df = df.reset_index(drop=True)

    inning_final_score = df.groupby(['gameid', 'inning', 'is_bottom'])['bat_score_after'].transform('max')
    df['runs_roi'] = inning_final_score - df['bat_score_before']

    # Calculate mean ROI by full_state_before
    run_expectancy = df.groupby('full_state_before')['runs_roi'].mean()
    # Three outs never appears as a "before" state; its expectancy is 0 by definition.
    if '000 3 0-0' not in run_expectancy.index:
        run_expectancy.loc['000 3 0-0'] = 0.0

    # map() keeps the column set clean.  Merging on a `state` key column would
    # add `state` / `state_after` helpers and, when the input already carries
    # columns with those names, produce duplicate column labels.
    df['re288_before'] = df['full_state_before'].map(run_expectancy)
    df['re288_after'] = df['full_state_after'].map(run_expectancy)

    df['dre_288'] = df['re288_after'] - df['re288_before']

    # Delta Run Value Metric (Runs Scored + RE after event - RE before event)
    df['rv_288'] = df['bat_score_after'] - df['bat_score_before'] + df['dre_288']

    return df


# def format_statcast4(df):
#     # Create rel_plate_x and rel_plate_z
#     df['rel_plate_x'] = df['plate_x'] / ((17/2 + 1.456) / 12)
#     df['rel_plate_z'] = (df['plate_z'] - (df['sz_bot'] - 1.456/12)) / (df['sz_top'] - df['sz_bot'] + 1.456*2/12) * 2 - 1
#     # Define attack_region
#     df['attack_region'] = np.where(
#         (abs(df['rel_plate_x']) < 0.67) & (abs(df['rel_plate_z']) < 0.67), 'Heart',
#         np.where((abs(df['rel_plate_x']) < 1.33) & (abs(df['rel_plate_z']) < 1.33), 'Shadow',
#         np.where((abs(df['rel_plate_x']) < 2.00) & (abs(df['rel_plate_z']) < 2.00), 'Chase', 'Waste'))
#     )
#     # Handle missing values
#     df['attack_region'] = np.where(df['plate_x'].isna() | df['plate_z'].isna(), np.nan, df['attack_region'])

#     return df


def df_by_pitch_count(df, count):
    """Keep only the rows of pitchers who appear in more than `count` rows.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a `pitcherid` column.
    count : int
        Exclusive lower bound on the number of rows per pitcher.

    Returns
    -------
    pd.DataFrame
        The rows of `df` whose `pitcherid` occurs strictly more than `count` times.
    """
    rows_per_pitcher = df['pitcherid'].value_counts()
    qualifying_ids = rows_per_pitcher[rows_per_pitcher > count].index
    is_qualified = df['pitcherid'].isin(qualifying_ids)
    return df[is_qualified]


def get_release_angle(df):
    """
    Derive release-point velocity components and release angles.

    The columns 'yR', 'tR', 'vxR', 'vyR', 'vzR', 'HRA', 'VRA', 'HRA_deg' and
    'VRA_deg' are written onto `df` itself (in that order); the returned frame
    is `df` without the rows that have a null in 'pitch_type', 'vxR', 'vyR'
    or 'vzR'.

    'tR' is the root of 0.5*ay*t**2 + vy0*t + (50 - yR) = 0 taken with the
    minus sign, i.e. the time at which the constant-acceleration trajectory
    passes y = yR (presumably vx0/vy0/vz0 are measured at y = 50 ft -- confirm).

    Parameters:
    - df (pd.DataFrame): needs release_extension, vx0, vy0, vz0, ax, ay, az
      and pitch_type.

    Returns:
    - pd.DataFrame: rows of `df` with complete pitch_type and release velocities.
    """
    # Release distance from the plate: 60.5 minus the extension
    release_y = 60.5 - df['release_extension']
    df['yR'] = release_y

    # Time at which the trajectory is at y = yR
    vy0, ay = df['vy0'], df['ay']
    df['tR'] = (-vy0 - np.sqrt(vy0**2 - 2 * ay * (50 - release_y))) / ay

    # Velocity components at that time: v = v0 + a * tR
    for axis in ('x', 'y', 'z'):
        df[f'v{axis}R'] = df[f'v{axis}0'] + df[f'a{axis}'] * df['tR']

    # Horizontal / vertical release angles (radians), both measured against vyR
    for angle, component in (('HRA', 'vxR'), ('VRA', 'vzR')):
        df[angle] = np.arctan2(df['vyR'], df[component])

    # Degrees, shifted by +90
    for angle in ('HRA', 'VRA'):
        df[f'{angle}_deg'] = np.degrees(df[angle]) + 90

    return df.dropna(subset=['pitch_type', 'vxR', 'vyR', 'vzR'])


def get_approach_angle(df):
    """Add vertical and horizontal approach angles (VAA, HAA) in degrees.

    Using constant acceleration from y = 50 down to y = 17/12, the columns
    'vy_f', 't', 'vz_f', 'vx_f', 'VAA' and 'HAA' are written onto `df` itself
    (in that order) and the same frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Needs vx0, vy0, vz0, ax, ay and az.

    Returns
    -------
    pd.DataFrame
        `df`, with the new columns added in place.
    """
    start_y = 50
    final_y = 17/12
    rad_to_deg = 180 / np.pi

    vy0, ay = df['vy0'], df['ay']

    # Final y-velocity from v_f^2 = v_0^2 - 2*a*(y0 - yf); negative root
    df['vy_f'] = -np.sqrt(vy0**2 - (2 * ay * (start_y - final_y)))
    # Time taken to go from y0 to yf
    df['t'] = (df['vy_f'] - vy0) / ay

    # Final z and x velocities: v = v0 + a * t
    for axis in ('z', 'x'):
        df[f'v{axis}_f'] = df[f'v{axis}0'] + (df[f'a{axis}'] * df['t'])

    # Angles of the final velocity against the y direction
    for angle, component in (('VAA', 'vz_f'), ('HAA', 'vx_f')):
        df[angle] = -np.arctan(df[component] / df['vy_f']) * rad_to_deg

    return df


## Scaling into Stuff+ Functions

In [None]:
def stuff_by_pitch_type(df):
    """Scale `rve_288` into a Stuff+-style index, with the baseline taken per pitch type.

    Columns written onto `df` (in place):

    - rve_288_scaled: |rve_288 - max(rve_288 within the row's pitch_type)|
    - rve_288_100: rve_288 * 100
    - stuff_plus: rve_288_scaled divided by the overall mean of
      rve_288_scaled, times 100

    `rve_288` is not created in this file -- presumably a modelled run value
    per pitch; confirm against the producing notebook.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `pitch_type` and `rve_288`.

    Returns
    -------
    pd.DataFrame
        `df`, with the new columns added in place.
    """
    # Built-in 'max' transform instead of a Python lambda per group
    type_max = df.groupby('pitch_type')['rve_288'].transform('max')
    df['rve_288_scaled'] = (df['rve_288'] - type_max).abs()

    # Per-100 value of the same metric (rve_288, matching re288_by_100scale)
    df['rve_288_100'] = df['rve_288'] * 100

    df['stuff_plus'] = (df['rve_288_scaled'] / df['rve_288_scaled'].mean()) * 100
    return df



def re288_by_100scale(df):
    """Scale `rve_288` into a Stuff+-style index against the overall maximum.

    Columns written onto `df` (in place):

    - rve_288_100: rve_288 * 100
    - rve_288_scaled: |rve_288_100 - max(rve_288_100)|
    - stuff_plus: rve_288_scaled divided by its mean, times 100

    Parameters
    ----------
    df : pd.DataFrame
        Needs an `rve_288` column.

    Returns
    -------
    pd.DataFrame
        `df`, with the new columns added in place.
    """
    per_100 = df['rve_288'] * 100
    df['rve_288_100'] = per_100

    distance_from_max = (per_100 - per_100.max()).abs()
    df['rve_288_scaled'] = distance_from_max

    df['stuff_plus'] = (distance_from_max / distance_from_max.mean()) * 100
    return df

## Primary Pitch related Functions

In [None]:
def find_primary_pitch(df):
    """Find each pitcher's most-thrown pitch type against LHB and RHB.

    Parameters
    ----------
    df : pd.DataFrame
        Pitch-level rows with `pitcherid`, `pitch_type` and `is_lhb`
        (1 = left-handed batter, 0 = right-handed batter).

    Returns
    -------
    pd.DataFrame
        Indexed by `pitcherid`, with one column per batter side present in the
        data ('lhb' and/or 'rhb') holding the pitch type with the highest row
        count; NaN where a pitcher never faced that side.  When several pitch
        types tie, the first row of the count table wins.  Input without rows
        for either side yields an empty frame.
    """
    per_side = []

    # Same computation for both batter sides
    for lhb_flag, side in ((1, 'lhb'), (0, 'rhb')):
        facing_side = df['is_lhb'] == lhb_flag
        if not facing_side.any():
            continue

        # Count rows per pitcher and pitch type
        usage = df[facing_side].groupby(['pitcherid', 'pitch_type']).size().reset_index(name='pitch_count')
        # Row label of the most-used pitch type for each pitcher
        top_rows = usage.groupby('pitcherid')['pitch_count'].idxmax()
        # assign() builds a new frame rather than writing onto a .loc slice
        per_side.append(usage.loc[top_rows, ['pitcherid', 'pitch_type']].assign(against=side))

    if not per_side:
        # pd.concat([]) raises ValueError; return an empty table of the same layout.
        return pd.DataFrame(
            index=pd.Index([], name='pitcherid'),
            columns=pd.Index([], name='against'),
        )

    # Pivot to one row per pitcher with a column per batter side
    return pd.concat(per_side).pivot(index='pitcherid', columns='against', values='pitch_type')

In [None]:
def get_diff_from_primary_pitch(df):
    """Compare every pitch with the pitcher's primary pitch against that batter side.

    The primary pitch of a (pitcherid, is_lhb) pair is the pitch type with the
    highest row count.  `pitcher_lhb_key` and `primary_pitch` are written onto
    the input `df`; the returned frame is the result of an inner merge and also
    carries the primary pitch's mean metrics (`release_speed_avg`, `IVB_avg`,
    `HB_avg`, plus the merge key `pitch_type_avg`) and the differences
    `diff_velo`, `diff_HB` and `diff_IVB` (pitch value minus primary-pitch mean).

    Parameters
    ----------
    df : pd.DataFrame
        Needs pitcherid, pitch_type, is_lhb, release_speed, IVB and HB.

    Returns
    -------
    pd.DataFrame
        Merged frame with the difference columns.
    """
    split_keys = ['pitcherid', 'pitch_type', 'is_lhb']

    # Row counts per pitcher, pitch type and batter side
    usage = df.groupby(split_keys).size().reset_index(name='pitch_count')

    # Most-used pitch type for each (pitcher, batter side)
    top_rows = usage.loc[usage.groupby(['pitcherid', 'is_lhb'])['pitch_count'].idxmax()]
    primary_by_key = dict(
        zip(zip(top_rows['pitcherid'], top_rows['is_lhb']), top_rows['pitch_type'])
    )

    # Tag every pitch with its (pitcher, side) key and that key's primary pitch
    df['pitcher_lhb_key'] = list(zip(df['pitcherid'], df['is_lhb']))
    df['primary_pitch'] = df['pitcher_lhb_key'].map(primary_by_key)

    # Mean metrics of every (pitcher, pitch type, side) combination
    averages = df.groupby(split_keys)[['release_speed', 'IVB', 'HB']].mean().reset_index()

    # Attach the primary pitch's means to each pitch
    df = df.merge(
        averages,
        left_on=['pitcherid', 'primary_pitch', 'is_lhb'],
        right_on=['pitcherid', 'pitch_type', 'is_lhb'],
        suffixes=('', '_avg'),
    )

    # Current pitch minus primary-pitch mean
    for diff_column, metric in (('diff_velo', 'release_speed'), ('diff_HB', 'HB'), ('diff_IVB', 'IVB')):
        df[diff_column] = df[metric] - df[f'{metric}_avg']

    return df


def get_diff_from_pitch_type(df):
    """Compare every pitch with the overall mean of its pitch type.

    The means of `release_speed`, `IVB` and `HB` per `pitch_type` are merged
    back (inner merge) as `release_speed_mean`, `IVB_mean` and `HB_mean`, and
    the differences `diff_release_speed`, `diff_IVB` and `diff_HB` (pitch value
    minus pitch-type mean) are added.

    Parameters
    ----------
    df : pd.DataFrame
        Needs pitch_type, release_speed, IVB and HB.

    Returns
    -------
    pd.DataFrame
        The merged frame; the input frame is not modified.
    """
    metrics = ['release_speed', 'IVB', 'HB']

    # Mean of each metric per pitch type
    type_means = df.groupby(['pitch_type'])[metrics].mean().reset_index()

    df = df.merge(type_means, on='pitch_type', suffixes=('', '_mean'))

    # Current pitch minus the mean of its pitch type
    for metric in metrics:
        df[f'diff_{metric}'] = df[metric] - df[f'{metric}_mean']

    return df

# Plot Functions

In [None]:
def plot_pitch_distribution(df, pitcher_id):
    """Show two pie charts of one pitcher's pitch mix: vs RHB and vs LHB.

    Each slice is the sum of `pitch_count` for a pitch type, labelled with its
    percentage and absolute value.

    NOTE(review): this sums the `pitch_count` column rather than counting rows.
    If `df` is pitch-level data in which `pitch_count` is a running pitch index,
    the sums are not pitch totals -- confirm what callers pass in.

    Parameters
    ----------
    df : pd.DataFrame
        Needs pitcherid, pitcher_name, is_lhb, pitch_type and pitch_count.
    pitcher_id
        Value matched against `pitcherid`.  An IndexError is raised by the
        name lookup when no row matches.
    """
    pitcher_rows = df[df['pitcherid'] == pitcher_id]
    name = pitcher_rows['pitcher_name'].iloc[0]

    # (is_lhb flag, chart title) for the left and right panel
    panels = (
        (0, f'{name} - Right-handed Batters (is_lhb=0)'),
        (1, f'{name} - Left-handed Batters (is_lhb=1)'),
    )

    # Summed pitch_count per pitch type for each batter side (computed before
    # any figure is created)
    counts_per_panel = [
        pitcher_rows[pitcher_rows['is_lhb'] == lhb_flag].groupby('pitch_type')['pitch_count'].sum()
        for lhb_flag, _ in panels
    ]

    def slice_label(pct, values):
        # Percentage plus the absolute value it corresponds to
        total = sum(values)
        absolute = int(round(pct * total / 100))
        return "{:.1f}%\n({:d})".format(pct, absolute)

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for ax, (_, title), counts in zip(axes, panels, counts_per_panel):
        ax.pie(
            counts,
            labels=counts.index,
            autopct=lambda pct, values=counts: slice_label(pct, values),
            startangle=90,
        )
        ax.set_title(title)

    plt.tight_layout()
    plt.show()



def plot_hexbin(df, x_axis, y_axis, player_id=None, pitch_type=None, pitch_bucket=None, gridsize=40):
    """Draw hexbin density panels of `y_axis` against `x_axis` for one pitch type or bucket.

    With `player_id`, two panels are drawn for that pitcher ('vs RHB',
    'vs LHB').  Without it, four panels are drawn for every combination of
    pitcher and batter handedness, titled '00', '01', '10', '11' where the
    first digit is `is_lhp` and the second is `is_lhb`.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `x_axis`, `y_axis`, is_lhb, the selected pitch column, and either
        is_lhp (league view) or pitcherid / pitcher_name (player view).
    x_axis, y_axis : str
        Column names to plot.
    player_id : optional
        Value matched against `pitcherid`.
    pitch_type : optional
        Value matched against the `pitch_type` column; takes precedence over
        `pitch_bucket` when both are given.
    pitch_bucket : optional
        Value matched against the `pitch_bucket` column.
    gridsize : int
        Passed to `Axes.hexbin`.

    Raises
    ------
    ValueError
        If neither `pitch_type` nor `pitch_bucket` is given (checked before any
        plotting, with or without `player_id`).
    IndexError
        If `player_id` matches no row (raised by the name lookup).
    """
    # Validate up front so the player view fails the same way as the league view.
    if pitch_type is not None:
        filter_column, label = 'pitch_type', pitch_type
    elif pitch_bucket is not None:
        filter_column, label = 'pitch_bucket', pitch_bucket
    else:
        raise ValueError("You must provide either a pitch_type or pitch_bucket.")

    selected = df[filter_column] == label

    name = None
    if player_id is not None:
        by_player = df['pitcherid'] == player_id
        name = df.loc[by_player, 'pitcher_name'].iloc[0]
        selected = selected & by_player
        panels = [
            ('vs RHB', df['is_lhb'] == 0),
            ('vs LHB', df['is_lhb'] == 1),
        ]
    else:
        # Title digits: is_lhp then is_lhb
        panels = [
            (f'{lhp}{lhb}', (df['is_lhp'] == lhp) & (df['is_lhb'] == lhb))
            for lhp in (0, 1)
            for lhb in (0, 1)
        ]

    cols = len(panels)
    # squeeze=False always yields a 2D axes array, so ravel() is safe.
    fig, axs = plt.subplots(1, cols, figsize=(cols * 7, 6), squeeze=False)

    for ax, (title, panel_filter) in zip(axs.ravel(), panels):
        data = df.loc[selected & panel_filter]
        hb = ax.hexbin(data[x_axis], data[y_axis], gridsize=gridsize, cmap='Blues')

        # Include the pitcher's name in the title when one was looked up
        if name:
            ax.set_title(f'{label}: {name} {title}')
        else:
            ax.set_title(f'{label}: {title}')

        ax.set_xlabel(x_axis)
        ax.set_ylabel(y_axis)
        fig.colorbar(hb, ax=ax, label='Count (# pitches)')

    plt.tight_layout()
    plt.show()



def plot_hexbin_pitch_bucket(datasets, x_axis, y_axis, titles, pitch_bucket):
    """
    Plots a single row of four hexbin panels, one per dataset.

    Datasets and titles are paired in order and drawn left to right; panels
    without a dataset stay empty.

    Parameters:
    - datasets: list of DataFrames to plot (e.g., fastball00, fastball01, etc.).
    - x_axis: name of the column plotted on the x axis.
    - y_axis: name of the column plotted on the y axis.
    - titles: list of titles for each subplot.
    - pitch_bucket: string representing the group of pitches (e.g., 'Fastballs', 'Breaking', 'Offspeed').
    """
    fig, axs = plt.subplots(1, 4, figsize=(30, 6))
    panels = axs.ravel()  # 1D view of the axes

    for position, (frame, heading) in enumerate(zip(datasets, titles)):
        xs = frame[x_axis]
        ys = frame[y_axis]
        panel = panels[position]

        density = panel.hexbin(xs, ys, gridsize=30, cmap='Blues')  # Hexbin plot for each dataset
        panel.set_title(f'{pitch_bucket}: {heading}')
        panel.set_xlabel(x_axis)
        panel.set_ylabel(y_axis)
        fig.colorbar(density, ax=panel, label='Count (pitches)')

    plt.tight_layout()
    plt.show()