pybaseball github with install instructions: https://github.com/jldbc/pybaseball <br>
Documentation for pybaseball library: https://github.com/jldbc/pybaseball/tree/master/docs <br>
Statcast header meanings: https://baseballsavant.mlb.com/csv-docs

plate_x
Horizontal position of the ball when it crosses home plate from the catcher's perspective.

plate_z
Vertical position of the ball when it crosses home plate from the catcher's perspective.

In [1]:
import pybaseball
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
def player(first_name, last_name):  
    player_info = pybaseball.playerid_lookup(last_name, first_name)
    if (player_info['mlb_played_last'][0] - player_info['mlb_played_first'][0]) < 10:
        start_year = int(player_info['mlb_played_first'][0])
    else:
        start_year = int(player_info['mlb_played_last'][0] - 10)
        
    player_id = player_info['key_mlbam'][0]
    player_info = [player_id,start_year,str(player_info['mlb_played_last'][0])]
    
    data = pybaseball.statcast_batter(start_dt = str(player_info[1]) + '-01-01', end_dt = '2021-12-01', player_id = player_info[0])
    data = data.reset_index(drop = True)
    return data

In [3]:
def ball_and_strike(game_data):
    balls = pd.DataFrame(columns = ['pitch_type', 'description'])
    strikes = pd.DataFrame(columns = ['pitch_type', 'description'])
    live = pd.DataFrame(columns = ['pitch_type', 'description'])
    k = 0
    for x in range(len(game_data)):
        if game_data['description'][x] == 'ball':
            details = game_data[['pitcher', 'pitch_type','description', 'plate_x', 'plate_z']].iloc[[x]]
            balls = balls.append(details)
        elif game_data['description'][x] == 'hit_into_play':
            details = game_data[['pitcher', 'pitch_type','description', 'plate_x', 'plate_z']].iloc[[x]]
            live = live.append(details)
        else:
            details = game_data[['pitcher', 'pitch_type','description', 'plate_x', 'plate_z']].iloc[[x]]
            strikes = strikes.append(details)
    return balls, strikes, live

In [4]:
# home_team = ['brett gardner', 'clint frazier', 'aaron judge', 'matt holliday', 'didi gregorius',
#              'chase headley', 'todd frazier', 'tyler wade', 'austin romine']
home_team = ['brett gardner']

In [5]:
for x in home_team:
    name = x.split()
    print(name)
    athlete = player(name[0],name[1])

['brett', 'gardner']
Gathering player lookup table. This may take a moment.
Gathering Player Data


  result = func(*args, **kwargs)


In [7]:
pybaseball.playerid_reverse_lookup([592332], key_type='mlbam')

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,gausman,kevin,592332,gausk001,gausmke01,14107,2013.0,2021.0


Naive Bayes Predictor for each pitch of a match up.

In [9]:
def predict_pitch(athlete)
    drop = ['pitch_type','game_date', 'spin_dir', 'spin_rate_deprecated', 'player_name', 'events', 'zone', 'des',
       'game_type', 'stand',  'home_team', 'away_team', 'type',
       'hit_location', 'bb_type',  'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z',  'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
        'launch_speed_angle',  'pitch_name','release_pos_x', 'release_pos_z',
        'bat_score', 'fld_score', 'post_away_score',
        'post_home_score', 'post_bat_score', 'post_fld_score',
        'break_angle_deprecated', 'break_length_deprecated','release_speed',
        'spin_axis','delta_home_win_exp', 'delta_run_exp','at_bat_number', 'if_fielding_alignment', 'of_fielding_alignment']
    
    example = athlete.copy()
    example = example.drop(columns = drop)
    example = example.fillna(0)
    example = example.reindex(index=example.index[::-1])
    example = example.reset_index(drop=True)

    test = example.copy()
    test['on_1b'] = np.where(test['on_1b'] < 1, "0", "1")
    test['on_2b'] = np.where(test['on_2b'] < 1, "0", "1")
    test['on_3b'] = np.where(test['on_3b'] < 1, "0", "1")
    test['inning_topbot'] = np.where(test['inning_topbot'] == 'Top', "0", "1")
    test['p_throws'] = np.where(test['p_throws'] == 'R', "0", "1")
    test_X = test.drop(columns = ['description'])
    test_Y = test['description']

    replace_val = {'ball': 0,
             'called_strike': 1,
             'foul': 1,
             'hit_into_play': 2,
             'swinging_strike': 1,
             'swinging_strike_blocked': 1,
             'blocked_ball': 0,
             'foul_bunt': 1,
             'foul_tip': 1,
             'hit_by_pitch': 0,
             'missed_bunt': 1,
             'pitchout': 0,
             'intent_ball': 0,
             'bunt_foul_tip': 1}
    test_Y = test_Y.replace(replace_val)

    X_train, X_test, y_train, y_test = train_test_split(test_X, test_Y, test_size=0.5, random_state=0)
    gnb = GaussianNB()
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

In [10]:
predict_pitch(athlete)

Trying to just predict outcome of match up not what the outcome of each pitch will be.

In [13]:
def predict_atbat(athlete):
    drop2 = ['inning_topbot', 'balls', 'strikes', 'pitch_number', 'pitch_type','game_date', 'spin_dir', 
        'spin_rate_deprecated', 'player_name', 'zone', 'des',
       'game_type', 'stand',  'home_team', 'away_team', 'type',
       'hit_location', 'bb_type',  'game_year', 'pfx_x', 'description',
       'pfx_z', 'plate_x', 'plate_z',  'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
       'woba_value', 'woba_denom', 'babip_value', 'iso_value',
        'launch_speed_angle',  'pitch_name','release_pos_x', 'release_pos_z',
        'bat_score', 'fld_score', 'post_away_score',
        'post_home_score', 'post_bat_score', 'post_fld_score',
        'break_angle_deprecated', 'break_length_deprecated','release_speed',
        'spin_axis','delta_home_win_exp', 'delta_run_exp','at_bat_number', 'if_fielding_alignment', 'of_fielding_alignment']
    
    events_change = athlete.copy()
    events_change['events'] = events_change['events'].fillna(0)
    outcomes = events_change.loc[events_change['events'] != 0]
    outcomes = outcomes.reset_index(drop = True)
    outcomes = outcomes.drop(columns = drop2)
    outcomes['on_1b'] = np.where(outcomes['on_1b'] < 1, "0", "1")
    outcomes['on_2b'] = np.where(outcomes['on_2b'] < 1, "0", "1")
    outcomes['on_3b'] = np.where(outcomes['on_3b'] < 1, "0", "1")
    outcomes['p_throws'] = np.where(outcomes['p_throws'] == 'R', "0", "1")
    
    change_events = {'field_out': 0,
                 'single': 1,'strikeout': 0,'force_out': 0,'double': 1,
                 'walk': 1,'sac_bunt': 0,'field_error': 1,'triple': 1,
                 'hit_by_pitch': 1,'caught_stealing_2b': 0,'grounded_into_double_play': 0,
                 'sac_fly': 0,'double_play': 0,'home_run': 1,'fielders_choice': 1,
                 'strikeout_double_play': 0,'fielders_choice_out': 0,
                 'intent_walk': 1,'game_advisory': 1,'sac_fly_double_play': 0,
                 'other_out': 0}
    
    outcomes['events'] = outcomes['events'].replace(change_events)
    outcomes_X = outcomes.copy()
    outcomes_X = outcomes_X.drop(columns = ['events'])
    outcomes_Y = outcomes['events']
    
    X_train, X_test, y_train, y_test = train_test_split(outcomes_X, outcomes_Y, test_size=0.5, random_state=0)
    gnb = GaussianNB()
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
    print("Accuracy : %d"  %(((X_test.shape[0] - (y_test != y_pred).sum()) / X_test.shape[0] )*100))

In [14]:
predict_atbat(athlete)

Number of mislabeled points out of a total 2897 points : 1006
Accuracy : 65
