pybaseball GitHub repository, with install instructions: https://github.com/jldbc/pybaseball <br>
Documentation for pybaseball library: https://github.com/jldbc/pybaseball/tree/master/docs <br>
Statcast header meanings: https://baseballsavant.mlb.com/csv-docs

plate_x
Horizontal position of the ball when it crosses home plate from the catcher's perspective.

plate_z
Vertical position of the ball when it crosses home plate from the catcher's perspective.

In [1]:
import pybaseball
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
def player(first_name, last_name, end_dt='2021-12-01'):
    """Download batter Statcast data for one player, covering at most ~10 seasons.

    The player is resolved with ``pybaseball.playerid_lookup`` and the first
    matching row is used.  The query starts on January 1st of either the
    player's first MLB season or ten years before the last one, whichever is
    later, and ends on ``end_dt``.

    Parameters
    ----------
    first_name : str
        Player's first name.
    last_name : str
        Player's last name.
    end_dt : str, optional
        Last date of the query as ``YYYY-MM-DD``.  Defaults to ``'2021-12-01'``.

    Returns
    -------
    pandas.DataFrame
        Result of ``pybaseball.statcast_batter`` with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If the lookup returns no rows for the given name.
    ValueError
        If the matched row has no first/last MLB season recorded.
    """
    # NOTE: playerid_lookup takes the last name first.
    lookup = pybaseball.playerid_lookup(last_name, first_name)
    if lookup.empty:
        raise KeyError(
            f"no player found for first_name={first_name!r}, last_name={last_name!r}"
        )

    # Positional access: use the first match regardless of how the frame is indexed.
    debut = lookup['mlb_played_first'].iloc[0]
    final = lookup['mlb_played_last'].iloc[0]
    if pd.isna(debut) or pd.isna(final):
        raise ValueError(
            f"no MLB seasons recorded for first_name={first_name!r}, last_name={last_name!r}"
        )

    # Cap the window at the ten years leading up to the player's last season.
    start_year = max(int(debut), int(final - 10))
    player_id = lookup['key_mlbam'].iloc[0]

    data = pybaseball.statcast_batter(
        start_dt=f"{start_year}-01-01",
        end_dt=end_dt,
        player_id=player_id,
    )
    return data.reset_index(drop=True)

In [3]:
# Batters to download data for, written as "first last".
# The full lineup is kept commented out below; only one batter is active.
# home_team = ['brett gardner', 'clint frazier', 'aaron judge', 'matt holliday', 'didi gregorius',
#              'chase headley', 'todd frazier', 'tyler wade', 'austin romine']
home_team = ['brett gardner']
for x in home_team:
    # Split on whitespace; the first two words are used as first and last name.
    name = x.split()
    print(name)
    # NOTE: `athlete` is rebound on every iteration, so after the loop it holds
    # only the data for the last name in `home_team`.
    athlete = player(name[0],name[1])

['brett', 'gardner']
Gathering player lookup table. This may take a moment.
Gathering Player Data


  result = func(*args, **kwargs)


Naive Bayes predictor for the outcome of each pitch of a matchup.

In [4]:
def predict_pitch(athlete):
    """Fit a Gaussian Naive Bayes model on per-pitch rows and print its accuracy.

    The columns named in ``unused_columns`` are removed, missing values are
    replaced with 0, the row order is reversed, and the base-runner,
    half-inning and pitcher-hand columns are recoded to the strings "0"/"1".
    ``description`` is the label; every other remaining column is a feature.
    Rows are split 50/50 into train and test sets with ``random_state=0``.

    Parameters
    ----------
    athlete : pandas.DataFrame
        Pitch-level frame (presumably the output of ``player`` -- confirm).
        It must contain every column in ``unused_columns`` plus the columns
        recoded below and ``description``.

    Returns
    -------
    None
        The mislabeled count and the accuracy percentage are printed.
    """
    unused_columns = [
        'pitch_type', 'game_date', 'spin_dir', 'spin_rate_deprecated',
        'player_name', 'events', 'zone', 'des',
        'game_type', 'stand', 'home_team', 'away_team', 'type',
        'hit_location', 'bb_type', 'game_year', 'pfx_x',
        'pfx_z', 'plate_x', 'plate_z', 'hc_x', 'hc_y',
        'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
        'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
        'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
        'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
        'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
        'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
        'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
        'woba_value', 'woba_denom', 'babip_value', 'iso_value',
        'launch_speed_angle', 'pitch_name', 'release_pos_x', 'release_pos_z',
        'bat_score', 'fld_score', 'post_away_score',
        'post_home_score', 'post_bat_score', 'post_fld_score',
        'break_angle_deprecated', 'break_length_deprecated', 'release_speed',
        'spin_axis', 'delta_home_win_exp', 'delta_run_exp', 'at_bat_number',
        'if_fielding_alignment', 'of_fielding_alignment',
    ]

    # drop() and fillna() both return new frames, so `athlete` is left untouched.
    pitches = athlete.drop(columns=unused_columns).fillna(0)
    # Flip the row order, then renumber the index from 0.
    pitches = pitches.reindex(index=pitches.index[::-1]).reset_index(drop=True)

    # "0" where the value is below 1 (NaNs were filled with 0 above), else "1".
    for base in ('on_1b', 'on_2b', 'on_3b'):
        pitches[base] = np.where(pitches[base] < 1, "0", "1")
    # 'Top' -> "0", anything else -> "1"; 'R' -> "0", anything else -> "1".
    pitches['inning_topbot'] = np.where(pitches['inning_topbot'] == 'Top', "0", "1")
    pitches['p_throws'] = np.where(pitches['p_throws'] == 'R', "0", "1")

    features = pitches.drop(columns=['description'])
    labels = pitches['description']

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.5, random_state=0
    )
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    n_test = X_test.shape[0]
    n_wrong = (y_test != y_pred).sum()
    print("Number of mislabeled points out of a total %d points : %d" % (n_test, n_wrong))
    # %d truncates the percentage toward zero.
    print("Accuracy : %d" % (((n_test - n_wrong) / n_test) * 100))

In [5]:
# Train and score the per-pitch classifier on the `athlete` frame; results are printed.
predict_pitch(athlete)

Number of mislabeled points out of a total 12344 points : 7811
Accuracy : 36


Trying to predict just the outcome of the matchup (the at-bat), not the outcome of each individual pitch.

In [10]:
def predict_atbat(athlete, user='self'):
    """Fit a Gaussian Naive Bayes model on rows that carry an event and print its accuracy.

    Missing values are replaced with 0, only rows whose ``events`` value is
    not 0 are kept, the columns named in ``unused_columns`` are removed, and
    the base-runner, pitcher-hand and half-inning columns are recoded to the
    strings "0"/"1".  ``events`` is the label; every other remaining column is
    a feature.  Rows are split 50/50 into train and test sets with
    ``random_state=0``.

    Parameters
    ----------
    athlete : pandas.DataFrame
        Pitch-level frame (presumably the output of ``player`` -- confirm).
        It must contain every column in ``unused_columns`` plus the columns
        recoded below and ``events``.
    user : object, optional
        Currently unused.  It was intended as the single situation to predict
        for in a disabled "train on everything" mode (see NOTE in the body).

    Returns
    -------
    None
        The mislabeled count and the accuracy percentage are printed.
    """
    unused_columns = [
        'pitch_type', 'game_date', 'spin_dir', 'balls', 'strikes',
        'spin_rate_deprecated', 'player_name', 'zone', 'des',
        'game_type', 'stand', 'home_team', 'away_team', 'type',
        'game_year', 'pfx_x', 'description', 'bb_type', 'hit_location',
        'pfx_z', 'plate_x', 'plate_z', 'hc_x', 'hc_y',
        'fielder_2', 'umpire', 'sv_id', 'tfs_deprecated', 'tfs_zulu_deprecated',
        'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
        'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
        'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
        'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
        'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
        'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
        'woba_value', 'woba_denom', 'babip_value', 'iso_value',
        'launch_speed_angle', 'pitch_name', 'release_pos_x', 'release_pos_z',
        'bat_score', 'fld_score', 'post_away_score', 'delta_home_win_exp',
        'delta_run_exp',
        'post_home_score', 'post_bat_score', 'post_fld_score',
        'break_angle_deprecated', 'break_length_deprecated', 'release_speed',
        'spin_axis', 'at_bat_number', 'if_fielding_alignment',
        'of_fielding_alignment',
    ]

    # fillna() returns a new frame, so `athlete` is left untouched.
    filled = athlete.fillna(0)
    # After the fill, rows without an event have events == 0; keep the rest.
    at_bats = filled.loc[filled['events'] != 0].reset_index(drop=True)
    at_bats = at_bats.drop(columns=unused_columns)

    # "0" where the value is below 1 (NaNs were filled with 0 above), else "1".
    for base in ('on_1b', 'on_2b', 'on_3b'):
        at_bats[base] = np.where(at_bats[base] < 1, "0", "1")
    # 'R' -> "0", anything else -> "1"; 'Top' -> "0", anything else -> "1".
    at_bats['p_throws'] = np.where(at_bats['p_throws'] == 'R', "0", "1")
    at_bats['inning_topbot'] = np.where(at_bats['inning_topbot'] == 'Top', "0", "1")

    features = at_bats.drop(columns=['events'])
    labels = at_bats['events']

    # NOTE: a disabled variant trained on all of features/labels, predicted for
    # `user`, and returned the prediction.  Only the 50/50 evaluation is active.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.5, random_state=0
    )
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    n_test = X_test.shape[0]
    n_wrong = (y_test != y_pred).sum()
    print("Number of mislabeled points out of a total %d points : %d" % (n_test, n_wrong))
    # %d truncates the percentage toward zero.
    print("Accuracy : %d" % (((n_test - n_wrong) / n_test) * 100))

In [15]:
# NOTE(review): the second argument (10) binds to `user`, which predict_atbat never reads.
predict_atbat(athlete,10)

Number of mislabeled points out of a total 2897 points : 1739
Accuracy : 39


In [8]:
# import random

# results = []
# for x in range(100):
#     inning = random.randint(1,9)
#     third_base = random.randint(0,1)
#     second_base = random.randint(0,1)
#     first_base = random.randint(0,1)
#     outs = random.randint(0,2)
#     home_score = random.randint(0,4)
#     away_score = random.randint(0,4)
#     pitch_num = random.randint(0,6)
#     top = random.randint(0,1)

#     #     ,'home_score':[home_score],'away_score':[away_score]

#     data = {'batter' : ['458731'],'pitcher':['592332'], 'p_throws':['0'],
#             'on_3b' :[third_base], 'on_2b':[second_base], 'on_1b':[first_base],
#             'outs_when_up':[outs],'inning' :[inning], 'inning_topbot' : [top],
#             'pitch_number':[pitch_num], 'home_score' : [home_score], 'away_score': [away_score]}

#     user = pd.DataFrame(data)
#     results.append(predict_atbat(athlete,user)[0])
# print(Counter(results))