pybaseball github with install instructions: https://github.com/jldbc/pybaseball <br>
Documentation for pybaseball library: https://github.com/jldbc/pybaseball/tree/master/docs <br>
Statcast header meanings: https://baseballsavant.mlb.com/csv-docs

plate_x
Horizontal position of the ball when it crosses home plate from the catcher's perspective.

plate_z
Vertical position of the ball when it crosses home plate from the catcher's perspective.

In [1]:
import pybaseball
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
def player(first_name, last_name, end_dt='2021-12-01'):
    """Download Statcast batting data for one player.

    The query window starts on January 1 of the player's first MLB season
    when the career spans fewer than ten years; otherwise it starts ten
    years before the last season played. It always runs through ``end_dt``.
    When the lookup returns several rows, the first one is used.

    Args:
        first_name: Player's first name, passed to
            ``pybaseball.playerid_lookup``.
        last_name: Player's last name, passed to
            ``pybaseball.playerid_lookup``.
        end_dt: Last date (``YYYY-MM-DD``) to request. Defaults to
            ``'2021-12-01'``, the value that used to be hard-coded.

    Returns:
        The DataFrame returned by ``pybaseball.statcast_batter`` with its
        index reset to 0..n-1.

    Raises:
        ValueError: If the lookup returns no rows, or the matched row has no
            first/last MLB season recorded.
    """
    lookup = pybaseball.playerid_lookup(last_name, first_name)
    if lookup.empty:
        raise ValueError(
            'No player found for {} {}'.format(first_name, last_name)
        )

    # Column-wise .iloc[0] keeps each value's native dtype.
    first_season = lookup['mlb_played_first'].iloc[0]
    last_season = lookup['mlb_played_last'].iloc[0]
    if pd.isna(first_season) or pd.isna(last_season):
        raise ValueError(
            'No MLB seasons recorded for {} {}'.format(first_name, last_name)
        )

    # Cap the history at roughly the ten most recent seasons.
    if (last_season - first_season) < 10:
        start_year = int(first_season)
    else:
        start_year = int(last_season - 10)

    player_id = lookup['key_mlbam'].iloc[0]

    data = pybaseball.statcast_batter(
        start_dt=str(start_year) + '-01-01',
        end_dt=end_dt,
        player_id=player_id,
    )
    return data.reset_index(drop=True)

In [3]:
# Alternative full roster, currently disabled:
# home_team = ['brett gardner', 'clint frazier', 'aaron judge', 'matt holliday', 'didi gregorius',
#              'chase headley', 'todd frazier', 'tyler wade', 'austin romine']
home_team = ['brett gardner']

# Collect one Statcast frame per player and join them once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat is used instead.
player_frames = []
for x in home_team:
    name = x.split()
    print(name)
    player_frames.append(player(name[0], name[1]))

# pd.concat raises on an empty list; keep the old "empty roster -> empty frame" result.
athlete = pd.concat(player_frames) if player_frames else pd.DataFrame()

['brett', 'gardner']
Gathering player lookup table. This may take a moment.
Gathering Player Data


  result = func(*args, **kwargs)


Trying to predict only the outcome of the matchup (the at-bat), not the outcome of each individual pitch.

In [4]:
def predict_atbat(athlete):
    """Fit a Gaussian naive Bayes model on at-bat outcomes and print its accuracy.

    Rows whose ``events`` value is missing are discarded, the columns listed
    in ``unused_columns`` are removed, and the base-runner / pitcher-hand /
    half-inning columns are recoded to the strings "0" and "1". One extra
    column per distinct event is then added, holding that event's share (in
    percent) of all remaining rows. Half of the rows are used for training
    and half for testing (``random_state=0``).

    Args:
        athlete: DataFrame of Statcast rows. It must contain ``events``,
            ``on_1b``, ``on_2b``, ``on_3b``, ``p_throws``, ``inning_topbot``
            and every column named in ``unused_columns``.

    Returns:
        None. The mislabeled count and the truncated integer accuracy
        percentage are printed.
    """
    unused_columns = [
        'batter', 'pitch_type', 'game_date', 'spin_dir', 'balls', 'strikes',
        'spin_rate_deprecated', 'player_name', 'zone', 'des',
        'game_type', 'stand', 'home_team', 'away_team', 'type',
        'game_year', 'pfx_x', 'description', 'bb_type', 'hit_location',
        'pfx_z', 'plate_x', 'plate_z', 'hc_x', 'hc_y',
        'fielder_2', 'umpire', 'sv_id', 'tfs_deprecated', 'tfs_zulu_deprecated',
        'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
        'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
        'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
        'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
        'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
        'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
        'woba_value', 'woba_denom', 'babip_value', 'iso_value',
        'launch_speed_angle', 'pitch_name', 'release_pos_x', 'release_pos_z',
        'bat_score', 'fld_score', 'post_away_score', 'delta_home_win_exp', 'delta_run_exp',
        'post_home_score', 'post_bat_score', 'post_fld_score',
        'break_angle_deprecated', 'break_length_deprecated', 'release_speed',
        'spin_axis', 'at_bat_number', 'if_fielding_alignment', 'of_fielding_alignment',
    ]

    # Missing values become 0, so a 0 in 'events' marks a row with no event.
    filled = athlete.fillna(0)
    outcomes = (
        filled[filled['events'] != 0]
        .reset_index(drop=True)
        .drop(columns=unused_columns)
    )

    # Base occupancy: "0" when the (filled) value is below 1, otherwise "1".
    for base in ('on_1b', 'on_2b', 'on_3b'):
        outcomes[base] = np.where(outcomes[base] < 1, "0", "1")
    outcomes['p_throws'] = np.where(outcomes['p_throws'] == 'R', "0", "1")
    outcomes['inning_topbot'] = np.where(outcomes['inning_topbot'] == 'Top', "0", "1")

    # Add one constant column per event holding its overall percentage.
    event_counts = Counter(outcomes['events'])
    total = len(outcomes['events'])
    for event_name, count in event_counts.items():
        outcomes[event_name] = (count / total) * 100

    outcomes_X = outcomes.drop(columns=['events'])
    outcomes_Y = outcomes['events']

    X_train, X_test, y_train, y_test = train_test_split(
        outcomes_X, outcomes_Y, test_size=0.5, random_state=0
    )
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    n_test = X_test.shape[0]
    n_wrong = (y_test != y_pred).sum()
    print("Number of mislabeled points out of a total %d points : %d" % (n_test, n_wrong))
    print("Accuracy : %d" % (((n_test - n_wrong) / n_test) * 100))

In [5]:
# Fit and evaluate the at-bat outcome classifier on the gathered Statcast rows; results are printed.
predict_atbat(athlete)

Number of mislabeled points out of a total 2897 points : 1731
Accuracy : 40


Trying predictions through probabilities

Prob. Hit happens for Brett Gardner vs. David Price

In [6]:
# Batting average allowed by a specific pitcher across all batters faced.
# In this case David Price (player_id 456034), 2018 season.
pitcher = pybaseball.statcast_pitcher(start_dt='2018-01-01', end_dt='2018-12-31', player_id=456034)

# Keep only rows that ended a plate appearance (non-missing 'events'),
# then drop walks and hit-by-pitches, which do not count as at-bats.
pitcher['events'] = pitcher['events'].fillna(0)
pitcher = pitcher.loc[~pitcher['events'].isin([0, 'walk', 'hit_by_pitch'])]
# NOTE(review): every other event value is counted as an at-bat here. If the
# Statcast 'events' column also carries non-at-bat outcomes (e.g. sacrifices),
# AB is overstated -- confirm against the Statcast CSV docs.
at_bats = Counter(pitcher['events'])
at_bat_keys = at_bats.keys()

hit_events = ('single', 'double', 'triple', 'home_run')
# Counter returns 0 for events that never occurred.
hits = sum(at_bats[event] for event in hit_events)
AB = sum(at_bats.values())

league_pitcher_BA = np.round(hits / AB, 3)

# League batting average: unweighted mean of the individual averages of
# players who appeared in at least 41 games (about a quarter of the season).
league_data = pybaseball.batting_stats_bref(2018)
valid_players = league_data.loc[league_data['G'] >= 41]
league_BA = np.mean(valid_players['BA'])

# Specific player's batting average.
# In this case Brett Gardner.
# Stored as `player_row`, not `player`, so the `player()` helper defined
# earlier in the notebook is not overwritten by this cell.
player_row = league_data.loc[league_data['Name'] == 'Brett Gardner']
if player_row.empty:
    raise ValueError("'Brett Gardner' not found in the 2018 batting table")

# Rounding the batting averages to the standard accepted 3 decimal places
player_BA = np.round(player_row['BA'].iloc[0], 3)
league_BA = np.round(league_BA, 3)

Gathering Player Data


  table = table.drop('', 1)


Calculate the prob. a hit happens

In [7]:
# Matchup probability of a hit: the batter's and pitcher's averages are each
# scaled by the league average, then normalized against the matching
# "no hit" term so the result lies between 0 and 1.
(
    (player_BA * league_pitcher_BA / league_BA)
    / (
        player_BA * league_pitcher_BA / league_BA
        + (1 - player_BA) * (1 - league_pitcher_BA) / (1 - league_BA)
    )
)

0.21463972773876708

Made it through equation #2 in: <br> https://sabr.org/journal/article/matchup-probabilities-in-major-league-baseball/