In [9]:
!pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.5.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl.metadata (8.7 kB)
Collecting Deprecated (from pygithub>=1.51->pybaseball)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
Downloading PyGithub-2.5.0-py3-none-any.whl (375 kB)
Downloading PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl (349 kB)
Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: Deprecated, pynacl, pygithub, pybaseball
Successfully installed Deprecated-1.2.14 pybaseball-2.2.7 pygithub-2.5.0 pynacl-1.5.0


In [36]:
import os 
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [64]:
START = 2002
END = 2022

In [66]:
batting = batting_stats(START, END, qual=200) # qual is how many min plate appearances we want a batter to have

In [67]:
batting.to_csv("/Users/dhruth/Desktop/WAR_prediction_project/batting.csv")

In [68]:
# remove players that we only have 1 season of data for
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)
                                                      # the filter keeps groups where we have at least 2 seasons of data for the player

In [69]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
6535,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0.0,,0,0.147,0.258,,,,-2.6
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


## Creating a target

In [75]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

  batting = batting.groupby("IDfg", group_keys=False).apply(next_season)


In [79]:
# let's explore what the shift is doing
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5562,Alfredo Amezaga,2006,1.1,2.0
5006,Alfredo Amezaga,2007,2.0,1.2
5252,Alfredo Amezaga,2008,1.2,
1169,Garret Anderson,2002,3.7,5.1
864,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
6002,Owen Miller,2022,0.7,
4881,Andrew Vaughn,2021,-0.2,-0.5
3377,Andrew Vaughn,2022,-0.5,
6620,Ha-seong Kim,2021,0.4,3.6


## Cleaning the data

In [84]:
# getting rid of null values
null_count = batting.isnull().sum()

In [86]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6754
xSLG        6754
xwOBA       6754
L-WAR          0
Next_WAR    1179
Length: 321, dtype: int64

In [88]:
# we are only selecting the columns with null count 0
complete_cols = list(batting.columns[null_count == 0])

In [90]:
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'UBR',
 'Age Rng',
 'Off',
 'Lg',
 'wGDP',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+',
 'BB%+',
 'K

In [100]:
# here we are taking our abtting df and selecting all of the complete columns in the df PLUS our Next_WAR
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [102]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
5562,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
5006,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
5252,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
1169,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,3.7,5.1
864,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,5.1,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,24655,2022,Owen Miller,CLE,25,130,424,472,103,70,...,111,97,131,100,83,340,0.188,0.266,0.7,
4881,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,104,116,84,99,110,321,0.185,0.285,-0.4,-0.5
3377,26197,2022,Andrew Vaughn,CHW,24,134,510,555,138,92,...,106,111,94,100,104,419,0.201,0.291,-0.5,
6620,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,99,59,137,96,88,201,0.216,0.303,0.5,3.6


In [104]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 133, dtype: object

In [106]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [108]:
batting.Dol

5562      $5.5
5006     $11.2
5252      $7.2
1169     $14.6
864      $22.0
         ...  
6002      $5.7
4881    ($1.6)
3377    ($4.0)
6620      $3.0
4396     $29.2
Name: Dol, Length: 6754, dtype: object

In [110]:
del batting["Dol"]

In [112]:
batting["Age Rng"]

5562    28 - 28
5006    29 - 29
5252    30 - 30
1169    30 - 30
864     31 - 31
         ...   
6002    25 - 25
4881    23 - 23
3377    24 - 24
6620    25 - 25
4396    26 - 26
Name: Age Rng, Length: 6754, dtype: object

In [114]:
del batting["Age Rng"]

In [283]:
# creaating a column to convert team names into numbers we can use in our model
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [285]:
batting_full = batting.copy()
batting = batting.dropna().copy()

## Useful feature selection

In [288]:
# let's run a feature selector that can pick a subset of features
# this helps a model optimize its accuracy and reduces overfitting or multicolinearity
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# higher alpha reduces overfitting as it penalizes the rr coefficients
# lower is closer to a pure regular linear regeression
rr = Ridge(alpha = 1)

# splits our data up into 3 parts and make predictions for those parts
split = TimeSeriesSplit(n_splits = 3)

# a forward direction means it's going to start at 0 features and keep going until it finds the best one,
# then repeat the process until it has 20
sfs = SequentialFeatureSelector(rr, n_features_to_select = 20, direction = "forward", cv = split, n_jobs = 4)

In [290]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]

# this is saying take all the columns in batting, then pick all of the columns that are NOT in the list of our removed columns 
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [292]:
# let's scale our data to get a mean of 0 and a s.d of 1 in order for the model to work effectively 
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])

  batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])


In [294]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code,player_season,war_corr,war_diff,war_season
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,0.103459,...,0.172991,0.498932,0.545898,0.32204,1.79313,0.474128,0.171729,0.519349,0.040936,0.324167
std,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,0.105891,...,0.273858,0.13718,0.120701,0.122153,1.981035,0.305105,0.161822,0.301989,0.023514,0.121909
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-3.1,0.0,0.0,0.0,0.0,0.0
25%,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,0.043478,...,0.0,0.408511,0.46696,0.234177,0.35,0.205882,0.055556,0.320207,0.03125,0.236615
50%,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,0.086957,...,0.0,0.493617,0.546256,0.303797,1.5,0.470588,0.111111,0.5,0.038462,0.304873
75%,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,0.130435,...,0.346411,0.591489,0.625551,0.392405,2.9,0.735294,0.277778,0.747458,0.04549,0.392408
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,11.9,1.0,1.0,1.0,1.0,1.0


In [296]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [298]:
# finding the features that were selected
predictors = list(selected_columns[sfs.get_support()])
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BABIP',
 'Spd',
 'PH',
 'CB%',
 'CH%',
 'Swing%',
 'wGDP',
 'ISO+',
 'LD+%',
 'Oppo%+',
 'Soft%+',
 'Hard%+',
 'war_corr',
 'war_diff',
 'war_season']

## Making predictions

In [301]:
# geenrates our predictions for us
def backtest(data, model, predictors, start = 5, step = 1):
    all_predictions = [] # each element in this list is going to be the predictions for a single season
    years = sorted(batting["Season"].unique())

    for i in range(start, len(years), step): # each time through this loop, we are going to use historizal data to predict a single season
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [303]:
predictions = backtest(batting, rr, predictors)

In [305]:
predictions

Unnamed: 0,actual,prediction
5006,1.2,1.317902
1925,1.4,0.912777
3102,-0.1,0.688263
5797,0.6,1.080132
1109,4.8,2.447246
...,...,...
1914,2.2,2.578483
5875,0.8,2.040255
7032,0.7,1.531220
4881,-0.5,1.663971


In [307]:
# hard to tell if the algorithm is good, so let's use a summary statistic to create an error metric
from sklearn.metrics import mean_squared_error # this gives us a single number that tells us how high the error is in our model

mean_squared_error(predictions["actual"], predictions["prediction"])

2.6892660150410927

In [309]:
batting["Next_WAR"].describe()

count    5575.000000
mean        1.793130
std         1.981035
min        -3.100000
25%         0.350000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [311]:
# we like the sqrt of MSE to be lower than the s.d, indicates that the model is doing something better than randomly guessing
2.7363675228708013 ** 0.5

1.6541969419844789

## Let's improve prediction and accuracy

In [314]:
ga = batting[batting["IDfg"] == 2].copy()

In [316]:
ga["player_season"] = range(0, ga.shape[0]) # .shape[0] returns number of rows in the array/df 

In [318]:
ga[["player_season", "WAR"]].expanding().corr() # expanding creates different groups of the df

Unnamed: 0,Unnamed: 1,player_season,WAR
1169,player_season,,
1169,WAR,,
864,player_season,1.0,1.0
864,WAR,1.0,1.0
2569,player_season,1.0,-0.661143
2569,WAR,-0.661143,1.0
4187,player_season,1.0,-0.836562
4187,WAR,-0.836562,1.0
3964,player_season,1.0,-0.836312
3964,WAR,-0.836312,1.0


In [320]:
list(ga[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])

[nan,
 1.0,
 -0.6611430912519526,
 -0.8365619976685158,
 -0.8363121929961227,
 -0.6921918007562199,
 -0.5950132649769159]

In [None]:
# giving the algorithm some information on how the player did previously can help it make better predictions

def player_history(df):
    df = df.sort_values("Season")

    df["player_season"] = range(0, df.shape[0]) # indicates which season it is for the player
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    df["war_corr"].fillna(0, inplace=True)

    df["war_diff"] = df["WAR"] / df["WAR"].shift(1) # difference between current WAR and prev season's WAR
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [326]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [328]:
batting["war_season"] = batting.groupby("Season", group_keys = False).apply(group_averages)

  batting["war_season"] = batting.groupby("Season", group_keys = False).apply(group_averages)


In [333]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [337]:
predictions = backtest(batting, rr, new_predictors)

In [339]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.683291500773334

In [344]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.610547
BABIP           -1.950268
Soft%+          -1.244190
ISO+            -1.170234
BU              -1.023677
PH              -0.785004
SO              -0.672380
wGDP            -0.341390
CB%             -0.325741
Swing%          -0.304232
LD+%            -0.277597
CH%             -0.223799
war_diff        -0.142285
war_diff        -0.142285
war_corr        -0.067776
war_corr        -0.067776
player_season    0.003110
Oppo%+           0.707870
Spd              0.788975
SB               1.083423
war_season       1.291036
war_season       1.291036
IBB              2.088754
Hard%+           2.372181
dtype: float64

In [346]:
diff = predictions["actual"] - predictions["prediction"]

In [348]:
merged = predictions.merge(batting, left_index=True, right_index=True)


In [350]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()


In [354]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"]).head(20)


Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
3352,746,2009,A.J. Pierzynski,0.341772,1.4,0.000539
1631,2396,2014,Carlos Santana,0.341772,2.1,0.001419
245,16376,2020,Michael Conforto,0.310127,1.5,0.001743
340,14162,2017,Carlos Correa,0.506329,3.7,0.001765
5459,6012,2019,Didi Gregorius,0.164557,0.7,0.001843
6814,4062,2018,Dexter Fowler,0.094937,0.5,0.001928
264,20043,2021,Luis Robert Jr.,0.417722,2.2,0.002284
2349,12564,2021,Trevor Story,0.335443,2.5,0.002661
1288,11368,2019,Yasmani Grandal,0.556962,3.6,0.003083
5715,13768,2016,Travis Jankowski,0.259494,1.4,0.003731
