In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [None]:
# ! DO NOT NEED TO RUN
START = 2002
END = 2022

batting = batting_stats(START, END, qual=200)
batting.to_csv("batting.csv")

In [None]:
# ! RUN FROM HERE
# above is getting the batting, but it is already in a csv

batting = pd.read_csv('batting.csv')

In [3]:
grouped_batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [4]:
grouped_batting

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,...,,,,0,0.127,0.191,,,,12.7
1,1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,...,,,,0,0.124,0.164,,,,11.9
2,8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.4
3,2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,...,,,,0,0.135,0.223,,,,10.2
4,15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7109,7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
7110,6535,45,2012,Rod Barajas,PIT,36,104,321,361,66,...,,0.0,,0,0.147,0.258,,,,-2.6
7111,6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,...,,0.0,,0,0.169,0.295,,,,-2.9
7112,6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,...,,,,0,0.130,0.187,,,,-2.9


In [5]:

# move next season war to row in prev season to use as labels for ML
def next_season(player):
	player = player.sort_values("Season")
	player["Next_WAR"] = player["WAR"].shift(-1)
	
	return player

batting_with_next_war = grouped_batting.groupby("IDfg", group_keys=False).apply(next_season)

  batting_with_next_war = grouped_batting.groupby("IDfg", group_keys=False).apply(next_season)


In [6]:
batting_with_next_war[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
3946,Alfredo Amezaga,2006,1.1,2.0
2607,Alfredo Amezaga,2007,2.0,1.2
3787,Alfredo Amezaga,2008,1.2,
1020,Garret Anderson,2002,3.7,5.1
424,Garret Anderson,2003,5.1,0.8
...,...,...,...,...
4667,Owen Miller,2022,0.7,
6108,Andrew Vaughn,2021,-0.2,-0.5
6450,Andrew Vaughn,2022,-0.5,
5238,Ha-seong Kim,2021,0.4,3.6


In [7]:
null_count = batting_with_next_war.isnull().sum()
null_count, batting_with_next_war.columns

(Unnamed: 0       0
 IDfg             0
 Season           0
 Name             0
 Team             0
               ... 
 xBA           6754
 xSLG          6754
 xwOBA         6754
 L-WAR            0
 Next_WAR      1179
 Length: 322, dtype: int64,
 Index(['Unnamed: 0', 'IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA',
        'H',
        ...
        'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA', 'xSLG',
        'xwOBA', 'L-WAR', 'Next_WAR'],
       dtype='object', length=322))

In [8]:
complete_cols = list(batting_with_next_war.columns[null_count == 0])
complete_cols

['Unnamed: 0',
 'IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'UBR',
 'Age Rng',
 'Off',
 'Lg',
 'wGDP',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+

In [9]:
batting = batting_with_next_war[complete_cols + ["Next_WAR"]].copy()
batting.dtypes

Unnamed: 0      int64
IDfg            int64
Season          int64
Name           object
Team           object
               ...   
Events          int64
CStr%         float64
CSW%          float64
L-WAR         float64
Next_WAR      float64
Length: 134, dtype: object

In [10]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [11]:
del batting["Dol"]

In [12]:
batting["Age Rng"]

3946    28 - 28
2607    29 - 29
3787    30 - 30
1020    30 - 30
424     31 - 31
         ...   
4667    25 - 25
6108    23 - 23
6450    24 - 24
5238    25 - 25
1128    26 - 26
Name: Age Rng, Length: 6754, dtype: object

In [13]:
del batting["Age Rng"]

In [14]:
batting["team_codes"] = batting["Team"].astype("category").cat.codes

In [15]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [16]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split,n_jobs=4)

In [17]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])
  batting.loc[:, selected_columns] = scaler.fit_transform(batting

In [25]:
batting.describe()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_codes
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,0.451542,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,...,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,0.322041,1.792969,0.474128
std,0.27945,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,...,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,0.122148,1.980831,0.305105
min,0.0,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.1,0.0
25%,0.209265,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.234177,0.4,0.205882
50%,0.431885,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.303797,1.5,0.470588
75%,0.681358,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,...,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,0.392405,2.9,0.735294
max,1.0,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.9,1.0


In [20]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [21]:
predictors = list(selected_columns[sfs.get_support()])

In [None]:
def backtest(data, model, predictors, start=5,step=1):
	all_predictions = [] # predictions in a single season ???

	years = sorted(data["Season"].unique())
	
	for i in range(start, len(years), step):
		current_year = years[i]
		
		train = data[data["Season"] < current_year]
		test = data[data["Season"] == current_year]

		model.fit(train[predictors], train["Next_WAR"])

		preds = model.predict(test[predictors])
		preds = pd.Series(preds, index=test.index)
		combined = pd.concat([test["Next_WAR"], preds], axis=1)
		combined.columns = ["actual", 'prediction']


		all_predictions.append(combined)
	return pd.concat(all_predictions)
		

In [33]:
predictions = backtest(batting, rr, predictors)

In [34]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.7342013213217045

In [35]:
batting["Next_WAR"].describe()

count    5575.000000
mean        1.792969
std         1.980831
min        -3.100000
25%         0.400000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [55]:
def player_history(df):
	df = df.sort_values("Season")
    
	df["player_season"] = range(0, df.shape[0])
	# correlation between wars
	df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
	df["war_corr"] = df["war_corr"].fillna(1)
	# ratio between current war and prev season war
	df["war_diff"] = df["WAR"] / df["WAR"].shift(1) # shift 1 brings prev season value up to curr
	df["war_diff"] = df["war_diff"].fillna(1)

	df.loc[df["war_diff"] == np.inf, ["war_diff"]] = 1

	return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

  batting = batting.groupby("IDfg", group_keys=False).apply(player_history)


In [56]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [57]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

  batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)


In [58]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [59]:
predictions = backtest(batting, rr, new_predictors)

In [60]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.6783293216652155

In [64]:
# looking at the importance of each predictor, farther from 0 coeff means more important(i think, might just be larger)
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.674781
BABIP           -1.897843
WAR             -1.850495
Soft%+          -1.325737
BU              -1.114548
SLG+            -1.068448
SO              -0.889178
PH              -0.757730
wCH             -0.307237
CB%             -0.299943
war_diff        -0.282354
CH%             -0.279926
Pull%+          -0.174824
war_corr        -0.137182
player_season   -0.002287
Unnamed: 0       0.319975
IFH              0.652026
Oppo%            0.722217
Spd              0.752715
SB               0.964739
OBP+             1.092136
IBB              1.686528
Hard%+           2.454632
war_season       3.167113
dtype: float64

In [65]:
diff = predictions["actual"] - predictions["prediction"]

In [66]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [69]:
# more diagnostics, see difference between prediction and actual
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()
merged

Unnamed: 0.1,actual,prediction,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,...,CStr%,CSW%,L-WAR,Next_WAR,team_codes,player_season,war_corr,war_diff,war_season,diff
2607,1.2,1.483980,0.703782,1,2007,Alfredo Amezaga,FLA,0.384615,0.743590,0.431655,...,0.527660,0.396476,0.322785,1.2,0.352941,1,1.000000,1.214286,0.998259,0.283980
3390,1.4,0.497613,0.270631,2,2007,Garret Anderson,LAA,0.615385,0.529915,0.462230,...,0.442553,0.480176,0.284810,1.4,0.441176,5,-0.692192,1.406250,0.880816,0.902387
4582,-0.1,0.210693,0.436103,10,2007,David Eckstein,STL,0.500000,0.606838,0.492806,...,0.676596,0.436123,0.240506,-0.1,0.852941,5,-0.694330,0.826087,0.743801,0.310693
4670,0.6,1.028619,0.814987,11,2007,Darin Erstad,CHW,0.538462,0.350427,0.269784,...,0.765957,0.691630,0.240506,0.6,0.205882,4,-0.828562,0.791667,0.743801,0.428619
1757,4.8,1.942259,0.155912,15,2007,Troy Glaus,TOR,0.423077,0.589744,0.404676,...,0.634043,0.704846,0.367089,4.8,0.970588,5,0.231396,0.892308,1.135274,2.857741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,2.2,2.644889,0.269085,23667,2021,Wander Franco,TBR,0.038462,0.205128,0.217626,...,0.391489,0.352423,0.348101,2.2,0.911765,0,1.000000,1.000000,1.062129,0.444889
3796,0.8,1.991924,0.825952,24618,2021,Ryan Jeffers,MIN,0.192308,0.333333,0.192446,...,0.514894,0.788546,0.240506,0.8,0.558824,0,1.000000,1.000000,0.830392,1.191924
6696,0.7,1.392724,0.988612,24655,2021,Owen Miller,CLE,0.192308,0.119658,0.055755,...,0.548936,0.700441,0.139241,0.7,0.264706,0,1.000000,1.000000,0.463475,0.692724
6108,-0.5,1.599140,0.686208,26197,2021,Andrew Vaughn,CHW,0.153846,0.692308,0.462230,...,0.570213,0.651982,0.170886,-0.5,0.205882,0,1.000000,1.000000,0.560032,2.099140
