In [5]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [10]:
START = 2014
END = 2024

In [12]:
batting = batting_stats(START,END,qual=200)
batting.to_csv("batting.csv")
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [13]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246,0.609,404,0.169,0.287,,,,11.4
4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217,0.500,434,0.220,0.270,,,,10.4
0,15640,2024,Aaron Judge,NYY,32,148,526,660,169,81,...,117.5,225,0.608,370,0.150,0.270,,,,10.2
62,25764,2024,Bobby Witt Jr.,KCR,24,150,598,662,198,113,...,116.9,245,0.484,506,0.137,0.237,,,,9.4
5,10155,2018,Mike Trout,LAA,26,140,471,608,147,80,...,118.0,162,0.460,352,0.201,0.261,,,,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2983,393,2015,Victor Martinez,DET,36,120,440,485,108,77,...,108.9,131,0.332,395,0.163,0.223,,,,-2.0
3501,3179,2016,Dioner Navarro,- - -,32,101,304,334,63,42,...,104.3,52,0.218,238,0.187,0.287,,,,-1.8
3648,12155,2024,Eddie Rosario,- - -,32,91,297,319,52,30,...,110.2,96,0.425,226,0.146,0.281,,,,-2.2
3674,3448,2019,Jeff Mathis,TEX,36,88,228,244,36,25,...,105.5,37,0.261,142,0.155,0.322,,,,-2.1


In [16]:
# each player is split into group, and each group we are computing the next WAR for next season
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [18]:
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
3245,Will Venable,2014,0.7,1.1
2721,Will Venable,2015,1.1,
55,Victor Martinez,2014,4.6,-2.0
2983,Victor Martinez,2015,-2.0,0.9
709,Victor Martinez,2016,0.9,-0.9
...,...,...,...,...
570,Seiya Suzuki,2024,3.0,
2487,Zach Neto,2023,1.1,3.2
1310,Zach Neto,2024,3.2,
1076,Masataka Yoshida,2023,0.6,1.0


In [20]:
null_count = batting.isnull().sum()

In [22]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         3417
xSLG        3417
xwOBA       3417
L-WAR          0
Next_WAR     745
Length: 321, dtype: int64

In [24]:
complete_cols = list(batting.columns[null_count == 0])

In [28]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [30]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Soft%+,Med%+,Hard%+,Barrels,HardHit,Events,CStr%,CSW%,L-WAR,Next_WAR
3245,211,2014,Will Venable,SDP,31,146,406,448,91,68,...,111,107,81,0,0,0,0.162,0.279,0.7,1.1
2721,211,2015,Will Venable,- - -,32,135,349,390,85,63,...,139,89,96,6,85,257,0.183,0.284,1.1,
55,393,2014,Victor Martinez,DET,35,151,561,641,188,123,...,56,95,136,0,0,0,0.183,0.217,4.6,-2.0
2983,393,2015,Victor Martinez,DET,36,120,440,485,108,77,...,96,98,107,12,131,395,0.163,0.223,-2.0,0.9
709,393,2016,Victor Martinez,DET,37,154,553,610,160,111,...,87,90,123,34,194,466,0.158,0.231,1.0,-0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,30116,2024,Seiya Suzuki,CHC,29,122,477,542,131,80,...,87,90,123,39,161,332,0.207,0.295,3.4,
2487,31347,2023,Zach Neto,LAA,22,84,289,329,65,39,...,81,108,97,19,86,216,0.161,0.290,1.1,3.2
1310,31347,2024,Zach Neto,LAA,23,145,504,561,127,74,...,86,102,104,33,146,382,0.162,0.286,3.1,
1076,31837,2023,Masataka Yoshida,BOS,29,140,537,580,155,104,...,123,99,91,30,186,458,0.212,0.285,0.6,1.0


In [32]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 237, dtype: object

In [34]:
# delete strings for ML model
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [36]:
batting["Dol"]

3245       $5.6
2721       $8.7
55        $35.0
2983    ($15.8)
709        $7.5
         ...   
570       $24.2
2487       $8.9
1310      $25.6
1076       $4.8
819        $8.0
Name: Dol, Length: 3417, dtype: object

In [38]:
del batting["Dol"]

In [40]:
batting["Age Rng"]

3245    31 - 31
2721    32 - 32
55      35 - 35
2983    36 - 36
709     37 - 37
         ...   
570     29 - 29
2487    22 - 22
1310    23 - 23
1076    29 - 29
819     30 - 30
Name: Age Rng, Length: 3417, dtype: object

In [42]:
del batting ["Age Rng"]

In [44]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [60]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [52]:
#feature selector
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [64]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [66]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [70]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Med%+,Hard%+,Barrels,HardHit,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,...,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0,2672.0
mean,11434.266841,2018.341692,0.387025,0.631933,0.484556,0.475577,0.401651,0.357315,0.345638,0.139321,...,0.509177,0.470563,0.191264,0.345353,0.476845,0.429918,0.57943,0.32735,1.754603,0.479391
std,5817.800344,2.938598,0.155234,0.268864,0.256725,0.272797,0.210147,0.179149,0.166356,0.143645,...,0.135642,0.144138,0.149849,0.195875,0.24685,0.149195,0.121261,0.131771,1.871375,0.307899
min,211.0,2014.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.5,0.0
25%,6547.0,2016.0,0.26087,0.439655,0.265267,0.244123,0.225641,0.210191,0.210526,0.0,...,0.42029,0.377049,0.075472,0.206452,0.307179,0.330144,0.497797,0.235714,0.4,0.2
50%,11477.0,2018.0,0.391304,0.689655,0.498092,0.488246,0.397436,0.343949,0.333333,0.133333,...,0.507246,0.47541,0.160377,0.341935,0.494992,0.425837,0.581498,0.307143,1.4,0.466667
75%,15297.0,2021.0,0.478261,0.862069,0.708015,0.714286,0.564103,0.484076,0.45614,0.2,...,0.594203,0.565574,0.283019,0.493548,0.676127,0.5311,0.660793,0.4,2.8,0.733333
max,31837.0,2023.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.1,1.0


In [75]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [77]:
predictors = list(selected_columns[sfs.get_support()])

In [89]:
# generate predictions for us, only using past data to predict future data rather than cross validating
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    years = sorted(data["Season"].unique())

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [91]:
predictions = backtest(batting, rr, predictors)

In [93]:
predictions

Unnamed: 0,actual,prediction
2240,0.1,-0.586765
1827,0.3,-0.350429
1101,0.8,1.183308
228,0.8,2.839742
35,1.9,1.181905
...,...,...
2199,1.5,2.127515
581,1.6,2.806630
552,3.0,2.731215
2487,3.2,2.298302


In [97]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.4766748349695797

In [99]:
batting["Next_WAR"].describe()

count    2672.000000
mean        1.754603
std         1.871375
min        -2.500000
25%         0.400000
50%         1.400000
75%         2.800000
max        11.100000
Name: Next_WAR, dtype: float64

In [101]:
2.4766748349695797 ** .5

1.5737454797296733

In [107]:
def player_history(df):
    df = df.sort_values("Season")

    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
    df["war_corr"].fillna(1, inplace=True)

    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)

    df["war_diff"][df["war_diff"] == np.inf] = 1

    return df
batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [109]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [111]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [115]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [117]:
predictions = backtest(batting, rr, new_predictors)

In [119]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.314519453920971

In [123]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -3.187484
Hard%           -2.211383
WAR             -2.001281
G               -1.878124
BABIP           -1.282331
BU              -1.198756
O-Swing% (sc)   -1.047562
CSW%            -0.883180
HR/FB           -0.864172
Pull%           -0.630402
vCH (sc)        -0.627419
Clutch          -0.533330
war_diff        -0.388085
war_corr        -0.214787
player_season    0.050871
PH               0.433749
HardHit          0.522575
Def              0.769284
IBB              0.890508
Spd              1.065758
F-Strike%        1.129046
Pitches          2.169500
war_season       2.603922
Hard%+           3.677134
dtype: float64

In [125]:
diff = predictions["actual"] = predictions["prediction"]

In [127]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [129]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [133]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
2240,1177,2019,Albert Pujols,0.161765,0.1,0.0
1795,17027,2022,Alex Verdugo,0.264706,1.4,0.0
1439,16997,2022,Gleyber Torres,0.375000,3.6,0.0
2278,16939,2022,Lane Thomas,0.279412,3.2,0.0
2362,16930,2022,Jonah Heim,0.382353,4.0,0.0
...,...,...,...,...,...,...
856,5760,2021,Avisail Garcia,0.352941,-1.0,0.0
3311,5517,2021,Kyle Higashioka,0.235294,1.6,0.0
3487,5497,2021,Marwin Gonzalez,0.139706,0.3,0.0
1215,6153,2021,Eduardo Escobar,0.367647,1.9,0.0
