In [498]:
#import necessary libraries to gather and clean data
import pandas as pd
import numpy as np
from pybaseball import batting_stats
import os

In [499]:
#load data from pybaseball (takes a long time to load so lets toss the data to csv for future use)
if os.path.exists('batting-stats-2000-2024.csv'):
    stats = pd.read_csv('batting-stats-2000-2024.csv', index_col=0)
else:
    stats = batting_stats(start_season=2000, end_season=2024, qual=200)
    stats.to_csv('batting-stats-2000-2024.csv')

In [500]:
#only keep players with more than one season
stats = stats.groupby('IDfg', group_keys=False).filter(lambda x: len(x) > 1)

In [501]:
stats.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,GB,FB,LD,IFFB,Pitches,Balls,Strikes,IFH,BU,BUH,BB%,K%,BB/K,OBP,SLG,...,O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi),FRM,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,LD+%,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,31,2,46,117,110,198,68,47,9,2,0,4,9,2,0.37,109.0,169.0,80.0,16.0,2398.0,1336.0,1062.0,5.0,0.0,0.0,0.324,0.077,4.21,0.582,0.799,...,,,,,,,,,,139,347,47,171,189,273,111,1.05,70.0,135.0,249.0,105.0,106.0,87.0,64.0,83.0,171.0,,,,,,,,0,0.127,0.191,,,,12.7
2,1109,2001,Barry Bonds,SFG,36,153,476,664,156,49,32,2,73,129,137,177,35,93,9,2,0,5,13,3,0.328,,,,,,,,,,,0.267,0.14,1.9,0.515,0.863,...,,,,,,,,,,122,298,82,152,196,313,89,,,,,,,,,,,,,,,,,,0,,,,,,12.5
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,27,3,45,129,101,232,120,41,9,3,0,5,6,1,0.362,117.0,156.0,62.0,22.0,2425.0,1444.0,981.0,8.0,0.0,0.0,0.376,0.066,5.66,0.609,0.812,...,,,,,,,,,,134,416,41,179,186,269,104,0.9,80.0,130.0,249.0,112.0,96.0,86.0,65.0,76.0,171.0,,,,,,,,0,0.124,0.164,,,,11.9
18,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,28,0,62,133,131,111,19,175,6,5,0,14,16,3,0.311,149.0,174.0,77.0,8.0,2906.0,1176.0,1730.0,11.0,0.0,0.0,0.159,0.251,0.63,0.425,0.686,...,0.224,0.654,0.427,0.452,0.826,0.722,0.471,24.5,,128,200,113,137,175,251,117,0.97,87.0,116.0,324.0,118.0,91.0,84.0,46.0,83.0,159.0,95.8,14.9,106.0,0.262,118.4,246.0,0.609,404,0.169,0.287,,,,11.6
3,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,22,1,45,111,90,148,61,58,10,2,0,7,7,0,0.341,101.0,152.0,81.0,21.0,2175.0,1141.0,1034.0,7.0,0.0,0.0,0.269,0.105,2.55,0.529,0.749,...,,,,,,,,,,127,298,65,155,174,251,102,1.09,69.0,134.0,256.0,109.0,116.0,71.0,93.0,78.0,161.0,,,,,,,,0,0.135,0.223,,,,10.2


In [None]:
#create a column that shows a player's ops from the future season
def next_ops(player):
    player = player.sort_values('Season')
    player['Next_OPS'] = player['OPS'].shift(-1)
    return player

stats = stats.groupby('IDfg', group_keys=False).apply(next_ops)

In [503]:
stats[['Name','Season','OPS','Next_OPS']].head()

Unnamed: 0,Name,Season,OPS,Next_OPS
6575,Alfredo Amezaga,2006,0.664,0.682
5923,Alfredo Amezaga,2007,0.682,0.679
6214,Alfredo Amezaga,2008,0.679,
2547,Garret Anderson,2000,0.827,0.792
3359,Garret Anderson,2001,0.792,0.871


In [504]:
#drop all columns with null values except our next ops column
temp = stats['Next_OPS']
stats.dropna(axis='columns', inplace=True)
stats['Next_OPS'] = temp

#also drop 'Events' because it seems that is not recorded for most players but they are given 0 instead of null
stats.drop(columns='Events', inplace=True)
stats.head()


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Age Rng,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS
6575,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,9,3,3,42,19,33,4,46,3,1,7,5,20,12,0.26,0.087,0.122,0.72,0.332,0.332,0.664,0.072,0.294,0.296,-11.5,36,-12.4,11.9,1.2,11.9,1.1,5.9,75,-1.48,-6.93,5.45,-19.65,-1.96,0.98,26,-1.14,-0.37,2.8,8.8,-1.4,28 - 28,-9.6,0.8,0.217,96,98,74,97,75,42,97,1.1,0.682
5923,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,14,9,2,46,30,35,0,52,4,5,4,4,13,7,0.263,0.078,0.116,0.67,0.324,0.358,0.682,0.095,0.293,0.305,-9.9,45,-12.4,13.9,1.8,20.5,2.0,6.4,79,-2.52,-8.64,6.13,-18.88,-1.82,1.04,26,-0.76,-1.68,0.9,17.2,-0.8,29 - 29,-11.5,0.9,0.199,96,88,71,95,82,58,96,2.0,0.679
6214,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,13,5,3,41,32,19,1,47,3,0,4,6,8,2,0.264,0.056,0.139,0.4,0.312,0.367,0.679,0.103,0.303,0.301,-7.5,33,-9.6,10.3,1.6,11.7,1.2,6.8,77,-0.59,-6.14,5.55,-8.48,-0.82,1.05,25,-0.96,0.39,2.4,8.0,0.5,30 - 30,-7.1,0.6,0.205,99,62,81,92,86,65,100,1.2,
2547,2,2000,Garret Anderson,ANA,28,159,647,681,185,107,40,3,35,92,117,24,5,87,0,9,1,21,7,6,0.286,0.035,0.128,0.28,0.307,0.519,0.827,0.233,0.281,0.345,2.3,92,-0.8,22.0,0.4,23.8,2.2,3.8,99,-2.17,-14.93,12.76,-9.29,-0.82,1.11,1,-0.32,-1.63,-1.3,1.4,-1.3,28 - 28,-2.1,2.5,0.214,104,37,82,88,117,139,93,2.2,0.792
3359,2,2001,Garret Anderson,ANA,29,161,672,704,194,125,39,2,28,83,123,27,4,100,0,5,0,12,13,6,0.289,0.038,0.142,0.27,0.314,0.478,0.792,0.189,0.302,0.334,4.3,92,3.4,22.0,-7.6,27.2,2.7,4.0,104,1.55,-13.88,15.43,13.57,1.4,1.08,0,1.2,0.23,0.0,-0.6,0.0,29 - 29,3.4,2.5,0.22,108,46,86,94,111,117,102,2.7,0.871


In [505]:
#lets modify some string types to help with ml
stats.dtypes[stats.dtypes == 'object']

Name       object
Team       object
Age Rng    object
dtype: object

In [506]:
#no need for age range
stats.drop(columns='Age Rng', inplace=True)

In [507]:
#convert team name to team number
numbers, teams = stats['Team'].factorize()
stats['Team_Num'] = numbers
stats.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS,Team_Num
6575,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,9,3,3,42,19,33,4,46,3,1,7,5,20,12,0.260,0.087,0.122,0.72,0.332,0.332,0.664,0.072,0.294,0.296,-11.5,36,-12.4,11.9,1.2,11.9,1.1,5.9,75,-1.48,-6.93,5.45,-19.65,-1.96,0.98,26,-1.14,-0.37,2.8,8.8,-1.4,-9.6,0.8,0.217,96,98,74,97,75,42,97,1.1,0.682,0
5923,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,14,9,2,46,30,35,0,52,4,5,4,4,13,7,0.263,0.078,0.116,0.67,0.324,0.358,0.682,0.095,0.293,0.305,-9.9,45,-12.4,13.9,1.8,20.5,2.0,6.4,79,-2.52,-8.64,6.13,-18.88,-1.82,1.04,26,-0.76,-1.68,0.9,17.2,-0.8,-11.5,0.9,0.199,96,88,71,95,82,58,96,2.0,0.679,0
6214,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,13,5,3,41,32,19,1,47,3,0,4,6,8,2,0.264,0.056,0.139,0.40,0.312,0.367,0.679,0.103,0.303,0.301,-7.5,33,-9.6,10.3,1.6,11.7,1.2,6.8,77,-0.59,-6.14,5.55,-8.48,-0.82,1.05,25,-0.96,0.39,2.4,8.0,0.5,-7.1,0.6,0.205,99,62,81,92,86,65,100,1.2,,0
2547,2,2000,Garret Anderson,ANA,28,159,647,681,185,107,40,3,35,92,117,24,5,87,0,9,1,21,7,6,0.286,0.035,0.128,0.28,0.307,0.519,0.827,0.233,0.281,0.345,2.3,92,-0.8,22.0,0.4,23.8,2.2,3.8,99,-2.17,-14.93,12.76,-9.29,-0.82,1.11,1,-0.32,-1.63,-1.3,1.4,-1.3,-2.1,2.5,0.214,104,37,82,88,117,139,93,2.2,0.792,1
3359,2,2001,Garret Anderson,ANA,29,161,672,704,194,125,39,2,28,83,123,27,4,100,0,5,0,12,13,6,0.289,0.038,0.142,0.27,0.314,0.478,0.792,0.189,0.302,0.334,4.3,92,3.4,22.0,-7.6,27.2,2.7,4.0,104,1.55,-13.88,15.43,13.57,1.40,1.08,0,1.20,0.23,0.0,-0.6,0.0,3.4,2.5,0.220,108,46,86,94,111,117,102,2.7,0.871,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,1009818,2001,Paul O'Neill,NYY,38,137,510,563,136,81,33,1,21,77,70,48,4,59,2,3,0,20,22,3,0.267,0.085,0.105,0.81,0.330,0.459,0.789,0.192,0.266,0.337,4.7,75,5.1,17.6,-6.3,6.4,0.6,5.5,107,0.99,-10.52,11.51,-0.08,-0.02,1.04,4,0.47,0.49,3.1,-21.3,3.1,8.2,2.0,0.227,100,103,64,99,107,119,89,0.6,,17
3907,1010978,2000,Cal Ripken,BAL,39,83,309,339,79,48,16,0,15,43,56,23,0,37,3,4,0,10,0,0,0.256,0.068,0.109,0.62,0.310,0.453,0.763,0.197,0.245,0.328,-3.8,41,-3.7,11.0,0.0,13.5,1.3,1.5,92,0.12,-6.27,6.39,-3.07,-0.26,0.98,0,-0.04,0.16,0.0,5.0,0.0,-3.6,1.2,0.221,93,71,70,89,102,117,81,1.3,0.637,27
7614,1010978,2001,Cal Ripken,BAL,40,128,477,516,114,84,16,0,14,43,68,26,1,63,2,9,2,15,0,2,0.239,0.050,0.122,0.41,0.276,0.361,0.637,0.122,0.244,0.275,-22.7,41,-21.4,16.1,0.2,-5.3,-0.5,1.3,67,-2.03,-9.86,7.82,-17.45,-1.78,1.02,3,-2.18,0.19,-0.9,-0.8,-0.9,-22.3,1.8,0.200,89,61,74,83,84,75,82,-0.5,,27
960,1013404,2000,Quilvio Veras,ATL,29,84,298,364,92,72,15,0,5,56,37,51,0,50,5,4,6,8,25,12,0.309,0.140,0.137,1.02,0.413,0.409,0.823,0.101,0.352,0.374,10.5,58,9.2,11.8,1.2,28.2,2.7,4.9,119,0.91,-6.49,7.41,13.17,1.36,1.04,2,1.44,-0.56,-0.5,7.2,-0.5,8.7,0.5,0.291,113,140,85,118,92,58,117,2.7,0.686,3


In [508]:
#drop rows with null next_ops values (keep copy for later use)
complete_stats = stats.copy()
stats.dropna(inplace=True)

In [509]:
#import machine learning libraries / functions
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

#higher alpha reduces overfitting, lower is more similar to linear regression (linear regression yields better results than ridge regression at least for training)
linear_regression = LinearRegression()

#split time in 3 in a chronological way
split = TimeSeriesSplit(n_splits=3)

#go through all features and find the 20 'best' features one by one
sfs = SequentialFeatureSelector(linear_regression, n_features_to_select=20, direction='forward',n_jobs=3)


In [510]:
#remove predicted value column, string columns, and general columns we do not want in the sfs
#also dropping games column because that is mostly injury luck
non_sfs_columns = ['Next_OPS','Name','Team','IDfg','Season','G']
sfs_columns = stats.columns.drop(non_sfs_columns)


In [511]:
#scale values such that they are between 0 and 1, no negatives
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])

  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stat

In [512]:
stats.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS,Team_Num
6575,1,2006,Alfredo Amezaga,FLA,0.346154,132,0.31295,0.307958,0.24569,0.278302,0.152542,0.130435,0.041096,0.225352,0.078431,0.134783,0.033333,0.176744,0.088235,0.0625,0.291667,0.15625,0.25641,0.5,0.504425,0.210383,0.2075,0.120996,0.363218,0.189313,0.271154,0.117871,0.449057,0.331536,0.170199,0.18408,0.197516,0.335135,0.637011,0.269691,0.265823,0.597938,0.304527,0.27008,0.659626,0.222725,0.197592,0.186608,0.423077,0.313253,0.206047,0.484594,0.572491,0.636625,0.1875,0.210396,0.423077,0.243636,0.425532,0.212871,0.278049,0.349206,0.171233,0.117264,0.433333,0.265823,0.682,0.0
5923,1,2007,Alfredo Amezaga,FLA,0.384615,133,0.431655,0.429066,0.323276,0.316038,0.237288,0.391304,0.027397,0.253521,0.150327,0.143478,0.0,0.204651,0.117647,0.3125,0.166667,0.125,0.166667,0.291667,0.517699,0.185792,0.1925,0.1121,0.344828,0.229008,0.288462,0.161597,0.445283,0.355795,0.180475,0.228856,0.197516,0.443243,0.658363,0.323882,0.322785,0.649485,0.320988,0.217871,0.520715,0.261164,0.20175,0.194292,0.538462,0.313253,0.227324,0.30112,0.501859,0.717162,0.229167,0.198639,0.442308,0.210909,0.425532,0.188119,0.263415,0.333333,0.219178,0.169381,0.422222,0.322785,0.679,0.0
2547,2,2000,Garret Anderson,ANA,0.346154,159,0.875899,0.83218,0.668103,0.443396,0.677966,0.130435,0.479452,0.577465,0.718954,0.095652,0.041667,0.367442,0.0,0.5625,0.041667,0.65625,0.089744,0.25,0.619469,0.068306,0.2225,0.042705,0.305747,0.474809,0.427885,0.423954,0.4,0.463612,0.258831,0.462687,0.269565,0.881081,0.608541,0.344675,0.335443,0.381443,0.403292,0.235442,0.009748,0.635953,0.253537,0.249177,0.673077,0.012048,0.25196,0.308123,0.420074,0.565676,0.194444,0.256807,0.75,0.238182,0.510638,0.061881,0.317073,0.277778,0.458904,0.433225,0.388889,0.335443,0.792,0.029412
3359,2,2001,Garret Anderson,ANA,0.384615,161,0.920863,0.871972,0.706897,0.528302,0.661017,0.086957,0.383562,0.514085,0.75817,0.108696,0.033333,0.427907,0.0,0.3125,0.0,0.375,0.166667,0.25,0.632743,0.076503,0.2575,0.040925,0.321839,0.412214,0.394231,0.340304,0.479245,0.433962,0.271676,0.462687,0.295652,0.881081,0.323843,0.3661,0.367089,0.402062,0.423868,0.422189,0.095045,0.786885,0.376985,0.371021,0.615385,0.0,0.337066,0.568627,0.468401,0.5465,0.284722,0.290842,0.75,0.249091,0.553191,0.084158,0.336585,0.325397,0.417808,0.361564,0.488889,0.367089,0.871,0.029412
1396,2,2002,Garret Anderson,ANA,0.423077,158,0.859712,0.82699,0.711207,0.443396,0.949153,0.130435,0.39726,0.584507,0.75817,0.121739,0.091667,0.334884,0.0,0.625,0.0,0.34375,0.076923,0.166667,0.707965,0.092896,0.1975,0.060498,0.363218,0.505344,0.470192,0.425856,0.501887,0.514825,0.379576,0.512438,0.396273,0.810811,0.345196,0.429742,0.43038,0.381443,0.502058,0.436747,0.272136,0.680045,0.452263,0.443469,0.5,0.012048,0.386898,0.497199,0.572491,0.474593,0.25,0.408416,0.711538,0.221818,0.638298,0.101485,0.273171,0.373016,0.527397,0.452769,0.522222,0.43038,0.885,0.029412


In [513]:
#train data
sfs.fit(stats[sfs_columns], stats['Next_OPS'])

In [None]:
#return selected features aka predictors
predictor_list = list(sfs_columns[sfs.get_support()])
predictor_list

['Age',
 'AB',
 'R',
 'IBB',
 'SO',
 'SH',
 'GDP',
 'CS',
 'BB/K',
 'ISO',
 'BABIP',
 'wOBA',
 'wRAA',
 'Pos',
 'wRC+',
 'REW',
 'Def',
 'wSB',
 'BB%+',
 'K%+']

In [None]:
#train data and predict
def backtest(data, model, predictors, start=5, step=1):
    predictions = []
    years = sorted(data['Season'].unique())
    
    for i in range(start, len(years), step):
        curr_year = years[i]
        
        train = data[data['Season'] < curr_year]
        test = data[data['Season'] == curr_year]
        
        model.fit(train[predictors], train['Next_OPS'])
        
        pred = model.predict(test[predictors])
        pred = pd.Series(pred, index=test.index)
        
        combined = pd.concat([test['Next_OPS'], pred], axis=1)
        combined.columns = ['actual', 'prediction']
        
        predictions.append(combined)
        
    return pd.concat(predictions)

In [None]:
#return predictions
predictions = backtest(stats, linear_regression, predictor_list)
predictions

Unnamed: 0,actual,prediction
4971,0.756,0.752395
3085,0.694,0.726207
5732,0.645,0.723824
791,0.868,0.853943
4333,0.718,0.693636
...,...,...
5571,0.849,0.713191
5512,0.677,0.735463
1737,0.614,0.763012
1663,0.797,0.771999


In [None]:
#check accuracy of prediction (compare mean squared error with std)
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions['actual'], predictions['prediction'])

0.007177099358276269

In [None]:
stats['Next_OPS'].describe()

count    6644.000000
mean        0.755110
std         0.103076
min         0.398000
25%         0.686000
50%         0.748500
75%         0.816000
max         1.422000
Name: Next_OPS, dtype: float64

In [None]:
#improve ml by using player trends by season
def player_history(df):
    df = df.sort_values('Season')
    
    df['Player_Season'] = range(0,len(df))
    df['OPS_Corr'] = list(df[['Player_Season','OPS']].expanding().corr().loc[(slice(None),'Player_Season'), 'OPS'])
    df.fillna({'OPS_Corr': 1}, inplace=True)
    
    df['OPS_Diff'] = df['OPS'] / df['OPS'].shift(1)
    df.fillna({'OPS_Diff': 1}, inplace=True)
    
    df.loc[df['OPS_Diff'] == np.inf, 'OPS_Diff'] = 1
    
    return df

stats = stats.groupby('IDfg', group_keys=False).apply(player_history)

In [None]:
def group_averages(df):
    return df['OPS'] / df['OPS'].mean()

In [None]:
stats['OPS_Season'] = stats.groupby('Season', group_keys=False).apply(group_averages)

In [None]:
stats

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS,Team_Num,Player_Season,OPS_Corr,OPS_Diff,OPS_Season
6575,1,2006,Alfredo Amezaga,FLA,0.346154,132,0.312950,0.307958,0.245690,0.278302,0.152542,0.130435,0.041096,0.225352,0.078431,0.134783,0.033333,0.176744,0.088235,0.0625,0.291667,0.15625,0.256410,0.500000,0.504425,0.210383,0.2075,0.120996,0.363218,0.189313,0.271154,0.117871,0.449057,0.331536,0.170199,0.184080,0.197516,0.335135,0.637011,0.269691,0.265823,0.597938,0.304527,0.270080,0.659626,0.222725,0.197592,0.186608,0.423077,0.313253,0.206047,0.484594,0.572491,0.636625,0.187500,0.210396,0.423077,0.243636,0.425532,0.212871,0.278049,0.349206,0.171233,0.117264,0.433333,0.265823,0.682,0.000000,0,1.000000,1.000000,0.680217
5923,1,2007,Alfredo Amezaga,FLA,0.384615,133,0.431655,0.429066,0.323276,0.316038,0.237288,0.391304,0.027397,0.253521,0.150327,0.143478,0.000000,0.204651,0.117647,0.3125,0.166667,0.12500,0.166667,0.291667,0.517699,0.185792,0.1925,0.112100,0.344828,0.229008,0.288462,0.161597,0.445283,0.355795,0.180475,0.228856,0.197516,0.443243,0.658363,0.323882,0.322785,0.649485,0.320988,0.217871,0.520715,0.261164,0.201750,0.194292,0.538462,0.313253,0.227324,0.301120,0.501859,0.717162,0.229167,0.198639,0.442308,0.210909,0.425532,0.188119,0.263415,0.333333,0.219178,0.169381,0.422222,0.322785,0.679,0.000000,1,1.000000,1.063830,0.749014
2547,2,2000,Garret Anderson,ANA,0.346154,159,0.875899,0.832180,0.668103,0.443396,0.677966,0.130435,0.479452,0.577465,0.718954,0.095652,0.041667,0.367442,0.000000,0.5625,0.041667,0.65625,0.089744,0.250000,0.619469,0.068306,0.2225,0.042705,0.305747,0.474809,0.427885,0.423954,0.400000,0.463612,0.258831,0.462687,0.269565,0.881081,0.608541,0.344675,0.335443,0.381443,0.403292,0.235442,0.009748,0.635953,0.253537,0.249177,0.673077,0.012048,0.251960,0.308123,0.420074,0.565676,0.194444,0.256807,0.750000,0.238182,0.510638,0.061881,0.317073,0.277778,0.458904,0.433225,0.388889,0.335443,0.792,0.029412,0,1.000000,1.000000,1.047719
3359,2,2001,Garret Anderson,ANA,0.384615,161,0.920863,0.871972,0.706897,0.528302,0.661017,0.086957,0.383562,0.514085,0.758170,0.108696,0.033333,0.427907,0.000000,0.3125,0.000000,0.37500,0.166667,0.250000,0.632743,0.076503,0.2575,0.040925,0.321839,0.412214,0.394231,0.340304,0.479245,0.433962,0.271676,0.462687,0.295652,0.881081,0.323843,0.366100,0.367089,0.402062,0.423868,0.422189,0.095045,0.786885,0.376985,0.371021,0.615385,0.000000,0.337066,0.568627,0.468401,0.546500,0.284722,0.290842,0.750000,0.249091,0.553191,0.084158,0.336585,0.325397,0.417808,0.361564,0.488889,0.367089,0.871,0.029412,1,-1.000000,0.921348,1.026145
1396,2,2002,Garret Anderson,ANA,0.423077,158,0.859712,0.826990,0.711207,0.443396,0.949153,0.130435,0.397260,0.584507,0.758170,0.121739,0.091667,0.334884,0.000000,0.6250,0.000000,0.34375,0.076923,0.166667,0.707965,0.092896,0.1975,0.060498,0.363218,0.505344,0.470192,0.425856,0.501887,0.514825,0.379576,0.512438,0.396273,0.810811,0.345196,0.429742,0.430380,0.381443,0.502058,0.436747,0.272136,0.680045,0.452263,0.443469,0.500000,0.012048,0.386898,0.497199,0.572491,0.474593,0.250000,0.408416,0.711538,0.221818,0.638298,0.101485,0.273171,0.373016,0.527397,0.452769,0.522222,0.430380,0.885,0.029412,2,0.555761,1.192683,1.249627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,1008559,2000,Mark McGwire,STL,0.653846,89,0.136691,0.209343,0.181034,0.089623,0.135593,0.000000,0.438356,0.352113,0.431373,0.321739,0.100000,0.325581,0.205882,0.1250,0.000000,0.15625,0.012821,0.000000,0.703540,0.620219,0.5100,0.165480,0.710345,0.821374,0.814423,0.819392,0.520755,0.865229,0.518947,0.427861,0.532919,0.254054,0.416370,0.507876,0.487342,0.195876,0.798354,0.505020,0.835906,0.364613,0.544497,0.524149,0.403846,0.156627,0.512878,0.390756,0.479554,0.523490,0.305556,0.529084,0.365385,0.901818,0.595745,0.556931,0.648780,0.674603,0.801370,0.807818,0.511111,0.487342,0.808,0.117647,0,1.000000,1.000000,1.994198
6398,1008712,2000,Pat Meares,PIT,0.461538,132,0.543165,0.543253,0.349138,0.287736,0.372881,0.086957,0.178082,0.316901,0.261438,0.147826,0.050000,0.386047,0.235294,0.1875,0.208333,0.40625,0.012821,0.000000,0.415929,0.163934,0.3450,0.064057,0.301149,0.264122,0.291346,0.249049,0.362264,0.336927,0.122030,0.243781,0.138509,0.589189,0.790036,0.183995,0.183544,0.298969,0.279835,0.302209,0.446791,0.407010,0.252025,0.243139,0.519231,0.072289,0.194289,0.610644,0.479554,0.576222,0.305556,0.135520,0.423077,0.343636,0.340426,0.143564,0.448780,0.269841,0.239726,0.244300,0.355556,0.183544,0.548,0.558824,0,1.000000,1.000000,0.713391
3655,1009818,2000,Paul O'Neill,NYY,0.692308,142,0.730216,0.740484,0.560345,0.485849,0.440678,0.000000,0.246575,0.485915,0.607843,0.213043,0.016667,0.381395,0.000000,0.6875,0.000000,0.53125,0.179487,0.375000,0.606195,0.193989,0.2600,0.094306,0.372414,0.329771,0.363462,0.249049,0.483019,0.425876,0.210019,0.388060,0.234161,0.789189,0.377224,0.293636,0.291139,0.350515,0.374486,0.299699,0.287571,0.514415,0.265309,0.259056,0.403846,0.012048,0.219485,0.533613,0.420074,0.560882,0.194444,0.221535,0.711538,0.309091,0.489362,0.180693,0.365854,0.341270,0.308219,0.254072,0.466667,0.291139,0.789,0.500000,0,1.000000,1.000000,0.889973
3907,1010978,2000,Cal Ripken,BAL,0.769231,83,0.267986,0.240484,0.211207,0.165094,0.271186,0.000000,0.205479,0.232394,0.320261,0.091304,0.000000,0.134884,0.088235,0.2500,0.000000,0.31250,0.000000,0.000000,0.486726,0.158470,0.1750,0.103203,0.312644,0.374046,0.366346,0.355513,0.264151,0.417790,0.219653,0.208955,0.251553,0.286486,0.594306,0.279773,0.278481,0.144330,0.374486,0.350402,0.713241,0.275862,0.287126,0.279912,0.423077,0.000000,0.267637,0.558824,0.468401,0.600192,0.284722,0.247525,0.500000,0.250909,0.393617,0.146040,0.258537,0.285714,0.356164,0.361564,0.255556,0.278481,0.637,0.794118,0,1.000000,1.000000,0.897036


In [None]:
#try with new predictors
new_predictor_list = predictor_list + ['Player_Season','OPS_Corr','OPS_Season','OPS_Diff']

In [None]:
#return predictions
predictions = backtest(stats, linear_regression, new_predictor_list)

Unnamed: 0,actual,prediction
4971,0.756,0.764046
3085,0.694,0.724491
5732,0.645,0.739116
791,0.868,0.875937
4333,0.718,0.713053


In [None]:
#check accuracy
mean_squared_error(predictions['actual'], predictions['prediction'])

0.007041035709579955

In [None]:
merged = predictions.merge(stats, left_index=True, right_index=True)

In [None]:
merged['diff'] = (predictions['actual'] - predictions['prediction']).abs()

In [None]:
merged

Unnamed: 0,actual,prediction,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS,Team_Num,Player_Season,OPS_Corr,OPS_Diff,OPS_Season,diff
4971,0.756,0.764046,2,2005,Garret Anderson,LAA,0.538462,142,0.746403,0.697232,0.573276,0.462264,0.576271,0.043478,0.232877,0.408451,0.581699,0.091304,0.066667,0.353488,0.000000,0.3125,0.000000,0.40625,0.012821,0.041667,0.606195,0.076503,0.2500,0.040925,0.308046,0.346565,0.347115,0.268061,0.490566,0.385445,0.211304,0.338308,0.244099,0.686486,0.288256,0.178954,0.183544,0.216495,0.378601,0.431225,0.366369,0.608253,0.356464,0.346872,0.538462,0.000000,0.276596,0.752101,0.420074,0.392138,0.256944,0.231436,0.653846,0.223636,0.531915,0.091584,0.346341,0.317460,0.356164,0.293160,0.500000,0.183544,0.756,0.058824,5,-0.411626,0.886978,0.920821,0.008046
3085,0.694,0.724491,10,2005,David Eckstein,STL,0.423077,158,0.845324,0.887543,0.668103,0.617925,0.440678,0.304348,0.109589,0.563380,0.352941,0.243478,0.000000,0.167442,0.382353,0.2500,0.333333,0.40625,0.141026,0.333333,0.654867,0.193989,0.0575,0.227758,0.434483,0.285496,0.361538,0.174905,0.486792,0.444744,0.289660,0.467662,0.304348,0.870270,0.839858,0.350977,0.354430,0.484536,0.427984,0.423193,0.324939,0.628038,0.370990,0.366630,0.326923,0.024096,0.311870,0.668067,0.405204,0.536913,0.180556,0.289604,0.557692,0.129091,0.563830,0.200495,0.102439,0.428571,0.287671,0.188925,0.488889,0.354430,0.694,0.117647,4,0.036531,1.301038,0.959082,0.030491
5732,0.645,0.739116,11,2005,Darin Erstad,LAA,0.461538,153,0.807554,0.807958,0.586207,0.518868,0.559322,0.130435,0.095890,0.535211,0.385621,0.195652,0.025000,0.469767,0.029412,0.1250,0.166667,0.25000,0.128205,0.125000,0.561947,0.163934,0.3100,0.069395,0.347126,0.248855,0.301923,0.169202,0.550943,0.361186,0.176622,0.348259,0.211180,0.794595,0.185053,0.301197,0.303797,0.505155,0.353909,0.397088,0.249391,0.651215,0.294362,0.282108,0.519231,0.000000,0.250280,0.728291,0.654275,0.547459,0.319444,0.237624,0.692308,0.292727,0.489362,0.193069,0.424390,0.357143,0.253425,0.185668,0.566667,0.303797,0.645,0.058824,5,-0.571292,0.862637,0.800936,0.094116
791,0.868,0.875937,15,2005,Troy Glaus,ARI,0.346154,149,0.679856,0.750865,0.469828,0.278302,0.491525,0.043478,0.506849,0.478873,0.588235,0.356522,0.016667,0.637209,0.205882,0.3125,0.000000,0.21875,0.051282,0.083333,0.495575,0.333333,0.4750,0.096085,0.434483,0.479389,0.483654,0.482890,0.407547,0.555256,0.421965,0.522388,0.400000,0.740541,0.669039,0.393195,0.398734,0.257732,0.510288,0.417671,0.277823,0.654607,0.400745,0.389682,0.557692,0.048193,0.412094,0.375350,0.565056,0.436242,0.263889,0.411510,0.519231,0.612727,0.425532,0.344059,0.609756,0.428571,0.493151,0.521173,0.411111,0.398734,0.868,0.323529,5,-0.358448,0.917883,1.283028,0.007937
4333,0.718,0.713053,19,2005,Adam Kennedy,LAA,0.384615,129,0.460432,0.449827,0.409483,0.410377,0.389831,0.000000,0.027397,0.274648,0.196078,0.117391,0.008333,0.260465,0.205882,0.1875,0.208333,0.15625,0.243590,0.166667,0.681416,0.144809,0.2500,0.072954,0.413793,0.247328,0.328846,0.114068,0.652830,0.404313,0.236994,0.273632,0.268944,0.448649,0.661922,0.394455,0.398734,0.463918,0.399177,0.385542,0.639318,0.366874,0.346096,0.335346,0.365385,0.024096,0.293953,0.596639,0.527881,0.700863,0.416667,0.274134,0.557692,0.225455,0.595745,0.168317,0.346341,0.428571,0.253425,0.123779,0.666667,0.398734,0.718,0.058824,5,0.354902,0.912000,0.872357,0.004947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,0.849,0.712862,27815,2023,Jordan Westburg,BAL,0.192308,68,0.086331,0.048443,0.103448,0.089623,0.288136,0.086957,0.041096,0.112676,0.104575,0.060870,0.000000,0.223256,0.029412,0.1875,0.000000,0.09375,0.051282,0.041667,0.504425,0.163934,0.5175,0.044484,0.314943,0.299237,0.320192,0.254753,0.607547,0.366577,0.232498,0.134328,0.270186,0.075676,0.619217,0.258349,0.259494,0.587629,0.395062,0.327811,0.910642,0.113058,0.300356,0.289243,0.307692,0.120482,0.258119,0.515406,0.501859,0.572387,0.284722,0.270421,0.403846,0.447273,0.521277,0.178218,0.434146,0.357143,0.328767,0.263844,0.611111,0.265823,0.849,0.794118,0,1.000000,1.000000,0.860267,0.136138
5512,0.677,0.727892,29622,2023,Sal Frelick,MIL,0.153846,57,0.055755,0.039792,0.073276,0.099057,0.152542,0.043478,0.041096,0.133803,0.111111,0.113043,0.000000,0.134884,0.029412,0.1875,0.000000,0.06250,0.089744,0.000000,0.442478,0.316940,0.3175,0.128114,0.383908,0.218321,0.298077,0.180608,0.418868,0.366577,0.233141,0.134328,0.261491,0.064865,0.555160,0.264650,0.265823,0.618557,0.374486,0.330321,0.914703,0.113624,0.275840,0.265642,0.519231,0.096386,0.245801,0.558824,0.509294,0.596357,0.347222,0.263614,0.384615,0.403636,0.446809,0.321782,0.278049,0.412698,0.232877,0.185668,0.422222,0.284810,0.677,0.588235,0,1.000000,1.000000,0.800849,0.050892
1737,0.614,0.772231,29766,2023,Zack Gelof,OAK,0.153846,69,0.197842,0.173010,0.181034,0.113208,0.338983,0.043478,0.191781,0.211268,0.163399,0.104348,0.008333,0.344186,0.088235,0.0625,0.000000,0.15625,0.179487,0.083333,0.535398,0.210383,0.5850,0.049822,0.374713,0.451908,0.440385,0.431559,0.588679,0.495957,0.305716,0.233831,0.348447,0.194595,0.629893,0.367990,0.367089,0.587629,0.543210,0.386044,0.829407,0.234596,0.341776,0.333150,0.307692,0.000000,0.334826,0.500000,0.550186,0.581975,0.388889,0.357054,0.461538,0.589091,0.553191,0.227723,0.492683,0.420635,0.493151,0.449511,0.600000,0.379747,0.614,0.352941,0,1.000000,1.000000,1.183190,0.158231
1663,0.797,0.761920,30116,2023,Seiya Suzuki,CHC,0.346154,138,0.638489,0.662630,0.504310,0.363208,0.525424,0.260870,0.273973,0.457746,0.437908,0.247826,0.025000,0.567442,0.058824,0.4375,0.000000,0.25000,0.076923,0.291667,0.615044,0.248634,0.4600,0.072954,0.420690,0.422901,0.442308,0.361217,0.626415,0.498652,0.368658,0.452736,0.393168,0.670270,0.366548,0.386894,0.386076,0.453608,0.514403,0.366466,0.458164,0.472018,0.355114,0.343030,0.365385,0.012048,0.328108,0.456583,0.394052,0.490892,0.083333,0.375619,0.576923,0.500000,0.617021,0.254950,0.404878,0.452381,0.458904,0.371336,0.633333,0.398734,0.797,0.617647,1,1.000000,1.185567,1.188357,0.035080


In [None]:
merged[['IDfg', 'Season','Team', 'Name','OPS', 'Next_OPS','prediction','diff']].sort_values(['diff'])

Unnamed: 0,IDfg,Season,Team,Name,OPS,Next_OPS,prediction,diff
3313,11615,2016,ARI,Brandon Drury,0.388462,0.764,0.763937,0.000063
5000,15518,2021,CLE,Amed Rosario,0.335577,0.715,0.715086,0.000086
2681,1736,2013,TOR,Jose Reyes,0.382692,0.726,0.726096,0.000096
7751,19293,2022,CIN,Nick Senzel,0.210577,0.696,0.695903,0.000097
5414,45,2010,- - -,Rod Barajas,0.335577,0.717,0.717126,0.000126
...,...,...,...,...,...,...,...,...
727,319,2010,WSN,Adam Dunn,0.490385,0.569,0.849726,0.280726
3881,8001,2012,- - -,Hanley Ramirez,0.362500,1.040,0.752072,0.287928
2106,5310,2010,CHC,Tyler Colvin,0.417308,0.509,0.809130,0.300130
5038,96,2007,ATL,Andruw Jones,0.328846,0.505,0.809228,0.304228


In [None]:
mean_squared_error(predictions['actual'], predictions['prediction'])

0.007041035709579955

In [None]:
stats['Next_OPS'].describe()

count    6644.000000
mean        0.755110
std         0.103076
min         0.398000
25%         0.686000
50%         0.748500
75%         0.816000
max         1.422000
Name: Next_OPS, dtype: float64

In [None]:
stats['Season'].sort_values()

960     2000
7618    2000
2715    2000
6143    2000
97      2000
        ... 
1669    2023
6772    2023
8032    2023
167     2023
7511    2023
Name: Season, Length: 6644, dtype: int64