In [246]:
#import necessary libraries
import pandas as pd
import numpy as np
from pybaseball import batting_stats
import os

In [247]:
#load data from pybaseball (takes a long time to lad so lets toss the data to csv)
if os.path.exists('batting-stats-2000-2023.csv'):
    stats = pd.read_csv('batting-stats-2000-2023.csv', index_col=0)
else:
    stats = batting_stats(start_season=2000, end_season=2023, qual=200)
    stats.to_csv('batting-stats-2000-2023.csv')

In [248]:
#only keep players with more than one season
stats = stats.groupby('IDfg', group_keys=False).filter(lambda x: len(x) > 1)

In [249]:
stats.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,GB,FB,LD,IFFB,Pitches,Balls,Strikes,IFH,BU,BUH,BB%,K%,BB/K,OBP,SLG,...,O-Swing% (pi),Z-Swing% (pi),Swing% (pi),O-Contact% (pi),Z-Contact% (pi),Contact% (pi),Zone% (pi),Pace (pi),FRM,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,LD+%,GB%+,FB%+,HR/FB%+,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,31,2,46,117,110,198,68,47,9,2,0,4,9,2,0.37,109.0,169.0,80.0,16.0,2398.0,1336.0,1062.0,5.0,0.0,0.0,0.324,0.077,4.21,0.582,0.799,...,,,,,,,,,,139,347,47,171,189,273,111,1.05,70.0,135.0,249.0,105.0,106.0,87.0,64.0,83.0,171.0,,,,,,,,0,0.127,0.191,,,,12.7
2,1109,2001,Barry Bonds,SFG,36,153,476,664,156,49,32,2,73,129,137,177,35,93,9,2,0,5,13,3,0.328,,,,,,,,,,,0.267,0.14,1.9,0.515,0.863,...,,,,,,,,,,122,298,82,152,196,313,89,,,,,,,,,,,,,,,,,,0,,,,,,12.5
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,27,3,45,129,101,232,120,41,9,3,0,5,6,1,0.362,117.0,156.0,62.0,22.0,2425.0,1444.0,981.0,8.0,0.0,0.0,0.376,0.066,5.66,0.609,0.812,...,,,,,,,,,,134,416,41,179,186,269,104,0.9,80.0,130.0,249.0,112.0,96.0,86.0,65.0,76.0,171.0,,,,,,,,0,0.124,0.164,,,,11.9
17,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,28,0,62,133,131,111,19,175,6,5,0,14,16,3,0.311,149.0,174.0,77.0,8.0,2906.0,1176.0,1730.0,11.0,0.0,0.0,0.159,0.251,0.63,0.425,0.686,...,0.224,0.654,0.427,0.452,0.826,0.722,0.471,24.5,,128,200,113,137,175,251,117,0.97,87.0,116.0,324.0,118.0,91.0,84.0,46.0,83.0,159.0,95.8,14.9,106.0,0.262,118.4,246.0,0.609,404,0.169,0.287,,,,11.6
3,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,22,1,45,111,90,148,61,58,10,2,0,7,7,0,0.341,101.0,152.0,81.0,21.0,2175.0,1141.0,1034.0,7.0,0.0,0.0,0.269,0.105,2.55,0.529,0.749,...,,,,,,,,,,127,298,65,155,174,251,102,1.09,69.0,134.0,256.0,109.0,116.0,71.0,93.0,78.0,161.0,,,,,,,,0,0.135,0.223,,,,10.2


In [250]:
#create a column that shows a player's ops from the future season
def next_ops(player):
    player = player.sort_values('Season')
    player['Next_OPS'] = player['OPS'].shift(-1)
    return player

stats = stats.groupby('IDfg', group_keys=False).apply(next_ops)

  stats = stats.groupby('IDfg', group_keys=False).apply(next_ops)


In [251]:
stats[['Name','Season','OPS','Next_OPS']].head()

Unnamed: 0,Name,Season,OPS,Next_OPS
6411,Alfredo Amezaga,2006,0.664,0.682
5776,Alfredo Amezaga,2007,0.682,0.679
6054,Alfredo Amezaga,2008,0.679,
2490,Garret Anderson,2000,0.827,0.792
3279,Garret Anderson,2001,0.792,0.871


In [252]:
#drop all columns with null values except our next ops column
temp = stats['Next_OPS']
stats.dropna(axis='columns', inplace=True)
stats['Next_OPS'] = temp

#also drop 'Events' because it seems that is not recorded for most players but they are given 0 instead of null
stats.drop(columns='Events', inplace=True)
stats.head()


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Age Rng,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS
6411,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,9,3,3,42,19,33,4,46,3,1,7,5,20,12,0.26,0.087,0.122,0.72,0.332,0.332,0.664,0.072,0.294,0.296,-11.5,36,-12.4,11.9,1.2,11.9,1.1,5.9,75,-1.48,-6.93,5.45,-19.65,-1.96,0.98,26,-1.14,-0.37,2.8,8.8,-1.4,28 - 28,-9.6,0.8,0.217,96,98,74,97,75,42,97,1.1,0.682
5776,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,14,9,2,46,30,35,0,52,4,5,4,4,13,7,0.263,0.078,0.116,0.67,0.324,0.358,0.682,0.095,0.293,0.305,-9.9,45,-12.4,13.9,1.8,20.5,2.0,6.4,79,-2.52,-8.64,6.13,-18.88,-1.82,1.04,26,-0.76,-1.68,0.9,17.2,-0.8,29 - 29,-11.5,0.9,0.199,96,88,71,95,82,58,96,2.0,0.679
6054,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,13,5,3,41,32,19,1,47,3,0,4,6,8,2,0.264,0.056,0.139,0.4,0.312,0.367,0.679,0.103,0.303,0.301,-7.5,33,-9.6,10.3,1.6,11.7,1.2,6.8,77,-0.59,-6.14,5.55,-8.48,-0.82,1.05,25,-0.96,0.39,2.4,8.0,0.5,30 - 30,-7.1,0.6,0.205,99,62,81,92,86,65,100,1.2,
2490,2,2000,Garret Anderson,ANA,28,159,647,681,185,107,40,3,35,92,117,24,5,87,0,9,1,21,7,6,0.286,0.035,0.128,0.28,0.307,0.519,0.827,0.233,0.281,0.345,2.3,92,-0.8,22.0,0.4,23.8,2.2,3.8,99,-2.17,-14.93,12.76,-9.29,-0.82,1.11,1,-0.32,-1.63,-1.3,1.4,-1.3,28 - 28,-2.1,2.5,0.214,104,37,82,88,117,139,93,2.2,0.792
3279,2,2001,Garret Anderson,ANA,29,161,672,704,194,125,39,2,28,83,123,27,4,100,0,5,0,12,13,6,0.289,0.038,0.142,0.27,0.314,0.478,0.792,0.189,0.302,0.334,4.3,92,3.4,22.0,-7.6,27.2,2.7,4.0,104,1.55,-13.88,15.43,13.57,1.4,1.08,0,1.2,0.23,0.0,-0.6,0.0,29 - 29,3.4,2.5,0.22,108,46,86,94,111,117,102,2.7,0.871


In [253]:
#lets modify some string types to help with ml
stats.dtypes[stats.dtypes == 'object']

Name       object
Team       object
Age Rng    object
dtype: object

In [254]:
#no need for age range
stats.drop(columns='Age Rng', inplace=True)

In [255]:
#convert team name to team number
numbers, teams = stats['Team'].factorize()
stats['Team_Num'] = numbers
stats

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS,Team_Num
6411,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,9,3,3,42,19,33,4,46,3,1,7,5,20,12,0.260,0.087,0.122,0.72,0.332,0.332,0.664,0.072,0.294,0.296,-11.5,36,-12.4,11.9,1.2,11.9,1.1,5.9,75,-1.48,-6.93,5.45,-19.65,-1.96,0.98,26,-1.14,-0.37,2.8,8.8,-1.4,-9.6,0.8,0.217,96,98,74,97,75,42,97,1.1,0.682,0
5776,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,14,9,2,46,30,35,0,52,4,5,4,4,13,7,0.263,0.078,0.116,0.67,0.324,0.358,0.682,0.095,0.293,0.305,-9.9,45,-12.4,13.9,1.8,20.5,2.0,6.4,79,-2.52,-8.64,6.13,-18.88,-1.82,1.04,26,-0.76,-1.68,0.9,17.2,-0.8,-11.5,0.9,0.199,96,88,71,95,82,58,96,2.0,0.679,0
6054,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,13,5,3,41,32,19,1,47,3,0,4,6,8,2,0.264,0.056,0.139,0.40,0.312,0.367,0.679,0.103,0.303,0.301,-7.5,33,-9.6,10.3,1.6,11.7,1.2,6.8,77,-0.59,-6.14,5.55,-8.48,-0.82,1.05,25,-0.96,0.39,2.4,8.0,0.5,-7.1,0.6,0.205,99,62,81,92,86,65,100,1.2,,0
2490,2,2000,Garret Anderson,ANA,28,159,647,681,185,107,40,3,35,92,117,24,5,87,0,9,1,21,7,6,0.286,0.035,0.128,0.28,0.307,0.519,0.827,0.233,0.281,0.345,2.3,92,-0.8,22.0,0.4,23.8,2.2,3.8,99,-2.17,-14.93,12.76,-9.29,-0.82,1.11,1,-0.32,-1.63,-1.3,1.4,-1.3,-2.1,2.5,0.214,104,37,82,88,117,139,93,2.2,0.792,1
3279,2,2001,Garret Anderson,ANA,29,161,672,704,194,125,39,2,28,83,123,27,4,100,0,5,0,12,13,6,0.289,0.038,0.142,0.27,0.314,0.478,0.792,0.189,0.302,0.334,4.3,92,3.4,22.0,-7.6,27.2,2.7,4.0,104,1.55,-13.88,15.43,13.57,1.40,1.08,0,1.20,0.23,0.0,-0.6,0.0,3.4,2.5,0.220,108,46,86,94,111,117,102,2.7,0.871,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3072,1009818,2001,Paul O'Neill,NYY,38,137,510,563,136,81,33,1,21,77,70,48,4,59,2,3,0,20,22,3,0.267,0.085,0.105,0.81,0.330,0.459,0.789,0.192,0.266,0.337,4.7,75,5.1,17.6,-6.3,6.4,0.6,5.5,107,0.99,-10.52,11.51,-0.08,-0.02,1.04,4,0.47,0.49,3.1,-21.3,3.1,8.2,2.0,0.227,100,103,64,99,107,119,89,0.6,,17
3816,1010978,2000,Cal Ripken,BAL,39,83,309,339,79,48,16,0,15,43,56,23,0,37,3,4,0,10,0,0,0.256,0.068,0.109,0.62,0.310,0.453,0.763,0.197,0.245,0.328,-3.8,41,-3.7,11.0,0.0,13.5,1.3,1.5,92,0.12,-6.27,6.39,-3.07,-0.26,0.98,0,-0.04,0.16,0.0,5.0,0.0,-3.6,1.2,0.221,93,71,70,89,102,117,81,1.3,0.637,27
7413,1010978,2001,Cal Ripken,BAL,40,128,477,516,114,84,16,0,14,43,68,26,1,63,2,9,2,15,0,2,0.239,0.050,0.122,0.41,0.276,0.361,0.637,0.122,0.244,0.275,-22.7,41,-21.4,16.1,0.2,-5.3,-0.5,1.3,67,-2.03,-9.86,7.82,-17.45,-1.78,1.02,3,-2.18,0.19,-0.9,-0.8,-0.9,-22.3,1.8,0.200,89,61,74,83,84,75,82,-0.5,,27
940,1013404,2000,Quilvio Veras,ATL,29,84,298,364,92,72,15,0,5,56,37,51,0,50,5,4,6,8,25,12,0.309,0.140,0.137,1.02,0.413,0.409,0.823,0.101,0.352,0.374,10.5,58,9.2,11.8,1.2,28.2,2.7,4.9,119,0.91,-6.49,7.41,13.17,1.36,1.04,2,1.44,-0.56,-0.5,7.2,-0.5,8.7,0.5,0.291,113,140,85,118,92,58,117,2.7,0.686,3


In [256]:
#drop rows with null next_ops values (keep copy for later use)
complete_stats = stats.copy()
stats.dropna(inplace=True)

In [257]:
#import machine learning libraries / functions
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

#higher alpha reduces overfitting, lower is more similar to linear regression
ridge_regression = Ridge(alpha=2)

#split time in 3 in chronological way
split = TimeSeriesSplit(n_splits=3)

#go through all features and find the 20 'best' features one by one
sfs = SequentialFeatureSelector(ridge_regression, n_features_to_select=20, direction='forward',n_jobs=3)


In [258]:
#remove predicted value column, string columns, and general columns we do not want in the sfs
#also dropping games column because that is mostly injury luck
non_sfs_columns = ['Next_OPS','Name','Team','IDfg','Season', 'G']
sfs_columns = stats.columns.drop(non_sfs_columns)


In [259]:
#scale values such that they are between 0 and 1, no negatives
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])

  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stats.loc[:, sfs_columns] = scaler.fit_transform(stats[sfs_columns])
  stat

In [260]:
stats.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,2B,3B,HR,R,RBI,BB,IBB,SO,HBP,SF,SH,GDP,SB,CS,AVG,BB%,K%,BB/K,OBP,SLG,OPS,ISO,BABIP,wOBA,wRAA,wRC,Bat,Rep,Pos,RAR,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,WPA/LI,Clutch,BsR,Def,wSB,Off,Lg,TTO%,AVG+,BB%+,K%+,OBP+,SLG+,ISO+,BABIP+,L-WAR,Next_OPS,Team_Num
6411,1,2006,Alfredo Amezaga,FLA,0.346154,132,0.31295,0.307958,0.24569,0.278302,0.152542,0.130435,0.041096,0.225352,0.078431,0.134783,0.033333,0.176744,0.096774,0.0625,0.291667,0.15625,0.25641,0.5,0.504425,0.210383,0.2075,0.120996,0.363218,0.189313,0.271154,0.117871,0.474104,0.331536,0.170199,0.18408,0.197516,0.335135,0.637011,0.269691,0.265823,0.597938,0.304527,0.27008,0.659626,0.222725,0.197592,0.186608,0.423077,0.313253,0.206047,0.484594,0.572491,0.636625,0.1875,0.210396,0.423077,0.243636,0.425532,0.212871,0.278049,0.349206,0.171233,0.117264,0.448276,0.265823,0.682,0.0
5776,1,2007,Alfredo Amezaga,FLA,0.384615,133,0.431655,0.429066,0.323276,0.316038,0.237288,0.391304,0.027397,0.253521,0.150327,0.143478,0.0,0.204651,0.129032,0.3125,0.166667,0.125,0.166667,0.291667,0.517699,0.185792,0.1925,0.1121,0.344828,0.229008,0.288462,0.161597,0.47012,0.355795,0.180475,0.228856,0.197516,0.443243,0.658363,0.323882,0.322785,0.649485,0.320988,0.217871,0.520715,0.261164,0.20175,0.194292,0.538462,0.313253,0.227324,0.30112,0.501859,0.717162,0.229167,0.198639,0.442308,0.210909,0.425532,0.188119,0.263415,0.333333,0.219178,0.169381,0.436782,0.322785,0.679,0.0
2490,2,2000,Garret Anderson,ANA,0.346154,159,0.875899,0.83218,0.668103,0.443396,0.677966,0.130435,0.479452,0.577465,0.718954,0.095652,0.041667,0.367442,0.0,0.5625,0.041667,0.65625,0.089744,0.25,0.619469,0.068306,0.2225,0.042705,0.305747,0.474809,0.427885,0.423954,0.422311,0.463612,0.258831,0.462687,0.269565,0.881081,0.608541,0.344675,0.335443,0.381443,0.403292,0.235442,0.009748,0.635953,0.253537,0.249177,0.673077,0.012048,0.25196,0.308123,0.420074,0.565676,0.194444,0.256807,0.75,0.238182,0.510638,0.061881,0.317073,0.277778,0.458904,0.433225,0.402299,0.335443,0.792,0.029412
3279,2,2001,Garret Anderson,ANA,0.384615,161,0.920863,0.871972,0.706897,0.528302,0.661017,0.086957,0.383562,0.514085,0.75817,0.108696,0.033333,0.427907,0.0,0.3125,0.0,0.375,0.166667,0.25,0.632743,0.076503,0.2575,0.040925,0.321839,0.412214,0.394231,0.340304,0.505976,0.433962,0.271676,0.462687,0.295652,0.881081,0.323843,0.3661,0.367089,0.402062,0.423868,0.422189,0.095045,0.786885,0.376985,0.371021,0.615385,0.0,0.337066,0.568627,0.468401,0.5465,0.284722,0.290842,0.75,0.249091,0.553191,0.084158,0.336585,0.325397,0.417808,0.361564,0.505747,0.367089,0.871,0.029412
1366,2,2002,Garret Anderson,ANA,0.423077,158,0.859712,0.82699,0.711207,0.443396,0.949153,0.130435,0.39726,0.584507,0.75817,0.121739,0.091667,0.334884,0.0,0.625,0.0,0.34375,0.076923,0.166667,0.707965,0.092896,0.1975,0.060498,0.363218,0.505344,0.470192,0.425856,0.52988,0.514825,0.379576,0.512438,0.396273,0.810811,0.345196,0.429742,0.43038,0.381443,0.502058,0.436747,0.272136,0.680045,0.452263,0.443469,0.5,0.012048,0.386898,0.497199,0.572491,0.474593,0.25,0.408416,0.711538,0.221818,0.638298,0.101485,0.273171,0.373016,0.527397,0.452769,0.54023,0.43038,0.885,0.029412


In [261]:
sfs.fit(stats[sfs_columns], stats['Next_OPS'])

In [262]:
predictor_list = list(sfs_columns[sfs.get_support()])
predictor_list

['Age',
 'AB',
 'R',
 'IBB',
 'SO',
 'SH',
 'GDP',
 'BB%',
 'BB/K',
 'ISO',
 'wOBA',
 'wRAA',
 'wRC',
 'Pos',
 'wRC+',
 'PH',
 'Def',
 'TTO%',
 'K%+',
 'BABIP+']

In [263]:
def backtest(data, model, predictors, start=5, step=1):
    predictions = []
    years = sorted(data['Season'].unique())
    
    for i in range(start, len(years), step):
        curr_year = years[i]
        
        train = data[data['Season'] < curr_year]
        test = data[data['Season'] == curr_year]
        
        model.fit(train[predictors], train['Next_OPS'])
        
        pred = model.predict(test[predictors])
        pred = pd.Series(pred, index=test.index)
        
        combined = pd.concat([test['Next_OPS'], pred], axis=1)
        combined.columns = ['actual', 'prediction']
        
        predictions.append(combined)
        
    return pd.concat(predictions)

In [264]:
predictions = backtest(stats, ridge_regression, predictor_list)
predictions

Unnamed: 0,actual,prediction
4852,0.756,0.747847
3013,0.694,0.728868
5591,0.645,0.730235
776,0.868,0.852522
4233,0.718,0.685550
...,...,...
7805,0.633,0.695009
7535,0.758,0.738072
5083,0.749,0.703248
1299,0.762,0.769859


In [265]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions['actual'], predictions['prediction'])

0.00712952780690804

In [266]:
stats['Next_OPS'].describe()

count    6441.000000
mean        0.755747
std         0.103019
min         0.398000
25%         0.687000
50%         0.749000
75%         0.817000
max         1.422000
Name: Next_OPS, dtype: float64

In [267]:
def player_history(df):
    df = df.sort_values('Season')
    
    df['Player_Season'] = range(0,len(df))
    df['OPS_Corr'] = list(df[['Player_Season','OPS']].expanding().corr().loc[(slice(None),'Player_Season'), 'OPS'])
    df.fillna({'OPS_Corr': 1}, inplace=True)
    
    df['OPS_Diff'] = df['OPS'] / df['OPS'].shift(1)
    df.fillna({'OPS_Diff': 1}, inplace=True)
    
    df.loc[df['OPS_Diff'] == np.inf, 'OPS_Diff'] = 1
    
    return df

stats = stats.groupby('IDfg', group_keys=False).apply(player_history)

  stats = stats.groupby('IDfg', group_keys=False).apply(player_history)


In [268]:
def group_averages(df):
    return df['OPS'] / df['OPS'].mean()

In [269]:
stats['OPS_Season'] = stats.groupby('Season', group_keys=False).apply(group_averages)

  stats['OPS_Season'] = stats.groupby('Season', group_keys=False).apply(group_averages)


In [270]:
new_predictor_list = predictor_list + ['Player_Season','OPS_Corr','OPS_Season','OPS_Diff']

In [271]:
predictions = backtest(stats, ridge_regression, new_predictor_list)

In [272]:
mean_squared_error(predictions['actual'], predictions['prediction'])

0.0069771620958287

In [273]:
pd.Series(ridge_regression.coef_, index=new_predictor_list).sort_values()

SO              -0.102285
Age             -0.102024
wRC+            -0.090541
SH              -0.076352
TTO%            -0.074780
AB              -0.046808
OPS_Diff        -0.046151
Pos             -0.040342
BABIP+          -0.031326
Def             -0.015242
OPS_Corr        -0.007629
PH              -0.007434
Player_Season    0.000550
BB/K             0.018882
GDP              0.035914
OPS_Season       0.036129
wRAA             0.079948
R                0.084883
BB%              0.102179
wOBA             0.103470
K%+              0.112557
IBB              0.128155
wRC              0.142785
ISO              0.153475
dtype: float64

In [274]:
merged = predictions.merge(stats, left_index=True, right_index=True)

In [275]:
merged['diff'] = (predictions['actual'] - predictions['prediction']).abs()

In [276]:
merged[['IDfg', 'Season', 'Name','OPS', 'Next_OPS','diff']].sort_values(['diff'])

Unnamed: 0,IDfg,Season,Name,OPS,Next_OPS,diff
2516,10816,2017,Jedd Gyorko,0.414423,0.762,0.000022
2776,6885,2013,Ian Desmond,0.386538,0.743,0.000039
2943,3892,2011,Josh Reddick,0.386538,0.768,0.000053
2113,6368,2013,Adam Jones,0.412500,0.780,0.000058
5023,21523,2022,Jonathan India,0.310577,0.746,0.000100
...,...,...,...,...,...,...
3688,344,2007,Wily Mo Pena,0.361538,0.509,0.286458
713,319,2010,Adam Dunn,0.490385,0.569,0.295768
4916,96,2007,Andruw Jones,0.328846,0.505,0.300537
2058,5310,2010,Tyler Colvin,0.417308,0.509,0.304560


In [277]:
mean_squared_error(predictions['actual'], predictions['prediction'])

0.0069771620958287

In [281]:
stats['Next_OPS'].describe()

count    6441.000000
mean        0.371104
std         0.097547
min         0.000000
25%         0.305769
50%         0.365385
75%         0.428846
max         1.000000
Name: OPS, dtype: float64