# **Predicting the MVP with Machine Learning 🤖 Part[3 of 3]**

In this notebook, we dive into the exciting world of machine learning with a focus on predicting the MVP.We'll explore different models and employ feature engineering techniques to enhance our machine learning capabilities.Let's get started on our MVP prediction journey!

### Loading the Library and data

In [93]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [94]:
# loading the data
stats=pd.read_csv('player_mvp_stats.csv')
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15231,15231,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
15232,15232,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
15233,15233,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
15234,15234,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [95]:
del stats['Unnamed: 0']

In [96]:
#checking is there any null value
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          61
3P            0
3PA           0
3P%        2102
2P            0
2PA           0
2P%         105
eFG%         61
FT            0
FTA           0
FT%         545
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [97]:
# selecting the Player and 3PA
stats[pd.isnull(stats['3P%'])][['Player','3PA']]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
15205,Evan Eschmeyer,0.0
15206,Gheorghe Mureșan,0.0
15208,Jim McIlvaine,0.0
15214,Mark Hendrickson,0.0


In [98]:
stats[pd.isnull(stats['FT%'])][['Player','FTA','FT%']]

Unnamed: 0,Player,FTA,FT%
77,John Coker,0.0,
92,Jason Sasser,0.0,
103,Adrian Caldwell,0.0,
119,Bruno Šundov,0.0,
158,Jamal Robinson,0.0,
...,...,...,...
15115,Trevor Keels,0.0,
15123,Luke Zeller,0.0,
15176,Myron Brown,0.0,
15198,Malcolm Lee,0.0,


In [99]:
# fill the null values with 0
stats=stats.fillna(0)

In [100]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [101]:
#define a variable with some predictor column
predictor=['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P','3PA', '3P%', 
           '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB','DRB', 'TRB', 
           'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'year','W', 'L', 'W/L%', 
           'GB', 'PS/G','PA/G', 'SRS']

In [102]:
# define train and test variable with conditions
train=stats[stats['year'] < 2023]
test=stats[stats['year'] == 2023]

In [103]:
# inilizied the Ridge model
reg = Ridge(alpha=.1)

In [104]:
#fit the model with training data
reg.fit(train[predictor], train['Share'])

Ridge(alpha=0.1)

In [105]:
#predict the test data
predictions= reg.predict(test[predictor])
predictions

array([ 2.56554307e-03,  3.20375091e-02,  4.16564092e-02,  2.25129787e-01,
       -1.94867228e-03, -1.03897246e-02,  1.20253888e-03, -9.81377922e-03,
       -4.45897812e-03,  5.54195436e-02,  3.80662680e-02,  6.09088128e-03,
       -1.04248359e-02, -1.16177086e-02, -3.69851137e-03, -1.25079079e-02,
       -2.27429273e-03, -1.29307761e-02, -2.90396216e-03, -4.40946608e-02,
        5.78600689e-02, -1.80685370e-02,  1.17815203e-03,  3.95532147e-03,
       -1.95294841e-02, -4.82134381e-03, -2.41343637e-03, -2.78503547e-02,
       -1.70236031e-02,  8.48964361e-03,  8.20119823e-02,  1.95437091e-01,
       -5.22476758e-03, -3.16106431e-02, -1.02055986e-02, -4.01858201e-02,
       -1.73525870e-03, -6.74210774e-03, -6.78097413e-03, -1.86793817e-02,
       -2.42784641e-02, -5.64283763e-03, -1.14290220e-02,  2.71184158e-02,
       -1.10719700e-02,  3.18072372e-02,  4.29046162e-03, -9.22251286e-03,
        1.70234641e-03, -1.69926335e-02,  7.81007101e-03,  2.01511194e-03,
       -1.65586635e-03,  

In [106]:
# make a prediction into a dataframe
predictions=pd.DataFrame(predictions, columns=['predictor'], index=test.index)
predictions

Unnamed: 0,predictor
211,0.002566
212,0.032038
213,0.041656
214,0.225130
215,-0.001949
...,...
15111,-0.014597
15112,0.004125
15113,-0.015199
15114,0.018389


In [107]:
# combine the two dataframe into a single dataframe
combination = pd.concat([test[['Player', 'Share']], predictions], axis=1)
combination

Unnamed: 0,Player,Share,predictor
211,A.J. Green,0.000,0.002566
212,Bobby Portis,0.000,0.032038
213,Brook Lopez,0.000,0.041656
214,Giannis Antetokounmpo,0.606,0.225130
215,Goran Dragić,0.000,-0.001949
...,...,...,...
15111,Mitchell Robinson,0.000,-0.014597
15112,Obi Toppin,0.000,0.004125
15113,Quentin Grimes,0.000,-0.015199
15114,RJ Barrett,0.000,0.018389


In [108]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictor
14849,Joel Embiid,0.915,0.201829
736,Nikola Jokić,0.674,0.173426
214,Giannis Antetokounmpo,0.606,0.22513
2898,Jayson Tatum,0.28,0.137573
1322,Shai Gilgeous-Alexander,0.046,0.148548
13682,Donovan Mitchell,0.03,0.086236
4274,Domantas Sabonis,0.027,0.092882
306,Luka Dončić,0.01,0.195437
6686,Stephen Curry,0.005,0.106767
10869,Jimmy Butler,0.003,0.109476


In [109]:
# calculating the error
mse = mean_squared_error(combination['Share'], combination['predictor'])
mse

0.002662435040088955

In [110]:
combination['Share'].value_counts()

0.000    526
0.001      2
0.606      1
0.010      1
0.674      1
0.046      1
0.280      1
0.002      1
0.027      1
0.005      1
0.003      1
0.030      1
0.915      1
Name: Share, dtype: int64

In [111]:
combination = combination.sort_values('Share', ascending=False)
combination['RK'] = list(range(1, combination.shape[0] + 1))

In [112]:
combination.head(10)

Unnamed: 0,Player,Share,predictor,RK
14849,Joel Embiid,0.915,0.201829,1
736,Nikola Jokić,0.674,0.173426,2
214,Giannis Antetokounmpo,0.606,0.22513,3
2898,Jayson Tatum,0.28,0.137573,4
1322,Shai Gilgeous-Alexander,0.046,0.148548,5
13682,Donovan Mitchell,0.03,0.086236,6
4274,Domantas Sabonis,0.027,0.092882,7
306,Luka Dončić,0.01,0.195437,8
6686,Stephen Curry,0.005,0.106767,9
10869,Jimmy Butler,0.003,0.109476,10


In [113]:
combination = combination.sort_values('predictor', ascending=False)
combination['P_RK'] = list(range(1, combination.shape[0] + 1))

In [114]:
combination.head(10)

Unnamed: 0,Player,Share,predictor,RK,P_RK
214,Giannis Antetokounmpo,0.606,0.22513,3,1
14849,Joel Embiid,0.915,0.201829,1,2
306,Luka Dončić,0.01,0.195437,8,3
736,Nikola Jokić,0.674,0.173426,2,4
1322,Shai Gilgeous-Alexander,0.046,0.148548,5,5
11528,Kevin Durant,0.0,0.141083,50,6
8563,Anthony Davis,0.0,0.140287,139,7
2898,Jayson Tatum,0.28,0.137573,4,8
8083,Damian Lillard,0.0,0.136843,90,9
8570,LeBron James,0.0,0.13383,132,10


In [115]:
combination.sort_values('Share', ascending=False).head(10)

Unnamed: 0,Player,Share,predictor,RK,P_RK
14849,Joel Embiid,0.915,0.201829,1,2
736,Nikola Jokić,0.674,0.173426,2,4
214,Giannis Antetokounmpo,0.606,0.22513,3,1
2898,Jayson Tatum,0.28,0.137573,4,8
1322,Shai Gilgeous-Alexander,0.046,0.148548,5,5
13682,Donovan Mitchell,0.03,0.086236,6,23
4274,Domantas Sabonis,0.027,0.092882,7,18
306,Luka Dončić,0.01,0.195437,8,3
6686,Stephen Curry,0.005,0.106767,9,14
10869,Jimmy Butler,0.003,0.109476,10,13


In [120]:
def find_app(combination):
    actual = combination.sort_values('Share', ascending=False).head(5)
    predict = combination.sort_values('predictor', ascending=False)
    ps=[]
    found=0
    seen=1
    for index, row in predict.iterrows():
        if row['Player'] in actual['Player'].values:
            found += 1
            ps.append(found/seen)
        seen +=1
    return sum(ps)/len(ps)

In [117]:
find_app(combination)

0.835

In [127]:
years= list(range(1991, 2024))
aps=[]
all_predictor=[]
for year in years[5:]:
    train = stats[stats['year'] < year]
    test = stats[stats['year'] == year]
    reg.fit(train[predictor], train['Share'])
    prediction=reg.predict(test[predictor])
    prediction=pd.DataFrame(prediction, columns=['predictor'], index=test.index)
    combination = pd.concat([test[['Player','Share']], prediction], axis=1)
    all_predictor.append(combination)
    aps.append(find_app(combination))

In [128]:
sum(aps)/len(aps)

0.7195472452665953

In [135]:
def add_rank(combination):
    combination = combination.sort_values('Share', ascending=False)
    combination['RK'] = list(range(1, combination.shape[0] + 1))
    combination = combination.sort_values('predictor', ascending=False)
    combination['P_RK'] = list(range(1, combination.shape[0] + 1))
    combination['Diff'] = combination['RK'] - combination['P_RK']
    return combination

In [142]:
ranking = add_rank(all_predictor[1])
ranking[ranking['RK'] < 5].sort_values('Diff', ascending=False)

Unnamed: 0,Player,Share,predictor,RK,P_RK,Diff
1817,Karl Malone,0.857,0.192318,1,2,-1
11360,Michael Jordan,0.832,0.167629,2,3,-1
1043,Grant Hill,0.327,0.128646,3,6,-3
5151,Tim Hardaway,0.207,0.059984,4,20,-16


In [146]:
def backtest(stats, model, years, predictors):
    aps=[]
    all_predictor=[]
    for year in years[5:]:
        train = stats[stats['year'] < year]
        test = stats[stats['year'] == year]
        model.fit(train[predictor], train['Share'])
        prediction=model.predict(test[predictor])
        prediction=pd.DataFrame(prediction, columns=['predictor'], index=test.index)
        combination = pd.concat([test[['Player','Share']], prediction], axis=1)
        combination = add_rank(combination)
        all_predictor.append(combination)
        aps.append(find_app(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictor)

In [147]:
mean_ap, aps, all_predictor = backtest(stats, reg, years[5:], predictor)

In [148]:
mean_ap

0.7499320273007849

In [150]:
all_predictor[all_predictor['RK'] <= 5].sort_values('Diff').head(10)

Unnamed: 0,Player,Share,predictor,RK,P_RK,Diff
1441,Jason Kidd,0.712,0.02821,2,52,-50
5674,Steve Nash,0.839,0.0341,1,45,-44
9277,Peja Stojaković,0.228,0.03627,4,38,-34
5692,Steve Nash,0.739,0.054129,1,34,-33
13800,Joakim Noah,0.258,0.046968,4,37,-33
4010,Chauncey Billups,0.344,0.052696,5,35,-30
1606,Chris Paul,0.138,0.072293,5,33,-28
5707,Steve Nash,0.785,0.074421,2,21,-19
980,Devin Booker,0.216,0.091309,4,17,-13
12426,Tony Parker,0.274,0.076004,5,16,-11


In [152]:
reg.coef_

array([ 2.62774559e-04,  8.67586841e-05,  1.03607340e-06, -3.97228529e-03,
        1.96829583e-03,  5.04466118e-03, -1.71971062e-01,  6.17681312e-04,
       -9.42905410e-03, -9.33310724e-03,  1.73904594e-02, -1.65005239e-02,
        8.65097658e-03,  1.04299616e-01, -5.72985513e-03,  1.02302505e-02,
       -5.35254100e-03,  2.03943758e-02,  3.34217277e-02, -2.60667667e-02,
        6.95264739e-03,  1.20379266e-02,  1.05705716e-02, -9.14224064e-03,
       -2.56249969e-03,  6.77873406e-03, -1.92183192e-04,  1.20395284e-04,
       -2.78642857e-04,  2.81237806e-02,  2.44468655e-04, -7.24524003e-04,
       -3.46097598e-05, -4.42577797e-04])

In [153]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictor)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.1043,eFG%
18,0.033422,DRB
29,0.028124,W/L%
17,0.020394,ORB
10,0.01739,2P
21,0.012038,STL
22,0.010571,BLK
15,0.01023,FTA
12,0.008651,2P%
20,0.006953,AST


In [155]:
stats_ratio = stats[['PTS', 'AST', 'STL', 'BLK', '3P', 'year']].groupby('year').apply(lambda x: x/x.mean())

In [156]:
stats_ratio

Unnamed: 0,PTS,AST,STL,BLK,3P,year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
15231,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
15232,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
15233,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
15234,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [157]:
stats[['PTS_T', 'AST_R', 'STL_R', 'BLK_R', '3P_R']] = stats_ratio[['PTS', 'AST', 'STL', 'BLK', '3P']]
stats.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576
5,Larry Drew,PG,32,LAL,48,2,10.3,1.1,2.6,0.432,...,0.707,5.0,106.3,99.6,6.73,0.322931,1.16865,0.411912,0.0,1.52576
6,Magic Johnson,PG,31,LAL,79,79,37.1,5.9,12.4,0.477,...,0.707,5.0,106.3,99.6,6.73,2.160294,5.843249,1.78495,0.44898,5.085865
7,Mychal Thompson,C,36,LAL,72,4,15.0,1.6,3.2,0.496,...,0.707,5.0,106.3,99.6,6.73,0.445421,0.140238,0.411912,0.673469,0.0
8,Sam Perkins,PF,29,LAL,73,66,34.3,5.0,10.2,0.495,...,0.707,5.0,106.3,99.6,6.73,1.503297,0.70119,1.235735,2.469388,1.017173
9,Terry Teagle,SG,30,LAL,82,0,18.3,4.1,9.2,0.443,...,0.707,5.0,106.3,99.6,6.73,1.102418,0.46746,0.549215,0.22449,0.0


In [158]:
predictor += ['PTS_T', 'AST_R', 'STL_R', 'BLK_R', '3P_R']

In [159]:
mean_ap, aps, all_predictor = backtest(stats, reg, years[5:], predictor)

In [160]:
mean_ap

0.7551878397318482

In [162]:
stats['NPos'] = stats['Pos'].astype('category').cat.codes
stats['NTm'] = stats['Tm'].astype('category').cat.codes

In [169]:
from sklearn.ensemble import RandomForestRegressor

rf= RandomForestRegressor(n_estimators=100, random_state=42, min_samples_split=4)
mean_ap, aps, all_predictor = backtest(stats, rf, years[26:], predictor)

In [170]:
mean_ap

0.8335714285714286

In [171]:
mean_ap, aps, all_predictor = backtest(stats, reg, years[26:], predictor)

In [172]:
mean_ap

0.8182234432234432