In [60]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [61]:
# Load the combined table of per-player season stats, award-voting columns
# (Pts Won / Pts Max / Share) and team record columns (W, L, W/L%, ...).
# The path is relative, so the CSV must sit next to the notebook.
stats = pd.read_csv("player_mvp_stats.csv")

In [62]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,15231,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14693,15232,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14694,15233,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14695,15234,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [63]:
# Drop the leftover index column that was written into the CSV.
# `drop(..., errors="ignore")` keeps this cell idempotent: re-running it after
# the column is already gone is a no-op instead of a KeyError (which `del` raises).
stats = stats.drop(columns=["Unnamed: 0"], errors="ignore")

In [64]:
# Count missing values per column to see which features need cleaning.
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [65]:
# Show the rows whose 3P% is missing next to their 3-point attempts, to check
# whether the nulls come from players who never attempted a three.
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14666,Evan Eschmeyer,0.0
14667,Gheorghe Mureșan,0.0
14669,Jim McIlvaine,0.0
14675,Mark Hendrickson,0.0


In [66]:
# Replace every missing value with 0. The 3P% rows inspected above all have
# 0.0 attempts; presumably the other null percentage columns (FG%, 2P%, eFG%,
# FT%) are the same zero-attempt case — TODO confirm before relying on it.
stats = stats.fillna(0)

In [67]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [68]:
# Feature columns fed to the model: every column of `stats` except the text
# columns (Player, Pos, Tm, Team) and the award-voting columns (Pts Won,
# Pts Max, Share). Share is the regression target, and Pts Won / Pts Max are
# left out as well (presumably because they would leak the target — confirm).
predictors = ['Age','G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year','W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [69]:
# Training set: every season strictly before 2022.
train = stats[stats["Year"] < 2022]

In [70]:
# Hold-out set: the 2022 season only.
test = stats[stats["Year"] == 2022]

In [71]:
# Ridge regression (linear model with an L2 penalty). alpha is the penalty
# strength; .1 is a fixed choice here and is not tuned anywhere in this notebook.
reg = Ridge(alpha=.1)

In [72]:
# Fit on the pre-2022 seasons with the `Share` column as the target.
reg.fit(train[predictors], train["Share"])

In [73]:
# Predict `Share` for every 2022 row; the result is a plain array in test-row order.
predictions = reg.predict(test[predictors])

In [74]:
# Wrap the array in a DataFrame that reuses test.index, so the predictions can
# be aligned with the test rows by index in the concat below.
predictions = pd.DataFrame(predictions, columns = ["predictions"], index=test.index)

In [75]:
predictions

Unnamed: 0,predictions
648,0.012934
649,-0.028142
650,-0.006163
651,0.016564
652,-0.004820
...,...
12508,-0.019380
12509,-0.010196
12510,0.003810
12511,0.001162


In [76]:
# Put actual Share and predicted share side by side (column-wise concat,
# aligned on the shared index).
combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)

In [77]:
combination

Unnamed: 0,Player,Share,predictions
648,Aaron Gordon,0.0,0.012934
649,Austin Rivers,0.0,-0.028142
650,Bol Bol,0.0,-0.006163
651,Bones Hyland,0.0,0.016564
652,Bryn Forbes,0.0,-0.004820
...,...,...,...
12508,Micah Potter,0.0,-0.019380
12509,Rodney McGruder,0.0,-0.010196
12510,Saben Lee,0.0,0.003810
12511,Saddiq Bey,0.0,0.001162


In [78]:
# Ten highest actual Share values in 2022, with the model's prediction for each.
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
663,Nikola Jokić,0.875,0.190365
837,Joel Embiid,0.706,0.190462
11678,Giannis Antetokounmpo,0.595,0.21941
907,Devin Booker,0.216,0.091309
11469,Luka Dončić,0.146,0.157395
1179,Jayson Tatum,0.043,0.095902
12226,Ja Morant,0.01,0.120508
6398,Stephen Curry,0.004,0.093138
905,Chris Paul,0.002,0.078329
8241,LeBron James,0.001,0.157828


In [79]:
# Mean squared error over all 2022 rows. Nearly every row has Share == 0 (see
# the value counts in the next cell), so this number is dominated by players
# who received no votes and says little about how well the top of the ranking
# is predicted.
mean_squared_error(combination["Share"], combination["predictions"])

0.0022402416025650695

In [80]:
# Distribution of the target in the test season: how many rows have each Share value.
combination["Share"].value_counts()

0.000    593
0.001      3
0.875      1
0.706      1
0.002      1
0.216      1
0.043      1
0.004      1
0.146      1
0.595      1
0.010      1
Name: Share, dtype: int64

In [81]:
# Rank rows by actual Share: sort descending, then number the rows 1..n.
# Tied rows (e.g. the many Share == 0 rows) still get distinct consecutive
# ranks, in whatever order the sort leaves them.
combination = combination.sort_values("Share", ascending=False)
combination["Rank"] = list(range(1, combination.shape[0]+1))

In [82]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rank
663,Nikola Jokić,0.875,0.190365,1
837,Joel Embiid,0.706,0.190462,2
11678,Giannis Antetokounmpo,0.595,0.21941,3
907,Devin Booker,0.216,0.091309,4
11469,Luka Dončić,0.146,0.157395,5
1179,Jayson Tatum,0.043,0.095902,6
12226,Ja Morant,0.01,0.120508,7
6398,Stephen Curry,0.004,0.093138,8
905,Chris Paul,0.002,0.078329,9
8241,LeBron James,0.001,0.157828,10


In [83]:
# Rank rows by predicted share the same way: sort descending, number 1..n.
# After this cell `combination` is ordered by prediction, not by actual Share.
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rank"] = list(range(1, combination.shape[0]+1))

In [84]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank
11678,Giannis Antetokounmpo,0.595,0.21941,3,1
837,Joel Embiid,0.706,0.190462,2,2
663,Nikola Jokić,0.875,0.190365,1,3
8241,LeBron James,0.001,0.157828,10,4
11469,Luka Dončić,0.146,0.157395,5,5
6185,Kevin Durant,0.001,0.140627,12,6
12226,Ja Morant,0.01,0.120508,7,7
11820,Trae Young,0.0,0.109246,289,8
8231,Anthony Davis,0.0,0.107306,112,9
836,James Harden,0.0,0.103584,393,10


In [85]:
# Back to actual-Share order (display only; `combination` is not reassigned)
# to compare Rank against Predicted_Rank for the top vote-getters.
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank
663,Nikola Jokić,0.875,0.190365,1,3
837,Joel Embiid,0.706,0.190462,2,2
11678,Giannis Antetokounmpo,0.595,0.21941,3,1
907,Devin Booker,0.216,0.091309,4,17
11469,Luka Dončić,0.146,0.157395,5,5
1179,Jayson Tatum,0.043,0.095902,6,13
12226,Ja Morant,0.01,0.120508,7,7
6398,Stephen Curry,0.004,0.093138,8,15
905,Chris Paul,0.002,0.078329,9,21
3938,DeMar DeRozan,0.001,0.099241,11,11


In [86]:
def find_ap(combination, top_n=5):
    """Average precision of the predicted ranking against the actual top players.

    The `top_n` rows with the highest actual ``Share`` are the "relevant"
    players. Rows are then walked in descending ``predictions`` order; each
    time a relevant player is encountered, the precision at that position
    (relevant players found so far / rows seen so far) is recorded. The result
    is the mean of those recorded precisions.

    Parameters
    ----------
    combination : pandas.DataFrame
        Must contain the columns "Player", "Share" and "predictions".
    top_n : int, default 5
        How many of the highest-Share rows count as relevant. The default
        reproduces the original hard-coded top-5 behaviour.

    Returns
    -------
    float
        Mean precision over the positions where a relevant player was found,
        or 0.0 when none was found (e.g. an empty frame), instead of raising
        ZeroDivisionError.
    """
    actual = combination.sort_values("Share", ascending=False).head(top_n)
    predicted = combination.sort_values("predictions", ascending=False)

    # Membership is by player name, as before; a set gives O(1) lookups
    # instead of scanning a numpy array for every row.
    relevant = set(actual["Player"])

    precisions = []
    # Iterate over the single column we need rather than building a Series per
    # row with iterrows(); `seen` is the 1-based position in the predicted order.
    for seen, player in enumerate(predicted["Player"], start=1):
        if player in relevant:
            found = len(precisions) + 1
            precisions.append(found / seen)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [87]:
# Average precision of the 2022 predictions against the actual top 5 by Share.
find_ap(combination)

0.8188235294117646

In [88]:
# Seasons 1991 through 2022 inclusive (the range end is exclusive).
years = list(range(1991, 2023))

In [89]:
# Walk-forward backtest: for each season from years[5] (1996) onward, refit on
# all earlier seasons and score the predictions for that season. The first five
# seasons are skipped so the first fit has five seasons of training data.
# NOTE: this rebinds the notebook-level names train, test, predictions and
# combination, and refits the shared `reg` object on every iteration.
aps = []              # average precision per backtested season
all_predictions = []  # one Player/Share/predictions frame per season
for year in years[5:]:
    # Train strictly on the past; test on the current season only.
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    # Reuse test.index so the concat below aligns predictions with their rows.
    predictions = pd.DataFrame(predictions, columns = ["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [90]:
# Mean average precision across all backtested seasons.
sum(aps) / len(aps)

0.7152712173135063

In [93]:
def add_ranks(combination):
    """Return a copy of `combination` with actual and predicted ranks attached.

    Adds three columns:
      * "Rank"            - 1..n position when ordered by "Share" descending
      * "Predicted_Rank"  - 1..n position when ordered by "predictions" descending
      * "Diff"            - Rank minus Predicted_Rank

    The returned frame is ordered by "predictions" descending. Tied rows still
    receive distinct consecutive ranks, in whatever order the sort leaves them.
    The input frame is not modified.
    """
    positions = list(range(1, combination.shape[0] + 1))
    ranked = (
        combination
        .sort_values("Share", ascending=False)
        .assign(Rank=positions)
        .sort_values("predictions", ascending=False)
        .assign(Predicted_Rank=positions)
    )
    return ranked.assign(Diff=ranked["Rank"] - ranked["Predicted_Rank"])

In [96]:
# all_predictions[1] is the second backtested season, i.e. years[5:][1] == 1997.
# Show the actual top 5 for that season, ordered by Diff (Rank - Predicted_Rank);
# a negative Diff means the model placed the player lower than the real vote did.
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rank"] < 6].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank,Diff
1710,Karl Malone,0.857,0.192318,1,2,-1
10976,Michael Jordan,0.832,0.167629,2,3,-1
970,Grant Hill,0.327,0.128646,3,6,-3
4912,Tim Hardaway,0.207,0.059984,4,20,-16
8642,Glen Rice,0.117,0.03311,5,53,-48


In [98]:
def backtest(stats, model, year, predictors):
    """Walk-forward backtest of `model` on the `Share` target.

    For each season to evaluate, the model is refit on every row from strictly
    earlier seasons, used to predict that season, and scored with `find_ap`.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain "Year", "Player", "Share" and every column in `predictors`.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place for every season.
    year : iterable of seasons, or a single season
        The seasons to evaluate (e.g. ``years[5:]``). The parameter keeps its
        original name for compatibility. Each season needs at least one
        earlier season present in `stats` to train on.
    predictors : list of str
        Feature column names.

    Returns
    -------
    tuple
        ``(mean_ap, aps, all_predictions)``: the mean average precision, the
        list of per-season average precisions, and one DataFrame holding the
        ranked predictions of all seasons concatenated.

    Raises
    ------
    ValueError
        If no season is supplied.
    """
    # Use the seasons passed in rather than the notebook-global `years`;
    # a single season is accepted as well as an iterable.
    seasons = list(year) if hasattr(year, "__iter__") else [year]
    if not seasons:
        raise ValueError("backtest needs at least one season to evaluate")

    aps = []
    all_predictions = []
    for season in seasons:
        train = stats[stats["Year"] < season]
        test = stats[stats["Year"] == season]
        model.fit(train[predictors], train["Share"])
        # Predict with the model that was just fitted, not the global `reg`.
        predictions = model.predict(test[predictors])
        # Reuse test.index so the concat aligns predictions with their rows.
        predictions = pd.DataFrame(predictions, columns = ["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [99]:
# Run the packaged backtest over the same seasons as the manual loop above.
# NOTE: this rebinds `aps` and `all_predictions`; `all_predictions` is now one
# concatenated DataFrame (with rank columns) instead of a list of per-season frames.
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [100]:
mean_ap

0.7152712173135063