In [4]:
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv('../data/merged_df.csv', index_col=0)

In [9]:
data.columns

Index(['slug', 'name', 'positions', 'age', 'team_feature', 'games_played',
       'minutes_played', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player', 'is_combined_totals',
       'year_end', 'season', 'nba_id', 'player_name', 'pos', 'team_target',
       'oepm', 'depm', 'epm', 'ewins', 'feature_season'],
      dtype='object')

In [20]:
data = data[data.games_played >= 10] # remove players who played less than 10 games

In [30]:
data.drop_duplicates(subset=['name', 'year_end'], inplace=True) # drop duplicates

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop_duplicates(subset=['name', 'year_end'], inplace=True)


In [33]:
# split train and test
data.year_end.value_counts().sort_index()

2013    338
2014    352
2015    354
2016    339
2017    339
2018    351
2019    352
2020    371
2021    404
Name: year_end, dtype: int64

In [34]:
data.season.value_counts().sort_index()

2014    338
2015    352
2016    354
2017    339
2018    339
2019    351
2020    352
2021    371
2022    404
Name: season, dtype: int64

In [37]:
train = data[data.season != 2022]
test = data[data.season == 2022]

In [48]:
features = ['season', 'age', 'games_played',
       'minutes_played', 'player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'offensive_rebound_percentage',
       'defensive_rebound_percentage', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'offensive_win_shares',
       'defensive_win_shares', 'win_shares', 'win_shares_per_48_minutes',
       'offensive_box_plus_minus', 'defensive_box_plus_minus',
       'box_plus_minus', 'value_over_replacement_player']

In [49]:
X_train = train[features]
X_test = test[features]

In [54]:
# normalize features
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train = pd.DataFrame(mms.fit_transform(X_train), columns= X_train.columns)
X_test = pd.DataFrame(mms.transform(X_test), columns= X_test.columns)

In [58]:
X_train['position'] = train['positions']
X_test['position'] = test['positions']

In [59]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [142]:
from sklearn.model_selection import cross_val_score
def eval_model(model):
    cv = cross_val_score(model, X_train, train['epm'], cv = 5, scoring='neg_mean_absolute_error')
    print(cv)
    return -1 * np.mean(cv)

In [143]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
eval_model(lr)

[-1.50587064 -1.66595901 -1.47581273 -1.48579898 -1.47137221]


1.520962713726966

In [144]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
eval_model(rf)

[-1.50022367 -1.66688375 -1.48656755 -1.51575302 -1.49140191]


1.532165980414305

In [145]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
eval_model(xgb)

[-1.56649717 -1.72247176 -1.54555242 -1.59241409 -1.6099186 ]


1.6073708077317228

In [146]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes = (8,16,32, 64), max_iter = 1000000, early_stopping = True, 
learning_rate='adaptive', learning_rate_init = 0.01)
eval_model(mlp)



[-1.4888065  -1.63978632 -1.48889599 -1.52137985 -1.50423225]




1.5286201801563815

In [147]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors = 25)
eval_model(knn)

[-1.64955712 -1.77474142 -1.61354455 -1.58546966 -1.59885034]


1.6444326190432872

In [148]:
lr.fit(X_train, train['epm'])
pd.Series(lr.coef_, index = lr.feature_names_in_).sort_values()

win_shares                      -24.057189
box_plus_minus                  -16.109877
defensive_rebound_percentage    -11.365706
offensive_rebound_percentage     -5.481806
true_shooting_percentage         -1.659651
games_played                     -1.504718
age                              -1.164079
win_shares_per_48_minutes        -0.468625
position_POWER FORWARD           -0.291697
position_POINT GUARD             -0.170867
position_SMALL FORWARD           -0.070246
position_CENTER                  -0.056832
position_SHOOTING GUARD          -0.031322
season                           -0.013864
free_throw_attempt_rate           0.043272
assist_percentage                 0.559639
steal_percentage                  0.651283
usage_percentage                  0.709379
value_over_replacement_player     0.864791
turnover_percentage               1.284329
three_point_attempt_rate          1.670534
block_percentage                  2.134591
minutes_played                    3.082983
player_effi

In [131]:
train_export = X_train.copy()
train_export['epm']  = train.epm.values

In [133]:
train_export.to_csv('../data/train_export_R.csv')

In [149]:
from sklearn.linear_model import Ridge
rlr = Ridge()
eval_model(rlr)

[-1.49768201 -1.65583408 -1.47663453 -1.48831767 -1.47554326]


1.5188023089960476

In [150]:
from sklearn.linear_model import Lasso
llr = Lasso(alpha=0.1)
eval_model(llr)

[-1.70102671 -1.81193822 -1.66970964 -1.70107438 -1.65599297]


1.707948384305856

In [158]:
# Next Steps:

# Fix the dataframes to take more minutes 

# Refit the baseline models and see the performance

# Model Focused Approach: Focus on 1-2 models and perform Hyperparameter optimization:
# XGboost 
# Random Forest
# Neural Network 
# Ridge Regression
# Lasso Regression 
# Other Models

# Data Focused Approach:  
# Feature Selection (we have a ton of features rn and it could be useful to reduce them)
# Feature Engineering (is there any additional features we can find online or just engineer)

In [155]:
data.team_target.nunique()

30

In [156]:
data.team_feature.nunique()

32

In [157]:
data

Unnamed: 0,slug,name,positions,age,team_feature,games_played,minutes_played,player_efficiency_rating,true_shooting_percentage,three_point_attempt_rate,...,season,nba_id,player_name,pos,team_target,oepm,depm,epm,ewins,feature_season
0,acyqu01,Quincy Acy,POWER FORWARD,22,TORONTO RAPTORS,29,342,15.9,0.632,0.027,...,2014,203112,Quincy Acy,SF,SAC,-2.241340,-0.860686,-3.102020,-0.088588,2013
1,adrieje01,Jeff Adrien,POWER FORWARD,26,CHARLOTTE BOBCATS,52,713,13.4,0.493,0.012,...,2014,202399,Jeff Adrien,PF,MIL,-0.222416,-2.017500,-2.239920,0.475259,2013
2,afflaar01,Arron Afflalo,SMALL FORWARD,27,ORLANDO MAGIC,64,2307,13.0,0.527,0.265,...,2014,201167,Arron Afflalo,SG,ORL,1.483060,-2.289410,-0.806353,3.702980,2013
3,aldrico01,Cole Aldrich,CENTER,24,HOUSTON ROCKETS,30,213,7.3,0.532,0.000,...,2014,202332,Cole Aldrich,C,NYK,-1.103380,1.572910,0.469531,0.767896,2013
5,aldrila01,LaMarcus Aldridge,POWER FORWARD,27,PORTLAND TRAIL BLAZERS,74,2790,20.4,0.530,0.011,...,2014,200746,LaMarcus Aldridge,PF,POR,1.502790,1.370200,2.872990,9.896980,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3759,wrighde01,Delon Wright,POINT GUARD,26,TORONTO RAPTORS,49,897,14.5,0.527,0.307,...,2020,1626153,Delon Wright,SG,DAL,-1.225830,0.707953,-0.517880,2.593900,2019
3761,youngth01,Thaddeus Young,POWER FORWARD,30,INDIANA PACERS,81,2489,16.2,0.569,0.174,...,2020,201152,Thaddeus Young,PF,CHI,-2.751630,1.493800,-1.257840,1.837610,2019
3762,youngtr01,Trae Young,POINT GUARD,20,ATLANTA HAWKS,81,2503,17.0,0.539,0.384,...,2020,1629027,Trae Young,PG,ATL,5.990290,-2.527340,3.462950,9.214310,2019
3763,zelleco01,Cody Zeller,CENTER,26,CHARLOTTE HORNETS,49,1243,17.2,0.611,0.064,...,2020,203469,Cody Zeller,C,CHA,0.510248,-1.543110,-1.032870,1.746270,2019


In [162]:
rf.fit(X_train, train['epm'])
importances = rf.feature_importances_
forest_importances = pd.DataFrame(importances, index= X_train.columns)
forest_importances


Unnamed: 0,0
season,0.012393
age,0.028232
games_played,0.022249
minutes_played,0.028147
player_efficiency_rating,0.046956
true_shooting_percentage,0.023195
three_point_attempt_rate,0.022414
free_throw_attempt_rate,0.028414
offensive_rebound_percentage,0.019517
defensive_rebound_percentage,0.022429


In [164]:
forest_importances.sort_values(by = 0, ascending = False)

Unnamed: 0,0
value_over_replacement_player,0.402117
box_plus_minus,0.065431
player_efficiency_rating,0.046956
free_throw_attempt_rate,0.028414
block_percentage,0.028301
age,0.028232
minutes_played,0.028147
usage_percentage,0.027088
steal_percentage,0.025714
win_shares,0.024808
