In [54]:
import pandas as pd
import numpy as np

In [55]:
data = pd.read_csv('merged_df.csv', index_col=0)

In [56]:
#only players with more than 10 games
data = data[data.games_played >= 10]

In [57]:
#drop duplicates - corresponds to players that were traded mid season
data.drop_duplicates(subset=['name', 'year_end'], inplace=True)

In [58]:
#split into test and train data set

train = data[data.season != 2022]
test = data[data.season == 2022]

In [59]:
#manually choose features to include in model 

features = ['age', 'minutes_played','player_efficiency_rating',
       'true_shooting_percentage', 'three_point_attempt_rate',
       'free_throw_attempt_rate', 'total_rebound_percentage',
       'assist_percentage', 'steal_percentage', 'block_percentage',
       'turnover_percentage', 'usage_percentage', 'win_shares_per_48_minutes',
       'box_plus_minus']

In [60]:
#store features

X_train = train[features]
X_test = test[features]

**QUESTION**

Saw your note about possible feature pinpointing or reduction

I considered doing PCA but I do not think dimension reductionality is neccessary for computation purposes

However, doing analysis on first x PCA components could be something to throw in just for credit to show we understand material

So I am thinking we could commit to capturing x% of the variance of the data and however many components that corresponds to

Or we could do PCA for some models and not for others..

In [61]:
#standardize data - good for neural networks

from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  

# Fit only on training data

scaler.fit(X_train)  
X_train_standard = scaler.transform(X_train) 

# apply same transformation to test data

X_test_standard = scaler.transform(X_test) 

In [62]:
#set up grid search for alpha to tune hyperparameter

from sklearn.neural_network import MLPRegressor

parameters = {'alpha': 10.0 ** -np.arange(1, 7)}

network = MLPRegressor(hidden_layer_sizes=(50,50) * 10,
                       max_iter = 2000,activation = 'tanh',
                       solver = 'sgd',learning_rate='adaptive',
                       early_stopping = True)

parameters

{'alpha': array([1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06])}

In [63]:
#conduct grid search

from sklearn.model_selection import GridSearchCV

gs_neural = GridSearchCV(network,
                      param_grid=parameters,
                      scoring='neg_mean_absolute_error',
                      cv=5, verbose=1)

gs_neural.fit(X_train_standard, train['epm'])
gs_neural.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


{'alpha': 0.001}

In [64]:
#choose tuned parameter

best_network = MLPRegressor(hidden_layer_sizes=(50,50) * 10,
                       max_iter = 2000,activation = 'tanh',
                       solver = 'sgd',learning_rate='adaptive',
                       alpha = gs_neural.best_params_['alpha'],
                       early_stopping = True)

In [65]:
#cross validation scores on training for model with chosen hyperparameters

from sklearn.model_selection import cross_val_score

cv = cross_val_score(best_network, X_train_standard, train['epm'], cv = 5, scoring='neg_mean_absolute_error')

print(cv)
print(-1 * np.mean(cv))

[-1.50009743 -1.66622652 -1.48777262 -1.52434942 -1.49198023]
1.534085243052143


In [66]:
#calculate test error
#refit neural network

#QUESTION:
#should I refit here? it uses SGD so slightly different parameters
#Or should I use same model as I used in CV
#If i should use same model as CV caluclation, how do I do it?

best_network.fit(X_train_standard, train['epm'])

MLPRegressor(activation='tanh', alpha=0.001, early_stopping=True,
             hidden_layer_sizes=(50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
                                 50, 50, 50, 50, 50, 50, 50, 50),
             learning_rate='adaptive', max_iter=2000, solver='sgd')

In [67]:
predictions = best_network.predict(X_test_standard)
actual = test['epm']

MAE = sum(abs((predictions - actual))) / len(actual)

MAE

1.6826633532130006

In [128]:
this_year = pd.read_csv('2023predictions.csv', index_col=0)

this_year = this_year[this_year.games_played >= 10]

this_year_selected = this_year[features]

this_year_standard = scaler.transform(this_year_selected) 

In [129]:
predictions_this_year = best_network.predict(this_year_standard)

In [136]:
results = pd.DataFrame(predictions_this_year, columns = ['NN_prediction'])

In [142]:
results['name'] = this_year['name'].values.tolist()

In [145]:
results.sort_values(by = 'NN_prediction', ascending = False).head(20)

Unnamed: 0,NN_prediction,name
12,5.968988,Giannis Antetokounmpo
153,5.900657,Joel Embiid
272,5.488291,Nikola Jokić
410,5.268911,Kristaps Porziņģis
495,5.213865,Karl-Anthony Towns
148,4.64914,Kevin Durant
261,4.175263,LeBron James
86,4.04487,Jimmy Butler
137,3.755698,Luka Dončić
183,3.681685,Rudy Gobert


In [132]:
names.insert(1, "epm", results)

In [133]:
names.reset_index(drop = True)

Unnamed: 0,name,epm
0,Precious Achiuwa,-1.536976
1,Steven Adams,0.030973
2,Bam Adebayo,2.768328
3,Santi Aldama,-2.665683
4,LaMarcus Aldridge,-0.614153
...,...,...
542,Thaddeus Young,-0.698574
543,Trae Young,2.726777
544,Omer Yurtseven,-0.505527
545,Cody Zeller,-1.849830
