In [64]:
import pandas as pd
import numpy as np

In [65]:
# Load the merged player table; the first CSV column becomes the row index.
data = pd.read_csv(
    'merged_df.csv',
    index_col=0,
)

In [66]:
#keep only players with at least 10 games played
data = data.loc[data['games_played'] >= 10]

In [67]:
#drop duplicates - corresponds to players that were traded mid season
#keeps the first row for each (name, year_end) pair.
#Reassign instead of inplace=True: `data` is a filtered slice at this point, and
#in-place modification of a slice can raise SettingWithCopyWarning.
data = data.drop_duplicates(subset=['name', 'year_end'])

In [68]:
#split into test and train data set

# rows from the 2022 season are held out as the test set; all other rows train
is_test_season = data['season'] == 2022
train = data[~is_test_season]
test = data[is_test_season]

In [69]:
#manually choose features to include in model

# column names used as model inputs, one per line so additions/removals diff cleanly
features = [
    'age',
    'player_efficiency_rating',
    'true_shooting_percentage',
    'three_point_attempt_rate',
    'free_throw_attempt_rate',
    # rebounding
    'offensive_rebound_percentage',
    'defensive_rebound_percentage',
    'total_rebound_percentage',
    # playmaking / defence / ball handling
    'assist_percentage',
    'steal_percentage',
    'block_percentage',
    'turnover_percentage',
    'usage_percentage',
    # win shares
    'offensive_win_shares',
    'defensive_win_shares',
    'win_shares',
    'win_shares_per_48_minutes',
    # box plus/minus
    'offensive_box_plus_minus',
    'defensive_box_plus_minus',
    'box_plus_minus',
]

In [70]:
#store features

# select the chosen feature columns from each split (train first, then test)
X_train, X_test = (split[features] for split in (train, test))

**QUESTION**

Saw your note about possible feature pinpointing or reduction

I considered doing PCA, but I do not think dimensionality reduction is necessary for computational purposes

However, an analysis of the first x principal components could be something to include, just for credit, to show we understand the material

So I am thinking we could commit to capturing x% of the variance of the data and use however many components that corresponds to

Or we could do PCA for some models and not for others..

In [71]:
#standardize data - good for neural networks

from sklearn.preprocessing import StandardScaler

# fit the scaler on the training features only, so no test rows influence it
scaler = StandardScaler().fit(X_train)

# apply that same fitted transformation to both splits
X_train_standard = scaler.transform(X_train)
X_test_standard = scaler.transform(X_test)

In [72]:
#set up grid search for alpha to tune hyperparameter

from sklearn.neural_network import MLPRegressor

# candidate values for alpha: 1e-1, 1e-2, ..., 1e-6
# NOTE(review): the recorded search output selects alpha=0.1, the largest value
# in this grid, so the optimum may lie outside it - consider adding larger alphas.
parameters = {'alpha': 10.0 ** -np.arange(1, 7)}

# random_state is fixed so that repeated runs of the notebook give the same
# grid-search result, CV scores and test error (training here is stochastic:
# SGD solver plus an early-stopping validation split)
network = MLPRegressor(hidden_layer_sizes=(25,25,25),
                       max_iter = 2000,activation = 'tanh',
                       solver = 'sgd',learning_rate='adaptive',
                       early_stopping = True,
                       random_state = 0)

parameters

{'alpha': array([1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06])}

In [73]:
#conduct grid search

from sklearn.model_selection import GridSearchCV

# 5-fold search over `parameters`, scored by negated mean absolute error
gs_neural = GridSearchCV(
    estimator=network,
    param_grid=parameters,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=1,
)

# target column is 'epm'; inputs are the standardized training features
gs_neural.fit(X_train_standard, train['epm'])
gs_neural.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


{'alpha': 0.1}

In [74]:
#choose tuned parameter

from sklearn.base import clone

# Build the tuned model from `network` itself rather than retyping its
# hyperparameters, so the two definitions cannot drift apart: take an unfitted
# copy of `network` and apply the parameters chosen by the grid search.
best_network = clone(network).set_params(**gs_neural.best_params_)

In [75]:
#cross validation scores on training for model with chosen hyperparameters

from sklearn.model_selection import cross_val_score

# per-fold scores are negated mean absolute errors (5 folds)
cv = cross_val_score(
    best_network,
    X_train_standard,
    train['epm'],
    scoring='neg_mean_absolute_error',
    cv=5,
)

print(cv)
# flip the sign to report the average MAE as a positive number
print(-1 * np.mean(cv))

[-1.53102728 -1.65404699 -1.52410067 -1.48828026 -1.52185443]
1.5438619240197695


In [76]:
#calculate test error
#refit neural network

#QUESTION:
#should I refit here? it uses SGD so the fitted parameters will differ slightly
#Or should I use the same model as I used in CV
#If I should use the same model as the CV calculation, how do I do it?
#NOTE(review): cross_val_score fits copies of the estimator and does not keep
#them, so there is presumably no fitted CV model to reuse; refitting on the full
#training set (as done here) is the usual approach - confirm against sklearn docs.

best_network.fit(X=X_train_standard, y=train['epm'])

MLPRegressor(activation='tanh', alpha=0.1, early_stopping=True,
             hidden_layer_sizes=(25, 25, 25), learning_rate='adaptive',
             max_iter=2000, solver='sgd')

In [88]:
# predict the 'epm' target for the held-out rows
predictions = best_network.predict(X_test_standard)
actual = test['epm']

# root-mean-squared error on the test split, computed with vectorised numpy
# reductions instead of Python's builtin sum over a pandas Series.
# The variable name RSME (sic - RMSE) is kept so later cells that use it still work.
# NOTE(review): the cross-validation cells score with mean absolute error, so
# this number is not directly comparable to the CV score.
RSME = np.sqrt(np.mean((predictions - actual) ** 2))

RSME

2.0450988173042917

I think we could use RMSE on the test data to compare the different models and go from there