In [19]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.svm import SVC, SVR 
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler 
from collections import Counter
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [2]:
cd /Users/pwalesdinan/Desktop/GA/NBA_Player_Prediction/Notebooks/

/Users/pwalesdinan/Desktop/GA/NBA_Player_Prediction/Notebooks


In [3]:
%store -r nba
%store -r first_second_seasons_agg

In [4]:
first_second_seasons_agg.shape

(621, 41)

In [5]:
# Creating a data set to do future predictions that can't be completely tested
test_master_agg = first_second_seasons_agg.loc[(first_second_seasons_agg['DRAFT_YEAR+1'] == 2017) | (first_second_seasons_agg['DRAFT_YEAR+1'] == 2018) | (first_second_seasons_agg['DRAFT_YEAR+1'] == 2016)]
test_master_agg.head(3)

Unnamed: 0,Player_name,player_id,Tm_x,DRAFT_YEAR+1,Draft_team,Pk,Pos,Age,College,Yrs,draft_round,target,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,WS/48,OBPM,DBPM,BPM,FG%,3P%,FT%,PPG,RPG,APG,G,MP,VORP,PTS,TRB,AST,WS
0,A.J. Hammons,hammoaj01,DAL,2017.0,DAL,46.0,C,24,Purdue University,1.0,2.0,0,8.4,0.472,0.238,0.476,5.4,21.0,12.8,3.8,0.3,7.2,16.4,17.6,-0.001,-7.5,2.0,-5.6,0.405,0.5,0.45,2.2,1.6,0.2,22,163,-0.1,48.0,36.0,4.0,0.0
6,Abdel Nader,naderab01,BOS,2017.0,BOS,58.0,SF,24,Iowa State University,2.0,2.0,0,5.1,0.439,0.436,0.262,2.8,12.0,7.4,7.1,1.4,1.6,17.0,17.1,-0.014,-5.9,0.3,-5.6,0.387,0.333,0.667,3.6,1.7,0.4,48,522,-0.5,387.0,187.0,46.0,-0.1
15,Alec Peters,peteral01,PHO,2018.0,PHO,54.0,PF,22,Valparaiso University,1.0,2.0,0,10.7,0.523,0.784,0.135,3.7,14.0,8.8,7.9,0.4,0.7,4.9,15.5,0.055,-0.2,-3.1,-3.3,0.378,0.31,0.8,4.1,1.9,0.6,20,225,-0.1,82.0,37.0,12.0,0.3


In [6]:
first_second_seasons_agg_test = first_second_seasons_agg.loc[~first_second_seasons_agg["player_id"].isin(test_master_agg['player_id'])]
first_second_seasons_agg_test.sort_values("DRAFT_YEAR+1", ascending=False).shape

(475, 41)

In [7]:
features = [
            'Pk', 
            'Age', 
            'G', 
            'MP', 
            'TS%', 
            '3PAr', 
            'FTr', 
            'ORB%', 
            'DRB%', 
            'TRB%', 
            'AST%', 
            'BLK%', 
            'TOV%', 
            'USG%', 
            'WS', 
            'WS/48', 
            'OBPM', 
            'DBPM', 
            'BPM', 
            'VORP',
            'RPG',
            'PPG',
            'APG',
            'FT%',
            '3P%',
            'PER'
]

In [8]:
y = first_second_seasons_agg['target'] # Setting y --> our target variable
X = first_second_seasons_agg[features] # Creating our X variables based on the features we want to use to predict target
ss = StandardScaler() # Instantiate a standard scalar
X_sc = ss.fit_transform(X) # Fit and transform our X variable
ros = RandomOverSampler(random_state=0) # Instantiate a random oversampler in order to oversample our training set
X_resampled, y_resampled = ros.fit_resample(X_sc, y) # Fit that oversampler to our X_sc (scaled) and y data
print(sorted(Counter(y_resampled).items())) # Show the balance btw classes
# Train, test, split based on our oversampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=100, test_size=0.25)
X.shape

[(0, 579), (1, 579)]


(621, 26)

In [9]:
# Instantiating all of our different classification models
lr_class = LogisticRegression(penalty='l1', C=40, solver='liblinear')
knn_class = KNeighborsClassifier(n_neighbors=3, p=4, leaf_size=10)
tree_class = DecisionTreeClassifier(max_features='auto', min_samples_leaf=3, min_samples_split=4, random_state=100)
bag_class = BaggingClassifier(bootstrap=False, max_features=8, max_samples=100, n_estimators=100, random_state=100)
forest_class = RandomForestClassifier(bootstrap=True, max_leaf_nodes=None, min_samples_leaf=3, min_samples_split=8, n_estimators=9, random_state=100)
ada_class = AdaBoostClassifier(learning_rate=0.78, n_estimators=100, random_state=100)
svc = SVC(degree=8, C=1, gamma=1, kernel='rbf', random_state=100)
grad_class = GradientBoostingClassifier(n_estimators=100, min_samples_leaf=3, min_samples_split=8, max_depth=3)

In [10]:
class_models = {
            'lr_class': lr_class, 
            'forest_class': forest_class, 
            'tree_class': tree_class,
            'ada_class': ada_class,            
            'knn_class': knn_class, 
            'bag_class': bag_class, 
            'svc': svc,
            "grad": grad_class
                } 

In [11]:
y_pred_testc = []
y_pred_trainc = []

for model in class_models.values():
    model.fit(X_train, y_train)
    y_pred_testc.append(model.predict(X_test))
    y_pred_trainc.append(model.predict(X_train))

y_pred_testc_df = pd.DataFrame(y_pred_testc, index=class_models.keys()).T
y_pred_trainc_df = pd.DataFrame(y_pred_trainc, index=class_models.keys()).T
print(y_pred_testc_df.shape)
print(y_pred_trainc_df.shape)

(290, 8)
(868, 8)


In [12]:
accuracy = {'train': [], 'test': [], 'F1-train': [], 'F1-test': [], 'true_neg': [], 'fal_pos': [], 'fal_neg': [], 'true_po': []}
for model in class_models.values():
    accuracy['train'].append(model.score(X_train, y_train))
    accuracy['test'].append(model.score(X_test, y_test))
for col in y_pred_testc_df:
    accuracy['F1-train'].append(f1_score(y_train, y_pred_trainc_df[col]))
    accuracy['F1-test'].append(f1_score(y_test, y_pred_testc_df[col]))
for col in y_pred_testc_df:
    accuracy['true_neg'].append(confusion_matrix(y_test, y_pred_testc_df[col])[0][0])
    accuracy['fal_pos'].append(confusion_matrix(y_test, y_pred_testc_df[col])[0][1])
    accuracy['fal_neg'].append(confusion_matrix(y_test, y_pred_testc_df[col])[1][0])
    accuracy['true_po'].append(confusion_matrix(y_test, y_pred_testc_df[col])[1][1])
    
accuracy_df = pd.DataFrame(accuracy, index=class_models.keys())
accuracy_df

Unnamed: 0,train,test,F1-train,F1-test,true_neg,fal_pos,fal_neg,true_po
lr_class,0.918203,0.893103,0.920135,0.888889,135,19,12,124
forest_class,0.99424,0.975862,0.994388,0.97491,147,7,0,136
tree_class,0.991935,0.958621,0.992161,0.957746,142,12,0,136
ada_class,1.0,0.982759,1.0,0.981949,149,5,0,136
knn_class,0.971198,0.944828,0.972558,0.944444,138,16,0,136
bag_class,0.948157,0.931034,0.951665,0.931507,134,20,0,136
svc,1.0,1.0,1.0,1.0,154,0,0,136
grad,1.0,0.982759,1.0,0.981949,149,5,0,136


In [13]:
master = test_master_agg[features]
master_sc = ss.fit_transform(master)
predicted_player = pd.DataFrame(index=master.index)
predicted_player[['Player_name', 'DRAFT_YEAR+1']] = test_master_agg[['Player_name', 'DRAFT_YEAR+1']]
for (model_name, model) in class_models.items():
    predicted_player[model_name] = model.predict(master_sc)
predicted_player['tot'] = predicted_player['lr_class'] + predicted_player['forest_class'] + predicted_player['tree_class'] + predicted_player['ada_class'] + predicted_player['knn_class'] + predicted_player['bag_class'] + predicted_player['svc'] + predicted_player['grad'] 
# predicted_player.to_csv('predict_15_16_17_based_on_2nd_yr')

In [14]:
%store predicted_player

Stored 'predicted_player' (DataFrame)


In [15]:
predicted_player.sort_values(by='tot', ascending=False)

Unnamed: 0,Player_name,DRAFT_YEAR+1,lr_class,forest_class,tree_class,ada_class,knn_class,bag_class,svc,grad,tot
363,Kristaps Porzingis,2016.0,1,1,1,1,1,1,1,1,8
438,Montrezl Harrell,2016.0,1,1,0,1,1,1,1,1,7
495,Richaun Holmes,2016.0,0,1,1,1,1,1,1,1,7
378,Lauri Markkanen,2018.0,1,1,1,1,1,1,0,1,7
156,Domantas Sabonis,2017.0,0,1,0,1,1,1,1,1,6
147,Devin Booker,2016.0,1,1,0,0,1,1,1,1,6
43,Ben Simmons,2017.0,1,1,0,1,1,1,0,1,6
265,Jarrett Allen,2018.0,1,1,0,1,1,1,0,1,6
271,Jayson Tatum,2018.0,0,1,1,1,0,1,0,1,5
442,Myles Turner,2016.0,1,1,0,1,0,1,0,1,5


In [16]:
params = {
#     'max_depth' : [3, 4],
#     'min_samples_leaf' : [2, 3],
#     'min_samples_split' : [6, 7, 8],
#     'n_estimators' : [30, 35, 37],
#     "learning_rate" : (np.logspace(-1.6, -1, 20))
    'C' : np.logspace(-3, 3, 7),
    'gamma' : np.logspace(-3, 3, 7)
}
gs = GridSearchCV(
    SVC(),
    params,
    cv=3,
    verbose=1,
    return_train_score=False,
    n_jobs=2)
gs.fit(X_train, y_train)
print(gs.best_score_)
print()
print(gs.best_params_)
print()
print(gs.score(X_test, y_test))
pred = model.predict(X_test)
f1_score(y_test, pred)

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


1.0

{'C': 1.0, 'gamma': 1.0}

1.0


[Parallel(n_jobs=2)]: Done 147 out of 147 | elapsed:    2.3s finished


0.9819494584837545

In [28]:
model = Sequential()
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


model.compile(loss='binary_crossentropy',
          optimizer= "adam",
          metrics=[f1])
#######

class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('acc') > .999:
            self.model.stop_training=True

earlystop = myCallback()
reduce_lr = ReduceLROnPlateau(monitor='loss', patience=2, factor=0.3, min_lr=0.00000001, verbose=1)

epochs=200
model = Sequential()

model.add(Dense(256, activation="relu", input_shape=(X_train.shape[1],)))
model.add(Dropout(rate=0.5))
model.add(Dense(128, activation="relu"))
model.add(Dropout(rate=0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(rate=0.5))
model.add(Dense(32, activation="relu"))
model.add(Dropout(rate=0.5))
model.add(Dense(1, activation="sigmoid"))

model.compile(
#     optimizer="adam",
    optimizer=Adam(lr=0.003),
    loss="binary_crossentropy",
    metrics=["acc", f1])

history = model.fit(
    X_train, y_train,
    batch_size=100,
    epochs=epochs,
    validation_data=(X_test, y_test),
    callbacks = [reduce_lr, earlystop],
#     verbose=0,
)

InvalidArgumentError: Invalid reduction dimension (1 for input with 1 dimension(s) [Op:Sum]

In [None]:
confusion_matrix(y_test, model.predict_classes(X_test))

In [None]:
master = test_master_agg[features]
master_nn = ss.fit_transform(master)
predicted_player_nn = pd.DataFrame(index=master.index)
predicted_player_nn[['Player_name', 'DRAFT_YEAR+1']] = test_master_agg[['Player_name', 'DRAFT_YEAR+1']]
predicted_player_nn['neural'] = model.predict_classes(master_nn)
predicted_player_nn.sort_values(by=['neural', "DRAFT_YEAR+1"], ascending=(False, False)).head(30)

In [None]:
import matplotlib.pyplot as plt
ticks_1 = (np.linspace(0, epochs-1, 20).astype(int))
ticks_2 = (np.linspace(1, epochs, 20).astype(int))

# Check out our train loss and test loss over epochs.
train_loss = history.history['loss']
test_loss = history.history['val_loss']

# Set figure size.
plt.figure(figsize=(12, 8))

# Generate line plot of training, testing loss over epochs.
plt.plot(train_loss, label='Training Loss', color='#185fad')
plt.plot(test_loss, label='Testing Loss', color='orange')

# Set title
plt.title('Training and Testing Loss by Epoch', fontsize = 25)
plt.xlabel('Epoch', fontsize = 18)
plt.ylabel('Categorical Crossentropy', fontsize = 18)
plt.xticks(ticks_1, ticks_2)
plt.legend(fontsize = 18);

In [None]:
ticks_1 = (np.linspace(0, epochs-1, 20).astype(int))
ticks_2 = (np.linspace(1, epochs, 20).astype(int))

# Check out our train accuracy and accuracy loss over epochs.
train_loss = history.history['acc']
test_loss = history.history['val_acc']

# Set figure size.
plt.figure(figsize=(12, 8))

# Generate line plot of training, testing accuracy over epochs.
plt.plot(train_loss, label='Training Accuracy', color='#185fad')
plt.plot(test_loss, label='Testing Accuracy', color='orange')

# Set title
plt.title('Training and Testing Accuracy by Epoch', fontsize = 25)
plt.xlabel('Epoch', fontsize = 18)
plt.ylabel('Categorical Crossentropy', fontsize = 18)
plt.xticks(ticks_1, ticks_2)
plt.legend(fontsize = 18);

In [None]:
ticks_1 = (np.linspace(0, epochs-1, 20).astype(int))
ticks_2 = (np.linspace(1, epochs, 20).astype(int))

# Check out our train accuracy and accuracy loss over epochs.
train_loss = history.history['f1']
test_loss = history.history['val_f1']

# Set figure size.
plt.figure(figsize=(12, 8))

# Gaenerate line plot of training, testing accuracy over epochs.
plt.plot(train_loss, label='Training F1-Score', color='#185fad')
plt.plot(test_loss, label='Testing F1-Score', color='orange')

# Set title
plt.title('Training and Testing F1-Score by Epoch', fontsize = 25)
plt.xlabel('Epoch', fontsize = 18)
plt.ylabel('Categorical Crossentropy', fontsize = 18)
plt.xticks(ticks_1, ticks_2)
plt.legend(fontsize = 18);

In [None]:
test_master_w_predictions = pd.merge(test_master, predicted_player_nn, left_on='Player_name', right_on='Player_name')
test_master_w_predictions = pd.merge(test_master_w_predictions, predicted_player, left_on='Player_name', right_on='Player_name')
test_master_w_predictions.drop(columns=['lr_class', 'lr_class', 'tree_class', 'ada_class', 'knn_class', 'bag_class', 'svc', 'grad', 'forest_class', 'DRAFT_YEAR+1_x', 'DRAFT_YEAR+1_y'], inplace=True)
test_master_w_predictions.sort_values(by=['target','neural'], ascending=(False, False))

In [None]:
['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
             'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
             'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
             'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
             'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
             'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
             'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
             'orrd', 'oryel', 'peach', 'phase', 'picnic', 'pinkyl', 'piyg',
             'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn', 'puor',
             'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu', 'rdgy',
             'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar', 'spectral',
             'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn', 'tealrose',
             'tempo', 'temps', 'thermal', 'tropic', 'turbid', 'twilight',
             'viridis', 'ylgn', 'ylgnbu', 'ylorbr', 'ylorrd']



In [None]:
fig = px.scatter_3d(test_master_w_predictions, x='WS', y='VORP', z='MP',
            color='neural', 
            hover_name='Player_name', 
            hover_data=['USG%', 'PER', 'MP', 'VORP', 'Age'], 
            symbol='neural')
fig.update_traces(marker=dict(size=10,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

# fig.update_layout(scene=dict(xaxis=dict(range=[-15,10],),))
fig.update_layout(legend_orientation="h")
fig.show()
plotly.offline.plot(fig, filename='html_files/aggregated_nn_ws_vorp_MP.html')

In [None]:
fig = px.scatter_3d(test_master_w_predictions, x='WS', y='VORP', z='RPG',
            color='neural', 
            hover_name='Player_name', 
            hover_data=['USG%', 'PER', 'MP', 'VORP', 'Age'], 
            symbol='neural')
fig.update_traces(marker=dict(size=10,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

# fig.update_layout(scene=dict(xaxis=dict(range=[-15,10],),))
fig.update_layout(legend_orientation="h")
fig.show()
# plotly.offline.plot(fig, filename='html_files/aggregated_nn_ws_vorp_MP.html')

In [None]:
fig = px.scatter(test_master_w_predictions, x='VORP', y='WS',
            color='neural', hover_name='Player_name', hover_data=['USG%', 'PER', 'MP', 'VORP', 'Age'], 
#             color_continuous_scale='rdbu',
            symbol='neural')
fig.update_traces(marker=dict(size=10,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
# fig.update_xaxes(range=[-2, 7])
fig.update_layout(legend_orientation="h")
fig.show()
plotly.offline.plot(fig, filename='html_files/aggregated_ws_vorp.html')

In [None]:
fig = px.scatter(test_master_w_predictions, x='TS%', y='USG%',
                 color='neural',
                 hover_name='Player_name',
                 hover_data=['USG%', 'PER', 'MP', 'VORP', 'Age', 'RPG', 'VORP', 'WS'], 
                 color_continuous_scale=px.colors.diverging.PiYG,
                 symbol='neural')
fig.update_traces(marker=dict(size=10,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_layout(legend_orientation="h")
# fig.update_xaxes(range=[-2, 4.5])
fig.show()
plotly.offline.plot(fig, filename='html_files/aggregated_usg_ts%.html')

In [None]:
fig = px.scatter(test_master_w_predictions, x='OWS', y='DWS',
                 color='neural',
                 hover_name='Player_name',
                 hover_data=['USG%', 'PER', 'MP', 'VORP', 'Age', 'RPG', 'VORP', 'WS'], 
                 color_continuous_scale=px.colors.diverging.RdBu,
                 symbol='neural')
fig.update_traces(marker=dict(size=10,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_layout(legend_orientation="h")
# fig.update_xaxes(range=[-2, 4.5])
fig.show()
plotly.offline.plot(fig, filename='html_files/aggregated_OWS_DWS.html')

In [None]:
test_master_w_predictions.corr()[['tot', 'neural']].sort_values('neural', ascending=False)