In [164]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from matplotlib import pyplot as plt
from matplotlib import cm

from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.svm import SVC, SVR 
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler


pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [5]:
class ClfSwitcher(BaseEstimator):

    def __init__(
        self, 
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self


    def predict(self, X, y=None):
        return self.estimator.predict(X)


    def predict_proba(self, X):
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        return self.estimator.score(X, y)

In [2]:
cd /Users/pwalesdi/Desktop/GA/NBA_Player_Prediction/

/Users/pwalesdi/Desktop/GA/NBA_Player_Prediction


In [218]:
%store -r nba
%store -r improvement
%store -r per_improvement
%store -r second_year_prime

In [219]:
# Creating a data set to do future predictions that can't be completely tested
test_master = second_year_prime.loc[(second_year_prime['DRAFT_YEAR+1'] == 2017) | (second_year_prime['DRAFT_YEAR+1'] == 2018) | (second_year_prime['DRAFT_YEAR+1'] == 2016)]
test_master

In [221]:
features = [
            'Pk', 
            'Age', 
            'G', 
            'MP', 
            'TS%', 
            '3PAr', 
            'FTr', 
            'ORB%', 
            'DRB%', 
            'TRB%', 
            'AST%', 
            'BLK%', 
            'TOV%', 
            'USG%', 
#             'OWS', 
#             'DWS', 
            'WS', 
            'WS/48', 
            'OBPM', 
            'DBPM', 
            'BPM', 
            'VORP']

In [262]:
y = second_year_prime['target'] # Setting y --> our target variable
X = second_year_prime[features] # Creating our X variables based on the features we want to use to predict target
ss = StandardScaler() # Instantiate a standard scalar
X_sc = ss.fit_transform(X) # Fit and transform our X variable
ros = RandomOverSampler(random_state=0) # Instantiate a random oversampler in order to oversample our training set
X_resampled, y_resampled = ros.fit_resample(X_sc, y) # Fit that oversampler to our X_sc (scaled) and y data
print(sorted(Counter(y_resampled).items())) # Show the balance btw classes

In [265]:
# Train, test, split based on our oversampled data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=100, test_size=0.25)

In [266]:
lr = LogisticRegression(solver="liblinear")
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [355]:
grad = GradientBoostingClassifier(n_estimators=100, min_samples_leaf=3, min_samples_split=8, max_depth=3)
grad.fit(X_train, y_train)
print(grad.score(X_train, y_train))
print(grad.score(X_test, y_test))
grad_preds = grad.predict(X_test)
print(confusion_matrix(y_test, # True values.
                 grad_preds))  # Predicted values.
tn, fp, fn, tp = confusion_matrix(y_test, grad_preds).ravel()
print(f1_score(y_test, grad_preds))

1.0
0.9642857142857143
[[108   8]
 [  0 108]]
0.9642857142857143


In [365]:
confusion_matrix(y_test, grad_preds)[1][1]

108

In [267]:
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.8839285714285714
0.8348214285714286


In [353]:
preds = lr.predict(X_test)
print(confusion_matrix(y_test, # True values.
                 preds))  # Predicted values.
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
print(f1_score(y_test, preds))

[[89 27]
 [10 98]]
0.8412017167381974


In [270]:
print(lr.coef_)
print(lr.intercept_)
lr.predict(X_sc)

[[ 0.29636319 -0.50951308  0.72382913 -0.01777492 -0.40029353 -0.22276729
   0.79485604 -0.39730193  0.51696164  0.21754193 -0.09049999  1.06867817
   0.22084165  0.9608816   0.64739776  0.03424479  1.11486323 -0.48739434
   0.68627706  0.18110952]]
[-2.20897742]


0.2510204081632653

In [340]:
master = test_master[features]
print(master.shape)
master_sc = ss.fit_transform(master)
grad.predict(master_sc)

(134, 20)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [341]:
eighteen_nineteen = pd.DataFrame(grad.predict(master_sc), columns=['predicted'], index=master.index)
eighteen_nineteen[['Player_name', 'DRAFT_YEAR+1	']] = test_master[['Player_name', 'DRAFT_YEAR+1']]
eighteen_nineteen.sort_values(by='predicted', ascending=False)

Unnamed: 0,predicted,Player_name,DRAFT_YEAR+1
1610,1,Kristaps Porzingis,2016.0
1436,1,Willy Hernangomez,2016.0
1145,1,Domantas Sabonis,2017.0
469,1,Donovan Mitchell,2018.0
1448,1,Richaun Holmes,2016.0
140,1,Zach Collins,2018.0
1068,1,Jamal Murray,2017.0
222,1,De'Aaron Fox,2018.0
1422,1,Montrezl Harrell,2016.0
1300,1,Devin Booker,2016.0


In [256]:
results = pd.DataFrame(lr.predict(X_test), columns=['predicted'])

# Create column for observed values.
y_test = y_test.reset_index()
results['actual'] = y_test['target']
results['index1'] = y_test['index']
results.set_index('index1', inplace=True)

In [257]:
row_ids = results[results['predicted'] != results['actual']].index

In [258]:
row_ids

Int64Index([4, 10, 222, 432, 469, 1068, 1114, 1157, 1436, 1683, 1892, 2100, 2188, 2336, 2576, 2743, 2902, 2946, 3116, 3251, 3515, 3599, 3600, 4177, 4243, 4268, 4337, 4362, 4619, 4702, 4727, 4743, 4859, 4955, 4973, 5217, 5238, 5334, 5415, 5429, 5503, 5605, 5674, 5792, 6085], dtype='int64', name='index1')

In [259]:
results["Player_name"] = ''
for i in row_ids:
    results["Player_name"][i] = (second_year_prime["Player_name"][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [296]:
results.sort_values(by=["actual"], ascending=False)

In [334]:
lr_class = LogisticRegression(penalty='l1', C=40, solver='liblinear')
knn_class = KNeighborsClassifier(n_neighbors=3, p=4, leaf_size=10)
tree_class = DecisionTreeClassifier(max_features='auto', min_samples_leaf=3, min_samples_split=4, random_state=100)
bag_class = BaggingClassifier(bootstrap=False, max_features=8, max_samples=100, n_estimators=100, random_state=100)
forest_class = RandomForestClassifier(bootstrap=True, max_leaf_nodes=None, min_samples_leaf=3, min_samples_split=8, n_estimators=9, random_state=100)
ada_class = AdaBoostClassifier(learning_rate=0.78, n_estimators=100, random_state=100)
svc = SVC(degree=8, C=2.5, gamma=0.1, kernel='poly', random_state=100)
grad_class = GradientBoostingClassifier(n_estimators=100, min_samples_leaf=3, min_samples_split=8, max_depth=3)

In [335]:
class_models = {
            'lr_class': lr_class, 
            'forest_class': forest_class, 
            'tree_class': tree_class,
            'ada_class': ada_class,            
            'knn_class': knn_class, 
            'bag_class': bag_class, 
            'svc': svc,
            "grad": grad_class
                } 
# models_c = [lr_c, knn_c, tree_c, bag_c, forest_c, ada_c, svc]
# model_c_names = ['lr_c', 'knn_c', 'tree_c', 'bag_c', 'forest_c', 'ada_c', 'svc']

In [347]:
y_pred_testc = []
y_pred_trainc = []

for model in class_models.values():
    model.fit(X_train, y_train)
    y_pred_testc.append(model.predict(X_test))
    y_pred_trainc.append(model.predict(X_train))

y_pred_testc_df = pd.DataFrame(y_pred_testc, index=class_models.keys()).T
y_pred_trainc_df = pd.DataFrame(y_pred_trainc, index=class_models.keys()).T
print(y_pred_testc_df.shape)
print(y_pred_trainc_df.shape)

(224, 8)
(672, 8)


In [379]:
accuracy = {'train': [], 'test': [], 'F1-train': [], 'F1-test': [], 'true_neg': [], 'fal_pos': [], 'fal_neg': [], 'true_po': []}
for model in class_models.values():
    accuracy['train'].append(model.score(X_train, y_train))
    accuracy['test'].append(model.score(X_test, y_test))
for col in y_pred_testc_df:
    accuracy['F1-train'].append(f1_score(y_train, y_pred_trainc_df[col]))
    accuracy['F1-test'].append(f1_score(y_test, y_pred_testc_df[col]))
for col in y_pred_testc_df:
    accuracy['true_neg'].append(confusion_matrix(y_test, y_pred_testc_df[col])[0][0])
    accuracy['fal_pos'].append(confusion_matrix(y_test, y_pred_testc_df[col])[0][1])
    accuracy['fal_neg'].append(confusion_matrix(y_test, y_pred_testc_df[col])[1][0])
    accuracy['true_po'].append(confusion_matrix(y_test, y_pred_testc_df[col])[1][1])
    
accuracy_df = pd.DataFrame(accuracy, index=class_models.keys())
accuracy_df

Unnamed: 0,train,test,F1-train,F1-test,true_neg,fal_pos,fal_neg,true_po
lr_class,0.88244,0.839286,0.889819,0.844828,90,26,10,98
forest_class,0.982143,0.959821,0.982659,0.96,107,9,0,108
tree_class,0.980655,0.933036,0.981241,0.935065,101,15,0,108
ada_class,0.998512,0.964286,0.998532,0.964286,108,8,0,108
knn_class,0.965774,0.919643,0.967283,0.923077,98,18,0,108
bag_class,0.940476,0.933036,0.944444,0.935065,101,15,0,108
svc,0.99256,0.928571,0.992593,0.928571,104,12,4,104
grad,1.0,0.964286,1.0,0.964286,108,8,0,108


In [414]:
master = test_master[features]
master_sc = ss.fit_transform(master)
predicted_player = pd.DataFrame(index=master.index)
predicted_player[['Player_name', 'DRAFT_YEAR+1']] = test_master[['Player_name', 'DRAFT_YEAR+1']]
for (model_name, model) in class_models.items():
    predicted_player[model_name] = model.predict(master_sc)
predicted_player['tot'] = predicted_player['lr_class'] + predicted_player['forest_class'] + predicted_player['tree_class'] + predicted_player['ada_class'] + predicted_player['knn_class'] + predicted_player['bag_class'] + predicted_player['svc'] + predicted_player['grad'] 
predicted_player.sort_values(by='tot', ascending=False)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [30]:
params = {
    'max_depth' : [3, 4],
    'min_samples_leaf' : [2, 3],
    'min_samples_split' : [6, 7, 8],
    'n_estimators' : [30, 35, 37],
    "learning_rate" : (np.logspace(-1.6, -1, 20))
}
gs = GridSearchCV(
    GradientBoostingClassifier(),
    params,
    cv=3,
    verbose=1,
    return_train_score=False,
    n_jobs=2)
gs.fit(X_train, y_train)
print(gs.best_score_)
print()
print(gs.best_params_)
print()
print(gs.score(X_test, y_test))
pred = model.predict(X_test)
f1_score(y_test, pred)

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 340 tasks      | elapsed:    5.0s
[Parallel(n_jobs=2)]: Done 1540 tasks      | elapsed:   22.7s


0.9209809264305178

{'learning_rate': 0.05589441576403378, 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 37}

0.8943089430894309


[Parallel(n_jobs=2)]: Done 2160 out of 2160 | elapsed:   32.6s finished


0.26666666666666666