# Import the packages, algorithms and metrics

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Load in data and create dataframe

In [None]:
# Read the 2016-2019 NBA stats CSV into the working dataframe and report
# its shape and column names.
df = pd.read_csv(
    '../input/nba-stats-20162019-seasons/2016-2019_total stats.csv'
)
print(df.shape, list(df.columns))

# Remove the cap on displayed columns, then preview the first ten rows
# (the trailing expression is what the notebook cell renders).
pd.set_option("display.max.columns", None)
df.iloc[:10]

In [None]:
# Checking out the data: print the summary produced by DataFrame.info()
# for the training dataframe.

df.info()

# Visualizing the data

In [None]:
#Import matplotlib

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
PTS_column = df["PTS"]
TRB_column = df["TRB"]
AST_column = df["AST"]
WS_Column = df["WS"]

In [None]:
# Points Histogram
# Right skew here with majority of players scoring around 5 points per game
PTS_column.plot(kind="hist")

In [None]:
# Rebounds Histogram
# Right skew here with majority of players getting between 2-4 rebounds a game

TRB_column.plot(kind="hist")

In [None]:
# Assists Histogram
# Right skew here with majority of players getting between 0-2 assists a game

AST_column.plot(kind="hist")

In [None]:
# Win shares Histogram
WS_Column.plot(kind="hist")

In [None]:
#Players with highest points per game seasons between 2016-2019

#James Harden
#James Harden
#Russell Westbrook
#Bradley Beal
#James Harden

top_5 = df.sort_values(by="PTS", ascending=False).head()

top_5.plot(x="Player", y="PTS", kind="bar", rot=10, fontsize=8.5)

In [None]:
#Players with highest assists per game seasons between 2016-2019

#James Harden
#Russell Westbrook
#John Wall
#Russell Westbrook
#Russell Westbrook

top_5ast = df.sort_values(by="AST", ascending=False).head()

top_5ast.plot(x="Player", y="AST", kind="bar", rot=10, fontsize=8.5)

In [None]:
#Players with highest rebounds per game seasons between 2016-2019

#Andre Drummond
#Andre Drummond
#Andre Drummond
#DeAndre Jordan
#Hassan Whiteside

top_5reb = df.sort_values(by="TRB", ascending=False).head()

top_5reb.plot(x="Player", y="TRB", kind="bar", rot=10, fontsize=8.5)

In [None]:
#Players with highest win shares between 2016-2019

#James Harden
#James Harden
#James Harden
#Rudy Gobert
#Giannis Antetokounmpo

top_5W = df.sort_values(by="WS", ascending=False).head()

top_5W.plot(x="Player", y="WS", kind="bar", rot=10, fontsize=8.5)

In [None]:
#Correlation between usage rate and points scored
#Fairly strong correlation

df.plot(x="USG%", y="PTS", kind="scatter")

In [None]:
#Correlation between 3 points attempted per game and points scored
#Stronger correlation than usage rate

df.plot(x="3PA", y="PTS", kind="scatter")

In [None]:
#Correlation between 3 points made and points scored
#Strong correlation

df.plot(x="3P", y="PTS", kind="scatter")

In [None]:
#Correlation between win shares and points scored
#Strong correlation

df.plot(x="WS", y="PTS", kind="scatter")

# Replace null/blank values with 0

In [None]:
# Replace every missing value in df with 0, modifying df in place.
df.fillna(0, inplace=True)

# Putting all the predictive features in a list (xcols)

In [None]:
# Candidate predictors: the columns at positions 7 through 34 of df.
xcols = list(df.columns[7:35])
print(xcols, '\n')

# Train-test split

## Split the data into training (80%) and testing (20%) portions

#### Notes: train_test_split shuffles the data before splitting, so the split is randomized
#### The 'random_state' parameter ensures that you get the same split of the data every time

In [None]:
# Split df into training and test partitions using every feature in xcols,
# with 'All-Star' as the target. train_size=0.8 keeps 80% of the rows for
# training and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df[xcols],
    df['All-Star'],
    train_size=0.8,
    random_state=1,
)
for label, frame in (('training data:', X_train), ('test data:', X_test)):
    print(label, frame.shape)

# Logistic regression part 1 using all of the data

In [None]:
# Baseline model: logistic regression fitted on every feature in xcols.
# Author's observation: pretty good accuracy, although there is a slight
# indication of overfitting.

log_reg = LogisticRegression(solver='lbfgs', max_iter=3000)

# Fit the model to the training data (fit() returns the estimator itself,
# so clf and log_reg refer to the same object).
clf = log_reg.fit(X_train, y_train)

# Get accuracy stats. score() is documented to return a float, so use the
# built-in round() rather than the NumPy-only .round() method; it works for
# both a Python float and a NumPy scalar.
print('training accuracy: {}'.format(round(clf.score(X_train, y_train), 3)))
print('test accuracy: {}'.format(round(clf.score(X_test, y_test), 3)))

# Exploring the coefficients

In [None]:
# Pair each feature name with its fitted coefficient and order the table
# from the largest coefficient to the smallest. Flattening coef_ lines it up
# with xcols (one coefficient per feature).
coef = pd.DataFrame({
    'feature': xcols,
    'coefficient': clf.coef_.ravel(),
})
coef = coef.sort_values(by='coefficient', ascending=False)

# Show both ends of the ranking: the first ten rows and the last ten rows.
print('Ten largest positive features:\n', coef.head(10), '\n')
print('Ten largest negative features:\n', coef.tail(10))

# Author's note: these coefficients are the fitted weights the model uses to
# separate the 1s from the 0s in the training data.

In [None]:
# Build a second feature list for retraining: the ten features at the top of
# the sorted coefficient table followed by the ten at the bottom.

ranked_features = coef['feature']
xcols2 = ranked_features.iloc[:10].to_list() + ranked_features.iloc[-10:].to_list()
print(xcols2)

# Logistic regression part 2 using only the list of features with the largest coefficients (xcols2)

In [None]:
# Re-split the data using only the xcols2 features (same train_size and
# random_state as the first split). Note that this rebinds
# X_train / X_test / y_train / y_test for every later cell.
X_train, X_test, y_train, y_test = train_test_split(df[xcols2], df['All-Star'], 
                                                    train_size=0.8, random_state=1)
print('training data:', X_train.shape)
print('test data:', X_test.shape)

# fit the model to the training data
# A fresh estimator is used instead of re-fitting log_reg: fit() returns the
# estimator itself, so re-using log_reg would silently turn the part-1 model
# (clf) into this reduced-feature model as well.
clf2 = LogisticRegression(solver='lbfgs', max_iter=3000).fit(X_train, y_train)

# get accuracy stats
# Built-in round() works whether score() returns a Python float or a NumPy
# scalar; the .round() method exists only on the latter.
print('training accuracy: {}'.format(round(clf2.score(X_train, y_train), 3)))
print('test accuracy: {}'.format(round(clf2.score(X_test, y_test), 3)))

# Model Evaluation

In [None]:
# Regularization with logistic regression: refit the model for each value of
# the C parameter and compare training/test accuracy on the current split.

cset = [.001, .01, .1, 1, 10]
for i in cset:
    print('C =', i)
    log_reg = LogisticRegression(solver='lbfgs', max_iter=3000, C=i)
    clf = log_reg.fit(X_train, y_train)
    # Built-in round() works whether score() returns a Python float or a
    # NumPy scalar; the .round() method exists only on the latter.
    print('training accuracy: {}'.format(round(clf.score(X_train, y_train), 3)))
    print('test accuracy: {}'.format(round(clf.score(X_test, y_test), 3)), '\n')

# NOTE: after the loop, clf (and log_reg) stay bound to the model from the
# final iteration (C=10). The later cells that call clf.predict() use that
# model.

In [None]:
# 5-fold cross-validation for each regularization strength in cset, run on
# all rows of df restricted to the xcols2 features. Prints the per-fold
# scores, then their mean and twice their standard deviation.

for c_value in cset:
    print('C =', c_value)
    log_reg = LogisticRegression(C=c_value, solver='lbfgs', max_iter=3000)
    scores = cross_val_score(log_reg, df[xcols2], df['All-Star'], cv=5)
    print(scores)
    mean_acc = scores.mean()
    spread = scores.std() * 2
    print("Accuracy: %0.3f (+/- %0.3f)" % (mean_acc, spread), '\n')

In [None]:
# Further test-set statistics for the current clf model.
# Author's observation: pretty good recall, precision, F1 score and
# specificity overall.

y_pred_test = clf.predict(X_test)

# Each metric takes (true labels, predicted labels); print them in turn.
for template, metric in (
    ('  Recall: {:.3f}', recall_score),
    ('  Precision: {:.3f}', precision_score),
    ('  F1 score: {:.3f}', f1_score),
):
    print(template.format(metric(y_test, y_pred_test)))

# Specificity is derived from the confusion matrix: the share of actual
# negatives that were predicted negative.
cm = confusion_matrix(y_test, y_pred_test)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
print('  Specificity: {:.3f}'.format(specificity))

In [None]:
# Print the test-set confusion matrix beneath a key naming each of its
# four cells.

sample = np.array([['TN', 'FP'], ['FN', 'TP']])
cm = confusion_matrix(y_test, y_pred_test)
print('CM key:\n', sample, '\n')
print('CM for test:\n', cm)

# Implementing grid search to find the optimal hyperparameter values

In [None]:
# Grid search: shared setup

# Define the scaler (a MinMaxScaler with default settings)

scaler = MinMaxScaler()

# X and y cover every row of df: the xcols2 features and the 'All-Star'
# target. The random-forest grid search further down fits on this X and y.
X = df[xcols2]
# NOTE(review): this scaled copy is reassigned by the next cell, which
# re-fits the scaler on X_train.
scaled_X = scaler.fit_transform(X)
y = df['All-Star']

In [None]:
# Grid search over the logistic-regression C parameter, scored by 3-fold
# cross-validated accuracy on the training partition.
#
# The scaler is placed inside a Pipeline so that, for each CV split, it is
# fitted on the training folds only. Scaling all of X_train before the search
# would let the validation fold influence the scaling and bias the scores.

# Author's observation: positive results

# Kept so that scaled_X and the fitted scaler remain available under the
# same names; they are no longer used for model selection.
scaled_X = scaler.fit_transform(X_train)

# 'logreg__C' addresses the C parameter of the pipeline step named 'logreg'.
tuned_parameters = {'logreg__C': [0.1, 0.5, 1, 5, 10, 50, 100]}

scaled_logreg = Pipeline([
    ('scaler', MinMaxScaler()),
    ('logreg', LogisticRegression(solver='liblinear')),
])

grid = GridSearchCV(scaled_logreg, tuned_parameters, cv=3, scoring="accuracy")

# Fit on the raw training features; the pipeline applies the scaling itself.
grid.fit(X_train, y_train)

print('mean of accuracies:', grid.cv_results_['mean_test_score'])
print('std dev of accuracies:', grid.cv_results_['std_test_score'])

# print best parameter after tuning 
print('best parameters:', grid.best_params_) 

# store the best estimator. This is the whole scaler + model pipeline, so it
# accepts unscaled features directly.
# NOTE: the author's earlier run (scaling outside the CV loop) reported
# 'C': 100; re-check grid.best_params_ for the current result.
best_logreg = grid.best_estimator_

# Ensemble Model: Random Forests

In [None]:
# Bagging with random forests: grid search over the number of trees and the
# fraction of rows drawn for each tree, scored by 3-fold cross-validation on
# the full X / y.

# set-up grid of parameters to search
# max_samples must be a float to mean "fraction of the rows": the integer 1
# would draw a single sample per tree, whereas 1.0 draws all of them.
param_grid = {'n_estimators': [10, 100, 250], 'max_samples': [.25, .5, 1.0]}


# instantiate grid search object
# random_state makes the forests (and therefore the scores) reproducible,
# matching the fixed random_state used for the train/test splits.
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)

# fitting the model for grid search 
grid.fit(X, y)

# print parameters, mean, and standard deviation of scores by iteration
results = grid.cv_results_
for params, mean_acc, std_acc in zip(results['params'],
                                     results['mean_test_score'],
                                     results['std_test_score']):
    print('\nparams:', params)
    print('mean of accuracies:', mean_acc)
    print('std dev of accuracies:', std_acc)

# print best parameter after tuning 
print('\n***best parameters:', grid.best_params_)
print('best score:', grid.best_score_)

# store the best estimator
# NOTE: the author's earlier run reported 'max_samples': 0.5,
# 'n_estimators': 100; re-check grid.best_params_ for the current result.
best_rf = grid.best_estimator_

# Let's Start Predicting (Using the Logistic Regression Model because it was slightly more accurate)

In [None]:
# Load the stats for the most recent season (2020-2021) to predict on.
# Credit to Basketball Reference for the data; the author only had to
# combine the source tables.

mydata = pd.read_csv(
    '../input/20202021-season-stats/2020_2021_season.csv'
)
print(mydata.shape, list(mydata.columns), '\n')
mydata

# Once again, replace null/blank values with 0

In [None]:
# Replace every missing value in mydata with 0, modifying mydata in place.
mydata.fillna(0, inplace=True)

# Creating a dataset with only the 20 features (10 largest positive and 10 largest negative coefficients) that we discovered earlier

In [None]:
# Keep only the 20 feature columns listed here.
# NOTE(review): this is a hard-coded list; presumably it mirrors xcols2 from
# the training section — confirm the two stay in sync (names and order).
mydata2 = mydata[['BLK', 'DRB', 'WS', 'STL', 'AST', 'USG%', '3P', '3P%', 'MP', '2PA', 'TOV', 'TRB', 'FGA', 'FTA', 'FT', '3PA', 'PER', 'PF', '2P', 'ORB']]

In [None]:
# Display the reduced dataframe.
mydata2

In [None]:
# Column names of mydata2, in the order selected above.
xcols3 = mydata2.columns.to_list()
print(xcols3,'\n')

# Predicting the 2020-2021 all-stars with the data

In [None]:
pred = clf.predict(mydata[xcols3])

In [None]:
print(pred)

# Adding the All-Star information back to the dataset

In [None]:
# Store the predicted labels as a new 'All-Star' column on the 2020-2021 data.
mydata['All-Star'] = pred.tolist()

In [None]:
# Print the dataframe, which now includes the predicted 'All-Star' column.
print(mydata)

# Final Results

## 2020-2021 All-Stars from my model:
#### Giannis Antetokounmpo
#### Stephen Curry
#### Luka Doncic
#### Kevin Durant
#### Joel Embiid
#### Paul George
#### James Harden
#### Kyrie Irving
#### LeBron James
#### Nikola Jokic
#### Zach LaVine
#### Kawhi Leonard
#### Damian Lillard
#### Donovan Mitchell
#### Chris Paul
#### Julius Randle
#### Domantas Sabonis
#### Jayson Tatum
#### Nikola Vucevic
#### Russell Westbrook
#### Trae Young

## 2020-2021 All-Stars in reality:
#### Giannis Antetokounmpo
#### Stephen Curry
#### Luka Doncic
#### Kevin Durant
#### Joel Embiid
#### Paul George
#### James Harden
#### Kyrie Irving
#### LeBron James
#### Nikola Jokic
#### Zach LaVine
#### Kawhi Leonard
#### Damian Lillard
#### Donovan Mitchell
#### Chris Paul
#### Julius Randle
#### Domantas Sabonis
#### Jayson Tatum
#### Nikola Vucevic
#### Jaylen Brown
#### Rudy Gobert
#### Ben Simmons
#### Bradley Beal
#### Mike Conley
#### Trae Young