In [62]:
#############################################################
#
# NBA Individual Player Performance Prediction
#
#############################################################
import sys, os
sys.path.append("../..")
sys.path.append("..")
sys.path.append(os.getcwd())

from matplotlib import pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd
import copy
import pickle

plt.rcParams.update({'font.size': 14})

In [63]:
def get_gamma(y, x, alpha, window):
    # y : refence to calculate the mean/std
    # x : evaluate this based on men/std(y)
    # window = rolling window size
    # alpha = +- alpha * std
    roll_mean = y.rolling(window).mean()[window:]
    roll_std = y.rolling(window).std()[window:]
    gamma = np.zeros(len(x[window:]))
    gamma[x[window:] > roll_mean + roll_std * alpha] = 1
    gamma[x[window:] < roll_mean - roll_std * alpha] = -1
      
    # 1 = above mean + alpha*std
    # -1 = below mean - alpha*std
    # 0 = between mean +- alpha*std
    gamma = gamma.astype(int)
    return list(gamma)

In [309]:
"""
import data
"""

target_players = ['LeBron James', 'Kevin Durant', 'Stephen Curry', 'Russell Westbrook', 'James Harden', 
          'Giannis Antetokounmpo', 'Anthony Davis', 'Jimmy Butler', 'Draymond Green', 'Chris Paul',
          'Klay Thompson', 'John Wall', 'Paul George', 'DeMarcus Cousins', 'Rudy Gobert', 'Kyle Lowry',
          'Paul Millsap', 'Blake Griffin', 'Damian Lillard', 'DeAndre Jordan', 'Kyrie Irving',
          'Al Horford', 'DeMar DeRozan', 'Kevin Love', 'Andre Drummond', 'Carmelo Anthony', 'LaMarcus Aldridge',
          'Kemba Walker', 'Eric Bledsoe', 'Dwight Howard', 'Eric Gordon', 'George Hill', 'Jeff Teague', 
          'Andrew Wiggins', 'Serge Ibaka', 'Avery Bradley', 'Trevor Ariza', 'Devin Booker', 'Bradley Beal',
          'Karl-Anthony Towns', 'Marc Gasol', 'Khris Middleton']


# edit column names to fit with the yearly data
game_metrics = ['playPTS', 'playAST', 'playTO','playFG%','playFT%','play3PM','playTRB','playSTL', 'playBLK']
year_metrics = ['PTS_G','AST_G','TOV_G','TRB_G','STL_G','BLK_G','3P_G','FG%','FT%']
colname_dict = {'playPTS': 'PTS_G', 'playAST': 'AST_G', 'playTO':'TOV_G',
                'playFG%': 'FG%','playFT%':'FT%','play3PM':'3P_G',
                'playTRB':'TRB_G','playSTL':'STL_G','playBLK':'BLK_G'}

print("*** importing data ***")
data = pd.read_csv("../data/nba-enhanced-stats/2012-18_playerBoxScore.csv")

print("*** updating column names ***")
data = data.rename(columns=colname_dict)

print("*** updating dates ***")
date_col = pd.to_datetime(data.gmDate + " " + data.gmTime, format='%Y-%m-%d %H:%M').rename("date")
data = pd.concat([date_col,data], axis=1)

print("*** sorting columns ***")
stats_game = data[["date","gmDate","playDispNm"]+year_metrics]
stats_game = stats_game.rename(columns={"playDispNm": "Player"})

print("*** adding team info ***")
# obtain team info
df = pd.read_pickle("../data/nba-hosoi/nba_scores_2103-2018.pkl")
df = df[["nbaId","path","game_date","home","away","season"]].drop_duplicates().reset_index(drop=True)

a = pd.concat([df,(df["game_date"] + str(" ") + df["home"]).rename("key")], axis=1)
b = pd.concat([df,(df["game_date"] + str(" ") + df["away"]).rename("key")], axis=1)
appended = pd.concat([a,b], axis=0)

new_data = pd.concat([data, (data["gmDate"] + str(" ") + data["teamAbbr"]).rename("key")], axis=1)
data_fin = new_data.merge(appended, how='left', left_on='key', right_on='key')

stats_game = data_fin[["date","gmDate","gmTime","nbaId","teamLoc","playDispNm"]+year_metrics]
stats_game = stats_game.rename(columns={"playDispNm": "Player"})

print("*** completed ***")
stats_game.head()

*** importing data ***
*** updating column names ***
*** updating dates ***
*** sorting columns ***
*** adding team info ***
*** completed ***


Unnamed: 0,date,gmDate,gmTime,nbaId,teamLoc,Player,PTS_G,AST_G,TOV_G,TRB_G,STL_G,BLK_G,3P_G,FG%,FT%
0,2012-10-30 19:00:00,2012-10-30,19:00,,Away,A.J. Price,7,6,1,2,0,0,2,0.1538,1.0
1,2012-10-30 19:00:00,2012-10-30,19:00,,Away,Trevor Ariza,9,4,0,3,3,2,2,0.375,0.5
2,2012-10-30 19:00:00,2012-10-30,19:00,,Away,Emeka Okafor,10,0,1,7,0,4,0,0.4,0.5
3,2012-10-30 19:00:00,2012-10-30,19:00,,Away,Bradley Beal,8,3,2,3,1,0,2,0.25,1.0
4,2012-10-30 19:00:00,2012-10-30,19:00,,Away,Trevor Booker,4,1,4,1,1,1,0,0.2222,0.0


# For a player, For a metric:

In [362]:
player = "Stephen Curry"
player = "LeBron James"

metric = "PTS_G"

# train/test and validation split (percentage)
train = 0.4
val = 0.6

# wndow, alpha to define gamma(labels)
window = 50
alpha = 0.5

In [363]:
# select player's data
stats_player = stats_game[(stats_game.Player == player_name)]

# obtain train/val/test dates
total_rows = stats_player.shape[0]
train_date = stats_player.iloc[int(total_rows*train)]['gmDate']
val_date = stats_player.iloc[int(total_rows*val)]['gmDate']

stats_train = stats_player[(stats_player.date < train_date)]
stats_val = stats_player[(stats_player.date > train_date) & (stats_player.date < val_date)]
stats_test = stats_player[(stats_player.date > val_date)]

print("* Train/Val/Test split result")
print(stats_train.shape[0], "games in train set")
print(stats_val.shape[0], "games in validation set")
print(stats_test.shape[0], "games in test set")
print()

* Train/Val/Test split result
178 games in train set
89 games in validation set
178 games in test set



In [364]:
def create_label(stats, metric, alpha, window):
    label = np.array(get_gamma(stats[metric],stats[metric],alpha,window))
    print("* label shape: ", label.shape)
    return label

def create_features(stats, metric, alpha, window):
    # previous history (in window)
    f = stats[metric][window-1:-1]
    for i in range(1,window):
        f = np.vstack((f, stats[metric][window-1-i:-1-i]))
    # rolling mean
    f1 = stats[metric].rolling(window).mean()[window:].values
    # rolling std
    f2 = stats[metric].rolling(window).std()[window:].values
    # home/away, home=True=1
    f3 = stats.teamLoc[window-1:-1].values
    f3 = (f3 == 'Home')
    features = np.vstack((f, f1, f2, f3)).T

    print("* features shape: ", features.shape)
    # just to double check
    print("* features preview")
    print(np.vstack((f[:,:5], f1[:5], f2[:5], f3[:5])))
    return features

In [366]:
print("Train")
label_train = create_label(stats_train, metric, alpha, window)
features_train = create_features(stats_train, metric, alpha, window)
print()
print("Validation")
label_val = create_label(stats_val, metric, alpha, window)
features_val = create_features(stats_val, metric, alpha, window)
print()
print("Test")
label_test = create_label(stats_test, metric, alpha, window)
features_test = create_features(stats_test, metric, alpha, window)

Train
* label shape:  (128,)
* features shape:  (128, 53)
* features preview
[[20.         19.         18.         38.         54.        ]
 [29.         20.         19.         18.         38.        ]
 [27.         29.         20.         19.         18.        ]
 [18.         27.         29.         20.         19.        ]
 [32.         18.         27.         29.         20.        ]
 [14.         32.         18.         27.         29.        ]
 [ 7.         14.         32.         18.         27.        ]
 [29.          7.         14.         32.         18.        ]
 [17.         29.          7.         14.         32.        ]
 [26.         17.         29.          7.         14.        ]
 [21.         26.         17.         29.          7.        ]
 [31.         21.         26.         17.         29.        ]
 [28.         31.         21.         26.         17.        ]
 [20.         28.         31.         21.         26.        ]
 [29.         20.         28.         31.

# Model fitting

In [367]:
def fit(clf, features_train, label_train, features_val, label_val, features_test, label_test):
    clf.fit(features_train, label_train)
    print("Train accuracy     : ", clf.score(features_train, label_train))
    print("Validation accuracy: ", clf.score(features_val, label_val))
    print("Test accuracy      : ", clf.score(features_test, label_test))
    print()
    print("Train pred")
    print(clf.predict(features_train))
    print("Val pred")
    print(clf.predict(features_val))
    print("Test pred")
    print(clf.predict(features_test))
    print()

In [368]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB(alpha=0.1, binarize=0)
fit(clf, features_train, label_train, features_val, label_val, features_test, label_test)

Train accuracy     :  0.390625
Validation accuracy:  0.38461538461538464
Test accuracy      :  0.421875

Train pred
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Val pred
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Test pred
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]



In [369]:
# Nearest Neighbors
from sklearn import neighbors

n_neighbors = 10
for weights in ['uniform', 'distance']:
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    print("*", weights)
    fit(clf, features_train, label_train, features_val, label_val, features_test, label_test)

* uniform
Train accuracy     :  0.4453125
Validation accuracy:  0.3333333333333333
Test accuracy      :  0.34375

Train pred
[ 0  0  0  0  0 -1  0 -1  1  0 -1  1  0  0  0  0  0  0  0  1  1 -1  0  0
 -1 -1  1  0  0  1  1  0  0  0 -1  0 -1  0  1 -1  0  1  0  0  0  0  1  1
  0  1  0  1  1  1  0 -1 -1  1 -1 -1 -1 -1 -1  0  1  0  0  0 -1  1  0  0
  0  0  0 -1  0  0 -1  1  0  0 -1  1 -1 -1  1  0 -1  0  0  0  1  0  1  0
 -1  0  0  0  1  0  0 -1  0  0  0 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  0  1  0
  1  0  0  0  0  0  0  0]
Val pred
[ 0  1 -1 -1  1  1  1  1  0  0  1  0 -1  0 -1  0  1  0  1 -1 -1  1  0 -1
  0  0  1  1 -1  0  0  0 -1  0 -1 -1 -1  0  0]
Test pred
[ 1  0 -1  1 -1 -1  0 -1  0 -1 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1 -1  0  1
  0  0 -1 -1 -1  0 -1  0 -1  0  0  1  1  0  0  1  1  0 -1 -1  1 -1  0  0
  0  1  1 -1 -1 -1 -1  1 -1 -1  1 -1 -1 -1  0 -1  0  0  0  0  0  0 -1  1
  1  0  0  0  0  0  0  0 -1  0 -1 -1  1  1 -1 -1  1 -1  0 -1  0  0  0  0
 -1  1  0  0 -1 -1  1  1  0  0 -1 -1 -1 -1 -1 -1  0  

In [370]:
from sklearn.svm import SVC
clf = SVC(C=1, kernel='rbf', gamma='auto',decision_function_shape='ovo', max_iter = 1000)
fit(clf, features_train, label_train, features_val, label_val, features_test, label_test)

Train accuracy     :  1.0
Validation accuracy:  0.38461538461538464
Test accuracy      :  0.421875

Train pred
[ 0  0  1  1  0  1  0 -1  0 -1  0  1 -1  1  1  0  1  0 -1  1  0 -1  0  0
  0  1  1 -1 -1  1  0 -1 -1  0  0  0 -1  0  0 -1  1  1  1  0  0  0  1  1
 -1  0  0  0 -1 -1 -1 -1  1 -1  1  0 -1 -1  1 -1  0  1  0  0  1  1  0  0
  1 -1  1  0  0  0 -1  0 -1 -1 -1  0  1 -1 -1 -1 -1 -1 -1  0  1  0  1  0
  1  1  0 -1 -1  1  0  1  1  1  0  1  0  0  1  0 -1 -1 -1  1 -1 -1  1  0
  0 -1  0 -1  0  0  0  1]
Val pred
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Test pred
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]



'0.21.3'