In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")


# This notebook explores trying to predict the total score with LASSO instead of clustering. We found that this did not work well at all as the most important parameter was overtime which you cannot know ahead of the game. If we did not include OT it could only predict the score with 7% training accuracy 

In [47]:
nba = pd.read_csv('nbafull.csv')
nba.head()

Unnamed: 0,date,start_time,vis_team,home_team,vis_points,home_points,ot,attendance,home_initial,vis_initial,...,hp1,hp2,hp3,hp4,hp5,vis_city,home_city,home_state,home_temp,travel_miles
0,2019-10-22,20:00:00,New Orleans Pelicans,Toronto Raptors,122,130,1,20787,TOR,NOP,...,Kyle Lowry,Fred VanVleet,Pascal Siakam,OG Anunoby,Marc Gasol,New Orleans,Toronto,ON,57.1,1113.2
1,2019-10-22,22:30:00,Los Angeles Lakers,Los Angeles Clippers,102,112,0,19068,LAC,LAL,...,Kawhi Leonard,Patrick Beverley,Landry Shamet,Patrick Patterson,Ivica Zubac,Los Angeles,Los Angeles,CA,76.54,0.0
2,2019-10-23,19:00:00,Chicago Bulls,Charlotte Hornets,125,126,0,15424,CHO,CHI,...,P.J. Washington,Cody Zeller,Dwayne Bacon,Miles Bridges,Terry Rozier,Chicago,Charlotte,NC,57.13,587.1
3,2019-10-23,19:00:00,Detroit Pistons,Indiana Pacers,119,110,0,17923,IND,DET,...,Myles Turner,Malcolm Brogdon,Domantas Sabonis,T.J. Warren,Jeremy Lamb,Detroit,Indianapolis,IA,42.54,488.3
4,2019-10-23,19:00:00,Cleveland Cavaliers,Orlando Magic,85,94,0,18846,ORL,CLE,...,Nikola Vučević,Aaron Gordon,Evan Fournier,D.J. Augustin,Jonathan Isaac,Cleveland,Orlando,FL,76.45,893.8


In [48]:
nba.columns

Index(['date', 'start_time', 'vis_team', 'home_team', 'vis_points',
       'home_points', 'ot', 'attendance', 'home_initial', 'vis_initial',
       'home_players', 'home_bench', 'vis_players', 'vis_bench', 'referees',
       'score_starter_home', 'score_bench_home', 'score_starter_vis',
       'score_bench_vis', 'recovery_time', 'perform', 'def_perf_home',
       'off_perf_home', 'def_perf_vis', 'off_perf_vis', 'total_score', 'class',
       'frac_season', 'day_hour', 'ref1', 'ref2', 'ref3', 'vp1', 'vp2', 'vp3',
       'vp4', 'vp5', 'hp1', 'hp2', 'hp3', 'hp4', 'hp5', 'vis_city',
       'home_city', 'home_state', 'home_temp', 'travel_miles'],
      dtype='object')

In [49]:
nba = pd.concat([nba,nba['referees'].str.get_dummies(sep=',')],axis=1)
#nba = pd.concat([nba,nba['home_players'].str.get_dummies(sep=',')],axis=1)
#nba = pd.concat([nba,nba['vis_players'].str.get_dummies(sep=',')],axis=1)


# I tried including all of the one hot encodes but it just overwhelmed LASSO and I got test scores that were way bad so I dropped them for the time being

In [50]:
nba = nba.drop(columns = nba.columns[nba.dtypes=='object'].tolist())
nba

Unnamed: 0,vis_points,home_points,ot,attendance,score_starter_home,score_bench_home,score_starter_vis,score_bench_vis,recovery_time,perform,...,Terance Mann,Tom Washington,Tony Brothers,Tony Brown,Tre Maddox,Tyler Cook,Tyler Ford,Yuta Watanabe,Zach Norvell,Zach Zarba
0,122,130,1,20787,79.329620,54.083466,80.353980,55.202308,,,...,0,0,0,1,0,0,0,0,0,0
1,102,112,0,19068,57.011597,60.408231,76.038714,34.944992,,,...,0,0,0,0,0,0,0,0,0,0
2,125,126,0,15424,60.023165,60.341657,62.076264,58.108285,,,...,0,0,0,0,0,0,0,0,0,0
3,119,110,0,17923,77.901700,49.441942,57.526521,76.588732,,,...,0,0,0,0,0,0,0,0,0,0
4,85,94,0,18846,75.108187,48.639881,73.763983,28.930314,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,131,107,0,18064,60.028557,59.373173,81.079804,76.106040,5.0,28.000000,...,0,0,1,0,0,0,0,0,0,0
967,106,124,0,20172,78.115632,44.975505,42.101748,41.904619,7.0,25.000000,...,0,0,0,0,0,0,0,0,0,0
968,136,131,1,15393,81.600373,46.674671,55.688031,52.342454,3.0,17.000000,...,0,0,0,0,0,0,0,0,0,0
969,109,98,0,19600,50.473936,57.937381,59.412512,35.107792,5.0,17.666667,...,0,0,0,0,0,0,0,0,0,0


In [51]:
nba = nba.fillna(0)

In [52]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Split the data in to test and train. Now just focused on total score

In [53]:
from sklearn.model_selection import train_test_split
#y = nba[['vis_points','home_points']]
y = nba.total_score.copy()
#y = nba.vis_points
X = nba.drop(columns = ['vis_points','home_points','total_score','ot','class'])

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 609)

In [54]:
def mse(y,y_pred,r):
    return np.round(np.sum((y-y_pred)**2)/len(y),r)

In [55]:
#my lasso pipeline 
alpha = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]
n=10

# These will hold our coefficient estimates
lasso_coefs = np.empty((len(alpha),n))

# for each alpha value
for i in range(len(alpha)):
    
    # set up the lasso pipeline
    lasso_pipe = Pipeline([('scale',StandardScaler()),
                          ('lasso',Lasso(alpha = alpha[i], normalize=False, max_iter = 1e5))])

    
    # fit the lasso
    lasso_pipe.fit(X_train,y_train)
    
    # record the coefficients
    
    train_score=lasso_pipe.score(X_train,y_train)
    test_score=lasso_pipe.score(X_test,y_test)
    coeff_used = np.sum(lasso_pipe['lasso'].coef_!=0)
    print('alpha = ',alpha[i])
    print("training score:", train_score )
    print("test score: ", test_score)
    print("number of features used: ", coeff_used)
    print("number of features eliminated: ", np.sum(lasso_pipe['lasso'].coef_==0)-coeff_used)
    print("The training MSE is",mse(y_train,lasso_pipe.predict(X_train),4))
    print("The testing MSE is",mse(y_test,lasso_pipe.predict(X_test),4))
    print(X_train.columns[lasso_pipe['lasso'].coef_!=0])
    print()
    print()

    #sns.set(font_scale=1.2) 

    #sns.barplot(x=lasso_pipe['lasso'].coef_, y=X_train.columns,)

    #plt.title("Coefficients in the Lasso Model")
    #plt.show()

alpha =  1e-05
training score: 0.19407747242101447
test score:  -0.053172366550761074
number of features used:  119
number of features eliminated:  -113
The training MSE is 324.9683
The testing MSE is 448.5298
Index(['attendance', 'score_starter_home', 'score_bench_home',
       'score_starter_vis', 'score_bench_vis', 'recovery_time', 'perform',
       'def_perf_home', 'off_perf_home', 'def_perf_vis',
       ...
       'Talen Horton-Tucker', 'Terance Mann', 'Tom Washington',
       'Tony Brothers', 'Tony Brown', 'Tre Maddox', 'Tyler Cook', 'Tyler Ford',
       'Yuta Watanabe', 'Zach Zarba'],
      dtype='object', length=119)


alpha =  0.0001
training score: 0.19407296503867322
test score:  -0.053130626454519714
number of features used:  119
number of features eliminated:  -113
The training MSE is 324.9701
The testing MSE is 448.512
Index(['attendance', 'score_starter_home', 'score_bench_home',
       'score_starter_vis', 'score_bench_vis', 'recovery_time', 'perform',
       'def_perf_

# alpha =1 seems simplest and explains the data pretty well. Alpha =0.1 also good but only increases the scores by a few percent. 

In [56]:
lasso_pipe = Pipeline([('scale',StandardScaler()),
                      ('lasso',Lasso(alpha = 1, normalize=False, max_iter = 1e5))])


# fit the lasso
lasso_pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=1, copy_X=True, fit_intercept=True,
                       max_iter=100000.0, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [None]:
nbaslice = nba[X.columns[lasso_pipe['lasso'].coef_ !=0].tolist()]
nbaslice['total_score'] = nba.total_score.copy()
nbaslice

In [None]:
sns.pairplot(nbaslice)

In [None]:
y_train_pred = lasso_pipe.predict(X_train)
print("The training MSE is",mse(y_train,y_train_pred,4))

In [28]:
print("The testing MSE is",mse(y_test,lasso_pipe.predict(X_test),4))

The testing MSE is 420.1842
