In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics as sm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import SCORERS
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import KFold


In [4]:
# Mount Google Drive at the relative path 'drive' so files stored there can be read.
# NOTE(review): the data-loading cell reads from '/content/drive/...', so this
# relies on the working directory being /content when the mount happens - confirm.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [5]:
#function to test accuracy
def print_error(regressor, X_train, y_train, X_test, y_test):
  """Print RMSE and R^2 of a fitted regressor on the train and test splits.

  Args:
    regressor: fitted estimator exposing ``predict(X)``.
    X_train, y_train: features and targets of the training split.
    X_test, y_test: features and targets of the held-out split.

  Returns:
    None; the four metrics are written to stdout.
  """
  train_pred = regressor.predict(X_train)
  test_pred = regressor.predict(X_test)
  # RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword is
  # deprecated and later removed in newer scikit-learn releases, while the
  # explicit square root works on every version (same value for 1-D targets).
  print("train rmse error: ", np.sqrt(sm.mean_squared_error(y_train, train_pred)))
  print("train r2 error: ", sm.r2_score(y_train, train_pred))
  print("test rmse error: ", np.sqrt(sm.mean_squared_error(y_test, test_pred)))
  print("test r2 error: ", sm.r2_score(y_test, test_pred))




In [6]:
#load dataset
# Root folder (on the mounted Drive) holding one sub-directory per split.
DATA_DIR = '/content/drive/My Drive/nba-project-data'

def _load_split(split):
  """Load the (X, y) arrays of one split directory ('train', 'test' or 'final').

  Both CSV files are read with their header row skipped.
  """
  X = np.loadtxt('{}/{}/X.csv'.format(DATA_DIR, split), delimiter=',', skiprows=1)
  y = np.loadtxt('{}/{}/y.csv'.format(DATA_DIR, split), delimiter=',', skiprows=1)
  return X, y

X_train, y_train = _load_split('train')
X_test, y_test = _load_split('test')
X_final, y_final = _load_split('final')

Model 1: Linear Regression

In [7]:
# Baseline: ordinary least squares fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linear_regressor = LinearRegression().fit(X_train, y_train)

# One learned coefficient per input feature.
print(linear_regressor.coef_)

# RMSE / R^2 on both splits, followed by the estimator's own score() on the test split.
print_error(linear_regressor, X_train, y_train, X_test, y_test)
print(linear_regressor.score(X_test, y_test))



[-7.49769711e-02 -6.25875888e-02 -6.66417107e-02 -6.65994166e-02
 -6.75333076e-02 -9.04974074e-02 -3.59187891e-02  1.72968941e-02
  5.81409308e-03 -1.56774678e+00  3.68069095e+00 -4.88025296e+00
 -2.66007886e+00 -2.91305526e+00  8.41676337e-02 -7.71428122e-01
 -3.02478031e+00  4.56071665e-01  1.09334659e+01  1.67699058e-02
 -4.43244159e-01 -2.38483410e-01 -2.73936940e+00 -2.00826139e+00
  2.86161357e+00  7.28478867e-01 -1.39456305e+00  1.03570460e+00
  5.49139419e-01 -1.51243656e+00  9.81760780e-01 -2.08858831e-01
 -5.93753635e+00 -6.91297187e-01  7.21020183e-01 -9.11340860e-02
 -1.48311238e-01  1.83855116e-01  2.71768014e-01 -1.10186394e-01
  5.23961139e-03  1.23907180e-01  2.11597052e+00  2.07722430e+00
 -1.56275742e+00  9.02645340e+00 -3.82854343e+00 -3.68952472e+00
  3.83866473e+00 -6.08577959e-01  5.77997637e-02  4.92078910e-02
  7.25104145e-01 -1.20782114e-01]
train rmse error:  4.86482180052702
train r2 error:  0.6108929437069264
test rmse error:  4.8980813171311395
test r2 erro

Model 2: XGBoost

Part 1: Hyperopt for Hyperparameter Tuning

In [8]:

#set up input space of xgboost hyperparameters
# 'n_estimators' is held fixed at 1000; every other entry is a hyperopt
# distribution (uniform = continuous range, quniform = range quantised to step 1).
space = dict(
    n_estimators=1000,
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
)
#rmse-scorer:
def rmse(estimator, X, y):
  """Scorer callable returning the root-mean-squared error of ``estimator`` on (X, y).

  Args:
    estimator: fitted model exposing ``predict(X)``.
    X: feature matrix to predict on.
    y: true target values for ``X``.

  Returns:
    The RMSE between ``y`` and ``estimator.predict(X)`` (lower is better).
  """
  y_pred = estimator.predict(X)
  # sqrt(MSE) instead of `squared=False`: that keyword is deprecated and later
  # removed in newer scikit-learn releases; the value is the same for 1-D targets.
  return np.sqrt(sm.mean_squared_error(y, y_pred))

#set up objective function
def hp_tuning(space):
  """Hyperopt objective: mean 10-fold cross-validated RMSE of an XGBoost regressor.

  Args:
    space: dict of sampled hyperparameters; the keys read here are
      'n_estimators', 'max_depth', 'gamma', 'reg_alpha',
      'min_child_weight' and 'colsample_bytree'.

  Returns:
    dict with 'loss' (mean RMSE over the folds, minimised by fmin) and
    'status' set to STATUS_OK.
  """
  XGB_regressor = xgb.XGBRegressor(
      n_estimators=space['n_estimators'],
      # quantised samples arrive as floats (e.g. 5.0), so cast to int
      max_depth=int(space['max_depth']),
      gamma=space['gamma'],
      reg_alpha=int(space['reg_alpha']),
      min_child_weight=space['min_child_weight'],
      colsample_bytree=space['colsample_bytree'],
      objective='reg:squarederror')
  # NOTE(review): the previous call ended with an unterminated string literal and
  # a bare `learning_rate` argument, which is a syntax error. No learning rate is
  # sampled in the search space, so the library default is left in effect.
  # NOTE(review): if the search space contains 'reg_lambda', it is not forwarded
  # to the regressor here - confirm whether that is intended.

  # Scores come from the module-level `rmse` scorer on the global training split.
  cv_score = np.mean(cross_val_score(XGB_regressor, X_train, y_train, cv=10, scoring=rmse))
  return {'loss': cv_score, 'status': STATUS_OK}

# Trials object that fmin fills with the outcome of every evaluation.
trials = Trials()

#perform tuning
# Settings for the search: TPE suggestions, 100 evaluations of hp_tuning.
search_settings = dict(
    fn=hp_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)
best = fmin(**search_settings)
print(best)
  

100%|██████████| 100/100 [1:01:24<00:00, 36.84s/it, best loss: 4.095423337665675]
{'colsample_bytree': 0.9569379116779869, 'gamma': 5.141912030539592, 'max_depth': 5.0, 'min_child_weight': 1.0, 'reg_alpha': 103.0, 'reg_lambda': 0.6429136368345092}


In [22]:
# Refit XGBoost on the training split with the hyperparameters found by the search.
# Quantised values in `best` are floats (e.g. 5.0), hence the int() casts.
# NOTE(review): `best` also carries 'reg_lambda', which is not applied here - confirm.
model = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=int(best['max_depth']),
    gamma=best['gamma'],
    reg_alpha=int(best['reg_alpha']),
    min_child_weight=best['min_child_weight'],
    colsample_bytree=best['colsample_bytree'],
    objective='reg:squarederror',
)
model.fit(X_train, y_train)

# The training CSV is re-read with pandas only to recover the feature names from its header.
df = pd.read_csv('/content/drive/My Drive/nba-project-data/train/X.csv', delimiter=',')

# Pair each importance with its column name and order from most to least important.
ranked = sorted(zip(model.feature_importances_, df.columns), reverse=True)
imp, columns = zip(*ranked)
for feature_name, importance in zip(columns, imp):
  print('{}: {}'.format(feature_name, importance))

PTS: 0.2736038267612457
FG: 0.21478089690208435
MP: 0.1534663438796997
YearsPro: 0.06920067965984344
DRB: 0.04899637773633003
FGA: 0.04192399978637695
2P: 0.024326307699084282
TOV: 0.018723685294389725
FTA: 0.01759047619998455
2PA: 0.014656401239335537
TRB: 0.011863085441291332
PER: 0.007199614308774471
Age: 0.005950266495347023
G: 0.00509476475417614
GS: 0.004207727499306202
Height: 0.0041333953849971294
PF: 0.003961632028222084
DWS: 0.003889830783009529
STL%: 0.003620442468672991
PG%: 0.0035661086440086365
BLK%: 0.0033804248087108135
C%: 0.003286132588982582
AST: 0.003215471049770713
OBPM: 0.0032105809077620506
WS: 0.0027355875354260206
PF%: 0.002666639629751444
SF%: 0.0026212921366095543
Weight: 0.0026195866521447897
eFG%: 0.0024647237733006477
USG%: 0.0023909476585686207
DRB%: 0.002351227216422558
3P%: 0.0023233452811837196
STL: 0.002297047758474946
Season: 0.00221514655277133
WS/48: 0.0021301249507814646
DBPM: 0.0021258462220430374
FTr: 0.002041521715000272
BPM: 0.0020169578492641