In [1]:
import pandas as pd
from scipy import stats
import math
import numpy as np
import warnings
import pickle

from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, cross_val_predict, train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../sc_stats.csv')
df.head()

Unnamed: 0,game_season,game_result,game_location,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,...,orb,drb,trb,ast,stl,blk,pf,tov,pts,plus_minus
0,1,L (-1),home,35:39,7,12,0.583,0,1,0.0,...,1,1,2,7,4,0,2,2,14,7
1,2,L (-22),away,39:05,5,9,0.556,2,3,0.667,...,0,2,2,4,1,0,5,3,12,-19
2,3,W (+8),home,28:27,3,6,0.5,1,2,0.5,...,0,5,5,9,2,0,4,1,7,-4
3,4,L (-28),home,21:32,1,5,0.2,0,1,0.0,...,0,1,1,3,0,0,6,0,5,-13
4,5,L (-13),away,31:15,4,8,0.5,1,2,0.5,...,1,3,4,6,0,0,4,5,9,-5


In [3]:
def convert_game_result_to_int(result):
    first_paran_idx = result.index('(')
    second_paran_idx = result.index(')')
    return int(result[first_paran_idx + 1: second_paran_idx])

In [4]:
def convert_game_result_to_wl(result):
    return result[0]

In [5]:
df['game_res_int'] = df['game_result'].apply(lambda row: convert_game_result_to_int(row))
df['game_res_wl'] = df['game_result'].apply(lambda row: convert_game_result_to_wl(row))

In [6]:
def convert_mp(mp):
    minutes, sec = mp.split(':')
    return int(minutes) + (int(sec) / 60)

In [7]:
df['tot_time'] = df['mp'].apply(lambda row: convert_mp(row))

In [8]:
df.head()

Unnamed: 0,game_season,game_result,game_location,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,...,ast,stl,blk,pf,tov,pts,plus_minus,game_res_int,game_res_wl,tot_time
0,1,L (-1),home,35:39,7,12,0.583,0,1,0.0,...,7,4,0,2,2,14,7,-1,L,35.65
1,2,L (-22),away,39:05,5,9,0.556,2,3,0.667,...,4,1,0,5,3,12,-19,-22,L,39.083333
2,3,W (+8),home,28:27,3,6,0.5,1,2,0.5,...,9,2,0,4,1,7,-4,8,W,28.45
3,4,L (-28),home,21:32,1,5,0.2,0,1,0.0,...,3,0,0,6,0,5,-13,-28,L,21.533333
4,5,L (-13),away,31:15,4,8,0.5,1,2,0.5,...,6,0,0,4,5,9,-5,-13,L,31.25


In [9]:
df.drop(['mp', 'game_result'], axis=1, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 24 columns):
game_season      699 non-null int64
game_location    699 non-null object
fg               699 non-null int64
fga              699 non-null int64
fg_pct           697 non-null float64
fg3              699 non-null int64
fg3a             699 non-null int64
fg3_pct          695 non-null float64
ft               699 non-null int64
fta              699 non-null int64
ft_pct           603 non-null float64
orb              699 non-null int64
drb              699 non-null int64
trb              699 non-null int64
ast              699 non-null int64
stl              699 non-null int64
blk              699 non-null int64
pf               699 non-null int64
tov              699 non-null int64
pts              699 non-null int64
plus_minus       699 non-null int64
game_res_int     699 non-null int64
game_res_wl      699 non-null object
tot_time         699 non-null float64
dtypes: float64(4), i

In [11]:
df.isna().sum(axis=0)

game_season       0
game_location     0
fg                0
fga               0
fg_pct            2
fg3               0
fg3a              0
fg3_pct           4
ft                0
fta               0
ft_pct           96
orb               0
drb               0
trb               0
ast               0
stl               0
blk               0
pf                0
tov               0
pts               0
plus_minus        0
game_res_int      0
game_res_wl       0
tot_time          0
dtype: int64

In [12]:
imputer = SimpleImputer(strategy='constant', fill_value=0)

In [13]:
df = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns)

In [14]:
df.isna().sum(axis=0)

game_season      0
game_location    0
fg               0
fga              0
fg_pct           0
fg3              0
fg3a             0
fg3_pct          0
ft               0
fta              0
ft_pct           0
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
pf               0
tov              0
pts              0
plus_minus       0
game_res_int     0
game_res_wl      0
tot_time         0
dtype: int64

In [15]:
df.head()

Unnamed: 0,game_season,game_location,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,...,ast,stl,blk,pf,tov,pts,plus_minus,game_res_int,game_res_wl,tot_time
0,1,home,7,12,0.583,0,1,0.0,0,0,...,7,4,0,2,2,14,7,-1,L,35.65
1,2,away,5,9,0.556,2,3,0.667,0,0,...,4,1,0,5,3,12,-19,-22,L,39.0833
2,3,home,3,6,0.5,1,2,0.5,0,0,...,9,2,0,4,1,7,-4,8,W,28.45
3,4,home,1,5,0.2,0,1,0.0,3,4,...,3,0,0,6,0,5,-13,-28,L,21.5333
4,5,away,4,8,0.5,1,2,0.5,0,2,...,6,0,0,4,5,9,-5,-13,L,31.25


In [16]:
df_X = df.loc[:, 'game_location': 'tot_time']
df_X.drop(['plus_minus', 'game_res_wl', 'game_location'], axis=1, inplace=True)

df_y = df['plus_minus']

In [17]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,pf,tov,pts,game_res_int,tot_time
0,7,12,0.583,0,1,0.0,0,0,0.0,1,1,2,7,4,0,2,2,14,-1,35.65
1,5,9,0.556,2,3,0.667,0,0,0.0,0,2,2,4,1,0,5,3,12,-22,39.0833
2,3,6,0.5,1,2,0.5,0,0,0.0,0,5,5,9,2,0,4,1,7,8,28.45
3,1,5,0.2,0,1,0.0,3,4,0.75,0,1,1,3,0,0,6,0,5,-28,21.5333
4,4,8,0.5,1,2,0.5,0,2,0.0,1,3,4,6,0,0,4,5,9,-13,31.25


In [18]:
nums = df_X.columns.tolist()

In [19]:
nums

['fg',
 'fga',
 'fg_pct',
 'fg3',
 'fg3a',
 'fg3_pct',
 'ft',
 'fta',
 'ft_pct',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'pf',
 'tov',
 'pts',
 'game_res_int',
 'tot_time']

# Data Science

### Numericals

In [20]:
ALPHA = 0.05

In [21]:
high_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

mid_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

high_corr_cutoff = 0.8
mid_corr_cutoff = 0.5

for i in range(len(nums) - 1):
    j = i + 1
    while j < len(nums):
        correl_signif = stats.pearsonr(df_X[nums[i]], df_X[nums[j]])
        
        if abs(correl_signif[0]) > high_corr_cutoff:
            high_info['feature1'].append(nums[i])
            high_info['feature2'].append(nums[j])
            high_info['r'].append(correl_signif[0])
        
        elif abs(correl_signif[0]) < high_corr_cutoff and abs(correl_signif[0]) > mid_corr_cutoff:
            mid_info['feature1'].append(nums[i])
            mid_info['feature2'].append(nums[j])
            mid_info['r'].append(correl_signif[0])
        
        j += 1
        
high_info_df = pd.DataFrame(data=high_info)
print(f'High Correlations:\n\n{high_info_df}')

print('\n')

mid_info_df = pd.DataFrame(data=mid_info)
print(f'Decent Correlations:\n\n{mid_info_df}')

High Correlations:

  feature1 feature2         r
0       fg      pts  0.944255
1      fg3      pts  0.808183
2       ft      fta  0.979005
3      drb      trb  0.931680


Decent Correlations:

   feature1  feature2         r
0        fg       fga  0.799839
1        fg    fg_pct  0.674855
2        fg       fg3  0.738251
3        fg      fg3a  0.625653
4       fga       fg3  0.569560
5       fga      fg3a  0.745016
6       fga       pts  0.766746
7       fga  tot_time  0.586639
8    fg_pct   fg3_pct  0.645504
9    fg_pct       pts  0.621922
10      fg3      fg3a  0.788780
11      fg3   fg3_pct  0.640131
12     fg3a       pts  0.695964
13       ft    ft_pct  0.561573
14       ft       pts  0.528208
15      fta       pts  0.527419


In [22]:
df_X.drop(['trb', 'ft'], axis=1, inplace=True)
nums.remove('trb')
nums.remove('ft')

In [23]:
CORR_CUTOFF = 0.35

In [24]:
label_corr_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

top_corrs = []

for col in nums:
    correl_signif = stats.pearsonr(df_X[col], df_y)
    
    label_corr_info['feature1'].append(col)
    label_corr_info['feature2'].append('plus minus')
    label_corr_info['r'].append(correl_signif[0])
    
    if abs(correl_signif[0]) > CORR_CUTOFF:
        top_corrs.append(col)
    
label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

        feature1    feature2         r
0             fg  plus minus  0.222720
1            fga  plus minus  0.054447
2         fg_pct  plus minus  0.318048
3            fg3  plus minus  0.298575
4           fg3a  plus minus  0.229519
5        fg3_pct  plus minus  0.232160
6            fta  plus minus  0.156827
7         ft_pct  plus minus  0.165189
8            orb  plus minus  0.005484
9            drb  plus minus  0.191611
10           ast  plus minus  0.242444
11           stl  plus minus  0.162664
12           blk  plus minus  0.046458
13            pf  plus minus -0.124485
14           tov  plus minus -0.133273
15           pts  plus minus  0.272230
16  game_res_int  plus minus  0.856637
17      tot_time  plus minus -0.083537


In [25]:
top_corrs

['game_res_int']

# Feature Engineering

### New Columns

In [26]:
# Statistical formulas through research.
df_X['efficiency'] = df_X['pts'] + (df_X['orb'] + df_X['drb']) + df_X['ast'] + df_X['stl'] + df_X['blk'] - (df_X['fga'] - df_X['fg']) - (df_X['fta'] - (df_X['fta'] * df_X['ft_pct'])) - df_X['tov']

df_X['efg'] = (df_X['fg'] + (0.5 * df_X['fg3'])) / df_X['fga'].replace(0, np.inf)
df_X['tov%'] = 100 * df_X['tov'] / (df_X['fga'] + 0.44 * df_X['fta'] + df_X['tov']).replace(0, np.inf)
df_X['true_shooting%'] = df_X['pts'] / (2 * (df_X['fga'] + 0.44 * df_X['fta'])).replace(0, np.inf)
df_X['ppp'] = df_X['pts'] / (df_X['fga'] + 0.44 * df_X['fta'] + df_X['tov']).replace(0, np.inf)
df_X['gmsc'] = df_X['pts'] + 0.4 * df_X['fg'] - 0.7 * df_X['fga'] - 0.4 * (df_X['fta'] - (df_X['fta'] * df_X['ft_pct'])) + 0.7 * df_X['orb'] + 0.3 * df_X['drb'] + df_X['stl'] + 0.7 * df_X['ast'] + 0.7 * df_X['blk'] - 0.4 * df_X['pf'] - df_X['tov']
df_X['per'] = ( (df_X['fg'] * 85.910) + (df_X['stl'] * 53.897) + (df_X['fg3'] * 51.757) + ((df_X['fta'] * df_X['ft_pct']) * 46.845) + (df_X['blk'] * 39.190) + (df_X['orb'] * 39.190) + (df_X['ast'] * 34.677) + (df_X['drb'] * 14.707) - (df_X['pf'] * 17.174) - ( (df_X['fta'] - (df_X['fta'] * df_X['ft_pct']) ) * 20.091) - ( (df_X['fga'] - df_X['fg'] ) * 39.190) - (df_X['tov'] * 53.897) ) * (1 / df_X['tot_time'])
df_X['possessions'] = .96 * (df_X['fga'] - df_X['orb'] + (.44 * df_X['fta'])) + df_X['tov']
df_X['off_rating'] = df_X['pts'] * 100 / df_X['possessions'].replace(0, np.inf)

df_X['fg_part'] = df_X['fg'] * (1 - 0.5 * ((df_X['pts'] - (df_X['fta'] * df_X['ft_pct'])) / (2 * df_X['fga'].replace(0, np.inf))) * df_X['ast'])

df_X['stl_tov'] = df_X['stl'].div(df_X['tov'].replace(0, np.inf))
df_X['stops/tov'] = (df_X['stl'] + df_X['blk']).div(df_X['tov'].replace(0, np.inf))
df_X['3/pts'] = df_X['fg3'].div(df_X['fg'].replace(0, np.inf))
df_X['3a'] = df_X['fg3a'].div(df_X['fga'].replace(0, np.inf))
df_X['pts_per_min'] = (df_X['pts'] / df['tot_time'])
df_X['net_pos_responsible_for'] = df_X['fg'] + df_X['ast'] + df_X['stl'] + df_X['blk'] - df['tov'] # possessions responsible for
df_X['ft+fg3'] = (df_X['fta'] * df_X['ft_pct']) + df_X['fg3']
df_X['ast_ratio'] = df_X['ast'] / df_X['ast'].mean()

df_X['gmsc_per_pt'] = df_X['gmsc'].div(df['pts'].replace(0, np.inf))
df_X['gmsc_per_fg'] = df_X['gmsc'].div(df['fg'].replace(0, np.inf))
df_X['gmsc_per_fg_times_ppp'] = df_X['gmsc_per_fg'] * df_X['ppp']
df_X['gmsc_per_pt_times_ppp'] = df_X['gmsc_per_pt'] * df_X['ppp']
df_X['gmsc_per'] = df_X['gmsc'] * df_X['per']
df_X['gmsc_ppp_mul'] = df_X['gmsc'] * df_X['ppp']
df_X['gmsc_ppp_div'] = df_X['gmsc'].div(df_X['ppp'].replace(0, np.inf))
df_X['per_ppp'] = df_X['per'].div(df_X['ppp'].replace(0, np.inf))

df_X['per_per_pt'] = df_X['per'].div(df['pts'].replace(0, np.inf))
df_X['per_per_fg'] = df_X['per'].div(df['fg'].replace(0, np.inf))
df_X['per_per_fg_times_ppp'] = df_X['per_per_fg'] * df_X['ppp']
df_X['per_per_pt_times_ppp'] = df_X['per_per_pt'] * df_X['ppp']

df_X['game_result_per_pt'] = df_X['game_res_int'] / df_X['pts'].replace(0, np.inf)

In [27]:
new_cols = ['off_rating', 'efficiency', 'efg', 'tov%', 'true_shooting%', 'ppp', 'gmsc', 'per', 'fg_part', 'stl_tov', 'stops/tov', '3/pts', '3a', 'pts_per_min', 'net_pos_responsible_for', 'ft+fg3', 'ast_ratio', 'gmsc_per_pt', 'gmsc_per_fg', 'gmsc_per_fg_times_ppp', 'gmsc_per_pt_times_ppp', 'gmsc_per', 'gmsc_ppp_div', 'gmsc_ppp_mul', 'per_ppp', 'per_per_pt', 'per_per_fg', 'per_per_fg_times_ppp', 'per_per_pt_times_ppp', 'game_result_per_pt']

In [28]:
label_corr_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

for col in new_cols:
    correl_signif = stats.pearsonr(df_X[col], df_y)
    
    label_corr_info['feature1'].append(col)
    label_corr_info['feature2'].append('Plus Minus')
    label_corr_info['r'].append(correl_signif[0])
    
    if abs(correl_signif[0]) > CORR_CUTOFF:
        top_corrs.append(col)
    
label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

                   feature1    feature2         r
0                off_rating  Plus Minus  0.415688
1                efficiency  Plus Minus  0.437317
2                       efg  Plus Minus  0.367933
3                      tov%  Plus Minus -0.185552
4            true_shooting%  Plus Minus  0.379701
5                       ppp  Plus Minus  0.412332
6                      gmsc  Plus Minus  0.423461
7                       per  Plus Minus  0.507728
8                   fg_part  Plus Minus -0.307792
9                   stl_tov  Plus Minus  0.166153
10                stops/tov  Plus Minus  0.173569
11                    3/pts  Plus Minus  0.278758
12                       3a  Plus Minus  0.320089
13              pts_per_min  Plus Minus  0.365820
14  net_pos_responsible_for  Plus Minus  0.377271
15                   ft+fg3  Plus Minus  0.285549
16                ast_ratio  Plus Minus  0.242444
17              gmsc_per_pt  Plus Minus  0.465097
18              gmsc_per_fg  Plus Minus  0.461679


In [29]:
top_corrs

['game_res_int',
 'off_rating',
 'efficiency',
 'efg',
 'true_shooting%',
 'ppp',
 'gmsc',
 'per',
 'pts_per_min',
 'net_pos_responsible_for',
 'gmsc_per_pt',
 'gmsc_per_fg',
 'gmsc_per_fg_times_ppp',
 'gmsc_per_pt_times_ppp',
 'gmsc_per',
 'gmsc_ppp_div',
 'gmsc_ppp_mul',
 'per_ppp',
 'per_per_fg',
 'per_per_fg_times_ppp',
 'per_per_pt_times_ppp',
 'game_result_per_pt']

In [30]:
cols_to_remove = []

for col in df_X.columns:
    if col not in top_corrs:
        cols_to_remove.append(col)

cols_to_remove

['fg',
 'fga',
 'fg_pct',
 'fg3',
 'fg3a',
 'fg3_pct',
 'fta',
 'ft_pct',
 'orb',
 'drb',
 'ast',
 'stl',
 'blk',
 'pf',
 'tov',
 'pts',
 'tot_time',
 'tov%',
 'possessions',
 'fg_part',
 'stl_tov',
 'stops/tov',
 '3/pts',
 '3a',
 'ft+fg3',
 'ast_ratio',
 'per_per_pt']

In [31]:
df_X.drop(cols_to_remove, axis=1, inplace=True)

In [32]:
df_X.head()

Unnamed: 0,game_res_int,efficiency,efg,true_shooting%,ppp,gmsc,per,off_rating,pts_per_min,net_pos_responsible_for,...,gmsc_per_fg_times_ppp,gmsc_per_pt_times_ppp,gmsc_per,gmsc_ppp_mul,gmsc_ppp_div,per_ppp,per_per_fg,per_per_fg_times_ppp,per_per_pt_times_ppp,game_result_per_pt
0,-1,20,0.583333,0.583333,1.0,15.5,21.7532,111.465,0.392707,16,...,2.21429,1.10714,337.175,15.5,15.5,21.7532,3.1076,3.1076,1.5538,-0.0714286
1,-22,12,0.666667,0.666667,1.0,7.1,8.97472,103.093,0.307036,7,...,1.42,0.591667,63.7205,7.1,7.1,8.97472,1.79494,1.79494,0.747893,-1.83333
2,8,19,0.583333,0.583333,1.0,11.2,19.7802,103.55,0.246046,13,...,3.73333,1.6,221.538,11.2,11.2,19.7802,6.59339,6.59339,2.82574,1.14286
3,-28,4,0.2,0.369822,0.739645,1.5,3.03195,77.0464,0.232198,4,...,1.10947,0.221893,4.54793,1.10947,2.028,4.0992,3.03195,2.24257,0.448513,-5.6
4,-13,8,0.5625,0.506757,0.648415,3.4,4.8527,71.6287,0.288,5,...,0.551153,0.244957,16.4992,2.20461,5.24356,7.48395,1.21318,0.786641,0.349618,-1.44444


In [33]:
cols = df_X.columns
df_X = StandardScaler().fit_transform(df_X)
df_X = pd.DataFrame(data=df_X, columns=cols)

In [34]:
df_X.head()

Unnamed: 0,game_res_int,efficiency,efg,true_shooting%,ppp,gmsc,per,off_rating,pts_per_min,net_pos_responsible_for,...,gmsc_per_fg_times_ppp,gmsc_per_pt_times_ppp,gmsc_per,gmsc_ppp_mul,gmsc_ppp_div,per_ppp,per_per_fg,per_per_fg_times_ppp,per_per_pt_times_ppp,game_result_per_pt
0,-0.436028,-0.386177,0.073803,-0.177265,-0.168749,-0.34579,-0.326595,-0.03021,-1.085387,0.477085,...,-0.226555,0.702772,-0.503713,-0.432503,-0.221902,-0.187599,-0.072128,-0.227355,0.383664,-0.265684
1,-1.86438,-1.13497,0.564942,0.335971,-0.168749,-1.26065,-1.276973,-0.290466,-1.408018,-1.289449,...,-0.877891,-0.684252,-1.022282,-1.001149,-1.62555,-1.703479,-0.959882,-0.831066,-0.606094,-2.123694
2,0.176124,-0.479776,0.073803,-0.177265,-0.168749,-0.814111,-0.473336,-0.276244,-1.637705,-0.11176,...,1.019107,2.028933,-0.723003,-0.723596,-0.940436,-0.421656,2.285322,1.375815,1.945768,1.014836
3,-2.272481,-1.883764,-2.185433,-1.492245,-1.041283,-1.870557,-1.718956,-1.10014,-1.689854,-1.878293,...,-1.132536,-1.679223,-1.134495,-1.406684,-2.473086,-2.281849,-0.123291,-0.625198,-0.973772,-6.095818
4,-1.252229,-1.509367,-0.048981,-0.648888,-1.347024,-1.663624,-1.583541,-1.268553,-1.479707,-1.682012,...,-1.59037,-1.617166,-1.111831,-1.332547,-1.935763,-1.880325,-1.353333,-1.294801,-1.095227,-1.713593


## Categorical 

In [35]:
def convert_game_season(gs):
    if gs < 27:
        return 'first'
    elif gs >= 27 and gs < 55:
        return 'second'
    else:
        return 'third'

df['gs_label_thirds'] = df['game_season'].apply(lambda row: convert_game_season(row))

In [36]:
avg_3pt_shooting = df['fg3_pct'].mean()
avg_fg_shooting = df['fg_pct'].mean()
avg_ft_shooting = df['ft_pct'].mean()
avg_pts = df['pts'].mean()
avg_drb = df['drb'].mean()
avg_orb = df['orb'].mean()
tot_time_avg = df['tot_time'].mean()

In [37]:
efficiency_avg = df_X['efficiency'].mean()
game_res_int_avg = df_X['game_res_int'].mean()
off_rating_avg = df_X['off_rating'].mean()

In [38]:
def convert_pct(cur, avg):
    if cur < avg:
        return 'below'
    else:
        return 'above'

In [39]:
df['3pct_label'] = df['fg3_pct'].apply(lambda row: convert_pct(row, avg_3pt_shooting))
df['fg_pct_label'] = df['fg_pct'].apply(lambda row: convert_pct(row, avg_fg_shooting))
df['ft_pct_label'] = df['ft_pct'].apply(lambda row: convert_pct(row, avg_ft_shooting))
df['pts_label'] = df['pts'].apply(lambda row: convert_pct(row, avg_pts))
df['drb_label'] = df['drb'].apply(lambda row: convert_pct(row, avg_drb))
df['orb_label'] = df['orb'].apply(lambda row: convert_pct(row, avg_orb))
df['tot_time_label'] = df['tot_time'].apply(lambda row: convert_pct(row, tot_time_avg))

In [40]:
df['efficiency_label'] = df_X['efficiency'].apply(lambda row: convert_pct(row, efficiency_avg))
df['game_res_label'] = df_X['game_res_int'].apply(lambda row: convert_pct(row, game_res_int_avg))
df['off_rating_label'] = df_X['off_rating'].apply(lambda row: convert_pct(row, game_res_int_avg))

In [41]:
foul_threshold = 4

def in_foul_trouble(pf):
    if pf < foul_threshold:
        return 'no'
    else:
        return 'yes'

In [42]:
df['pf_label'] = df['pf'].apply(lambda row: in_foul_trouble(row))

In [43]:
def is_blk_greater_than_stl(diff):
    if diff < 0:
        return 'stl'
    else:
        return 'blk'

In [44]:
df['blk_stl_diff'] = df['blk'] - df['stl']
df['blk_stl_diff_label'] = df['blk_stl_diff'].apply(lambda row: is_blk_greater_than_stl(row))

In [45]:
def is_3s_greater_than_2s(diff):
    if diff < 0:
        return '2s'
    else:
        return '3s'

In [46]:
def is_assist_greater_than_tov(diff):
    if diff < 0:
        return 'tov'
    else:
        return 'ass'

In [47]:
df['3s_than_2s'] = df['fg3'] - (df['fg'] - df['fg3'])
df['ast_than_tov'] = df['ast'] - df['tov']

In [48]:
df['more_3s_than_2s'] = df['3s_than_2s'].apply(lambda row: is_3s_greater_than_2s(row))
df['more_ast_than_tov'] = df['ast_than_tov'].apply(lambda row: is_assist_greater_than_tov(row))

In [49]:
df.drop(['blk_stl_diff', '3s_than_2s', 'ast_than_tov'], axis=1, inplace=True)

In [50]:
cats = ['off_rating_label', 'game_res_label', 'efficiency_label', 'tot_time_label', 'orb_label', 'drb_label', 'more_3s_than_2s', 'more_ast_than_tov', 'pf_label', 'blk_stl_diff_label', 'pts_label', 'game_res_wl', 'gs_label_thirds', '3pct_label', 'fg_pct_label', 'game_location', 'ft_pct_label']

In [51]:
# Categoricals

data = []
cols = ['Feature', 'F', 'P', 'Rejected?']
statistically_sign_cols = []

def anova_helper(df, cat_col, num_col='plus_minus'):
    uniques = df[cat_col].unique()
    data = []
    for col in uniques:
        cur_df = (df[cat_col] == col)
        cur_df = df[cur_df][num_col]
        data.append(cur_df.tolist())
        
    f, p = stats.f_oneway(*data)
    return f, p

ALPHA = 0.05

for feature in cats:
    f, p = anova_helper(df, feature)
    row = [feature, f, p, p < ALPHA]
    
    if p < ALPHA:
        statistically_sign_cols.append(feature)
        
    data.append(row)

cats_df = pd.DataFrame(data=data, columns=cols)
cats_df

Unnamed: 0,Feature,F,P,Rejected?
0,off_rating_label,105.720398,3.486034e-23,True
1,game_res_label,707.783428,3.574116e-108,True
2,efficiency_label,109.859629,5.71237e-24,True
3,tot_time_label,10.736022,0.001102766,True
4,orb_label,0.168578,0.6815053,False
5,drb_label,15.802213,7.765439e-05,True
6,more_3s_than_2s,28.184194,1.484461e-07,True
7,more_ast_than_tov,11.947589,0.0005803844,True
8,pf_label,15.518935,8.990416e-05,True
9,blk_stl_diff_label,12.458172,0.0004435211,True


In [52]:
statistically_sign_cols

['off_rating_label',
 'game_res_label',
 'efficiency_label',
 'tot_time_label',
 'drb_label',
 'more_3s_than_2s',
 'more_ast_than_tov',
 'pf_label',
 'blk_stl_diff_label',
 'pts_label',
 'game_res_wl',
 '3pct_label',
 'fg_pct_label',
 'game_location',
 'ft_pct_label']

In [53]:
for col in statistically_sign_cols:
    df_X[col] = df[col]

In [54]:
df_X.head()

Unnamed: 0,game_res_int,efficiency,efg,true_shooting%,ppp,gmsc,per,off_rating,pts_per_min,net_pos_responsible_for,...,more_3s_than_2s,more_ast_than_tov,pf_label,blk_stl_diff_label,pts_label,game_res_wl,3pct_label,fg_pct_label,game_location,ft_pct_label
0,-0.436028,-0.386177,0.073803,-0.177265,-0.168749,-0.34579,-0.326595,-0.03021,-1.085387,0.477085,...,2s,ass,no,stl,below,L,below,above,home,below
1,-1.86438,-1.13497,0.564942,0.335971,-0.168749,-1.26065,-1.276973,-0.290466,-1.408018,-1.289449,...,2s,ass,yes,stl,below,L,above,above,away,below
2,0.176124,-0.479776,0.073803,-0.177265,-0.168749,-0.814111,-0.473336,-0.276244,-1.637705,-0.11176,...,2s,ass,yes,stl,below,W,above,above,home,below
3,-2.272481,-1.883764,-2.185433,-1.492245,-1.041283,-1.870557,-1.718956,-1.10014,-1.689854,-1.878293,...,2s,ass,yes,blk,below,L,below,below,home,below
4,-1.252229,-1.509367,-0.048981,-0.648888,-1.347024,-1.663624,-1.583541,-1.268553,-1.479707,-1.682012,...,2s,ass,yes,blk,below,L,above,above,away,below


In [55]:
for cat in statistically_sign_cols:
    binarizer = LabelBinarizer()
    data = binarizer.fit_transform(df_X[cat].values)  
    df_X[cat] = data

In [56]:
df_X.head()

Unnamed: 0,game_res_int,efficiency,efg,true_shooting%,ppp,gmsc,per,off_rating,pts_per_min,net_pos_responsible_for,...,more_3s_than_2s,more_ast_than_tov,pf_label,blk_stl_diff_label,pts_label,game_res_wl,3pct_label,fg_pct_label,game_location,ft_pct_label
0,-0.436028,-0.386177,0.073803,-0.177265,-0.168749,-0.34579,-0.326595,-0.03021,-1.085387,0.477085,...,0,0,0,1,1,0,1,0,1,1
1,-1.86438,-1.13497,0.564942,0.335971,-0.168749,-1.26065,-1.276973,-0.290466,-1.408018,-1.289449,...,0,0,1,1,1,0,0,0,0,1
2,0.176124,-0.479776,0.073803,-0.177265,-0.168749,-0.814111,-0.473336,-0.276244,-1.637705,-0.11176,...,0,0,1,1,1,1,0,0,1,1
3,-2.272481,-1.883764,-2.185433,-1.492245,-1.041283,-1.870557,-1.718956,-1.10014,-1.689854,-1.878293,...,0,0,1,0,1,0,1,1,1,1
4,-1.252229,-1.509367,-0.048981,-0.648888,-1.347024,-1.663624,-1.583541,-1.268553,-1.479707,-1.682012,...,0,0,1,0,1,0,0,0,0,1


In [57]:
len(df_X.columns)

37

In [58]:
# pca = PCA(0.9)
pca = PCA(0.975)
# pca = PCA(0.985)

df_X = pca.fit_transform(df_X)
df_X = pd.DataFrame(data=df_X)

df_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.667065,0.715196,-0.134622,1.388408,-1.099351,1.783871,-0.144413,-0.29386,0.107811,0.134038,-0.263697,0.427196,0.407305,0.497486,0.306391,-0.081485,0.176336,0.290532
1,4.293968,-1.233713,2.690682,1.322541,-1.412997,0.569174,-0.219411,-0.495396,-0.262668,-0.09535,0.460635,-0.370666,0.189691,-0.009096,-0.550789,0.234122,0.567009,0.117863
2,-1.331891,5.191428,0.42077,2.311292,-1.358544,1.396007,0.151759,-0.135343,0.273006,1.112752,0.667156,-0.473497,-0.119627,-0.343205,0.045872,-0.100892,0.986336,0.319495
3,7.720479,-1.390285,2.678365,3.544642,1.435588,-0.399677,1.836904,-0.643924,-0.779904,0.800422,-1.080528,-0.509603,-0.797124,-0.714595,-1.198053,1.531666,0.00689,0.698071
4,6.537953,-1.121569,2.114602,-0.008603,-0.399982,0.33102,-0.039187,-0.886425,-0.487523,0.182439,0.356083,-0.311613,-0.703735,-0.394634,0.010905,-0.254101,0.607332,-0.108208


In [59]:
len(df_X.columns)

18

# Testing

In [60]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, train_size=0.8)

In [61]:
def scores_info(scores):
    print(scores.mean())
    print(scores.std())
    print(scores)

## LinearRegression

In [62]:
lin_reg = LinearRegression()

In [63]:
X_train.head(), y_train.head()

(           0         1         2         3         4         5         6   \
 622 -4.178851 -0.849784  0.165681 -0.661415  0.305587 -0.143981 -0.007591   
 452 -2.030899 -2.366652 -1.516803  0.459574  0.464596  0.101901 -0.805926   
 185  1.737670 -0.026990  0.503955  2.695238  0.933846  0.162966 -0.057327   
 26   4.666798 -0.762128  0.029846  0.908413 -0.370455  0.332941 -0.018804   
 644  2.705686  1.028233  0.123210 -1.499685 -0.334775 -1.118476 -0.200882   
 
            7         8         9         10        11        12        13  \
 622 -0.273047  0.272428  0.659395 -0.269660  0.005574  0.083728 -0.414489   
 452  0.060705 -0.410950 -0.720793 -0.017056  0.408142  0.264263 -0.213075   
 185 -0.054574 -0.436669 -0.429203 -0.138724  0.222889  0.347287  0.221756   
 26   0.150483 -0.171391  0.634034  0.559748  0.438482 -0.412183  0.818693   
 644  0.673989  0.372805  0.347680 -0.360234  0.131548  0.578903  0.460497   
 
            14        15        16        17  
 622 -0.14681

In [64]:
scores = cross_val_score(lin_reg, df_X, df_y, cv=5)
scores_info(scores)

0.6870658883185097
0.02762419537511146
[0.65755442 0.73912036 0.67296547 0.68330611 0.68238308]


## SVR

In [121]:
svr = SVR()

In [122]:
param_grid = [
    {
        'C': [0.1, 0.5, 1, 2.5, 5],
        'kernel': ['poly', 'rbf', 'sigmoid', 'linear'],
        'degree': [3, 6, 9],
        'gamma': ['auto', 'scale'], 
        'epsilon': [0, 0.1, 0.5]
    }
]

In [123]:
fe_gscv_svr = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', refit='neg_mean_squared_error')

In [124]:
fe_gscv_svr.fit(df_X, df_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5], 'degree': [3, 6, 9],
                          'epsilon': [0, 0.1, 0.5], 'gamma': ['auto', 'scale'],
                          'kernel': ['poly', 'rbf', 'sigmoid', 'linear']}],
             pre_dispatch='2*n_jobs', refit='neg_mean_squared_error',
             return_train_score=False, scoring='neg_mean_squared_error',
             verbose=0)

In [125]:
fe_gscv_svr.best_params_

{'C': 0.5, 'degree': 3, 'epsilon': 0.5, 'gamma': 'auto', 'kernel': 'linear'}

In [126]:
fe_gscv_svr.best_score_

-52.0353478220923

In [116]:
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [117]:
svr.score(X_train, y_train)

0.6518042534040429

In [118]:
preds = svr.predict(X_test)

In [119]:
mse = mean_squared_error(preds, y_test)

In [120]:
mse, math.sqrt(mse)

(62.82562475229219, 7.926261713588077)

### Random Forest Regressor

In [65]:
rfr = RandomForestRegressor(random_state=42)

In [66]:
param_grid = [
    {
        'n_estimators': [100, 300, 500, 750],
        'max_features': ['sqrt', 'log2'], 
        'min_impurity_split': [0.0, 1e-7]
    }
]

In [67]:
fe_gscv_rfr = GridSearchCV(rfr, param_grid, cv=5, scoring='neg_mean_squared_error', refit='neg_mean_squared_error')

In [68]:
fe_gscv_rfr.fit(df_X, df_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': ['sqrt', 'log2']

In [69]:
fe_gscv_rfr.best_params_

{'max_features': 'sqrt', 'min_impurity_split': 0.0, 'n_estimators': 750}

In [70]:
fe_gscv_rfr.best_score_

-73.48095891146082

In [71]:
best_rfr = fe_gscv_rfr.best_estimator_

In [72]:
best_rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=0.0,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=750,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [73]:
best_rfr.score(X_train, y_train)

0.9490024748870269

In [74]:
preds = best_rfr.predict(X_test)

In [75]:
mse = mean_squared_error(preds, y_test)

In [76]:
mse, math.sqrt(mse)

(60.318557714285724, 7.766502283157182)

In [77]:
scores = cross_val_score(rfr, df_X, df_y, cv=5)
print(scores, scores.mean(), scores.std())

[0.56021109 0.60655156 0.54802227 0.50963446 0.56400443] 0.5576847605312087 0.031101383129026567


### SGD

In [78]:
sgd = SGDRegressor(random_state=42)

In [79]:
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'epsilon': [0.0, 0.1, 0.5], 
        'learning_rate': ['constant', 'optimal'],
        'eta0': [0.001, 0.01, 0.025, 0.05]
    }
]

In [80]:
fe_gscv_sgd = GridSearchCV(sgd, param_grid, cv=5, scoring='neg_mean_squared_error', refit='neg_mean_squared_error')

In [81]:
fe_gscv_sgd.fit(df_X, df_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SGDRegressor(alpha=0.0001, average=False,
                                    early_stopping=False, epsilon=0.1,
                                    eta0=0.01, fit_intercept=True,
                                    l1_ratio=0.15, learning_rate='invscaling',
                                    loss='squared_loss', max_iter=1000,
                                    n_iter_no_change=5, penalty='l2',
                                    power_t=0.25, random_state=42, shuffle=True,
                                    tol=0.001, validation_fraction=0.1,
                                    verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'epsilon': [0.0, 0.1, 0.5],
                          'eta0': [0.001, 0.01, 0.025, 0.05],
                          'learning_rate': ['constant', 'optimal'],
                          'penalty': ['l1', 'l2']}],
             pre_dispatch='2*n

In [82]:
fe_gscv_sgd.best_params_

{'epsilon': 0.0, 'eta0': 0.001, 'learning_rate': 'constant', 'penalty': 'l2'}

In [83]:
fe_gscv_sgd.best_score_

-51.569160815191935

In [84]:
best_sgd = fe_gscv_sgd.best_estimator_

In [85]:
best_sgd.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.0,
             eta0=0.001, fit_intercept=True, l1_ratio=0.15,
             learning_rate='constant', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=42,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [86]:
best_sgd.score(X_train, y_train)

0.7440068000416169

In [87]:
preds = best_sgd.predict(X_test)

In [88]:
mse = mean_squared_error(preds, y_test)

In [89]:
mse, math.sqrt(mse)

(45.39568127345923, 6.737631725870688)

In [90]:
scores = cross_val_score(best_sgd, df_X, df_y, cv=5)
scores_info(scores)

0.6870928074484404
0.03063837858599899
[0.65231767 0.74387982 0.67177034 0.68267007 0.68482613]


### Lasso Regression

In [91]:
lasso = Lasso(random_state=42)

In [92]:
scores = cross_val_score(lasso, df_X, df_y, cv=5)

In [93]:
preds = cross_val_predict(lasso, df_X, df_y, cv=5)
# preds

In [94]:
mean_squared_error(preds, df_y)

57.13895985325339

In [95]:
scores_info(scores)

0.654061452122326
0.02611694768435285
[0.61574509 0.69317878 0.64107647 0.66935486 0.65095206]


In [96]:
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=42,
      selection='cyclic', tol=0.0001, warm_start=False)

In [97]:
lasso.score(X_train, y_train)

0.7008768163221868

In [98]:
preds = lasso.predict(X_test)

In [99]:
mse = mean_squared_error(preds, y_test)

In [100]:
mse, math.sqrt(mse)

(43.536534903344474, 6.598222101698644)

### Ridge Regression

In [101]:
ridge = Ridge(random_state=42)

scores = cross_val_score(ridge, df_X, df_y, cv=5)
preds = cross_val_predict(ridge, df_X, df_y, cv=5)

print(mean_squared_error(preds, df_y))
scores_info(scores)

51.565709236926025
0.687252921714847
0.02782530049361371
[0.65729494 0.73957091 0.67299078 0.68364442 0.68276355]


In [102]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=42, solver='auto', tol=0.001)

In [103]:
ridge.score(X_train, y_train)

0.7449854154025022

In [104]:
preds = ridge.predict(X_test)

In [105]:
mse = mean_squared_error(preds, y_test)

In [106]:
mse, math.sqrt(mse)

(47.62625909776594, 6.9011780949172685)

### ElasticNet Regression

In [107]:
elasticnet = ElasticNet(random_state=42)

scores = cross_val_score(elasticnet, df_X, df_y, cv=5)
preds = cross_val_predict(elasticnet, df_X, df_y, cv=5)

print(mean_squared_error(preds, df_y))
scores_info(scores)

62.5973177900125
0.6207935875902721
0.028778387958515863
[0.57782294 0.66747222 0.61316869 0.62774187 0.61776222]


In [108]:
elasticnet.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

In [109]:
elasticnet.score(X_train, y_train)

0.6776949691901313

In [110]:
preds = elasticnet.predict(X_test)

In [111]:
mse = mean_squared_error(preds, y_test)

In [112]:
mse, math.sqrt(mse)

(46.20363748637013, 6.797325759912506)