In [1]:
import pandas as pd
from scipy import stats
import math
import numpy as np
import warnings
import pickle

from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, cross_val_predict
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../sc_stats.csv')
df.head()

Unnamed: 0,game_season,game_result,game_location,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,...,orb,drb,trb,ast,stl,blk,pf,tov,pts,plus_minus
0,1,L (-1),home,35:39,7,12,0.583,0,1,0.0,...,1,1,2,7,4,0,2,2,14,7
1,2,L (-22),away,39:05,5,9,0.556,2,3,0.667,...,0,2,2,4,1,0,5,3,12,-19
2,3,W (+8),home,28:27,3,6,0.5,1,2,0.5,...,0,5,5,9,2,0,4,1,7,-4
3,4,L (-28),home,21:32,1,5,0.2,0,1,0.0,...,0,1,1,3,0,0,6,0,5,-13
4,5,L (-13),away,31:15,4,8,0.5,1,2,0.5,...,1,3,4,6,0,0,4,5,9,-5


In [3]:
def convert_game_result_to_int(result):
    first_paran_idx = result.index('(')
    second_paran_idx = result.index(')')
    return int(result[first_paran_idx + 1: second_paran_idx])

In [4]:
def convert_game_result_to_wl(result):
    return result[0]

In [5]:
df['game_res_int'] = df['game_result'].apply(lambda row: convert_game_result_to_int(row))
df['game_res_wl'] = df['game_result'].apply(lambda row: convert_game_result_to_wl(row))

In [6]:
def convert_mp(mp):
    minutes, sec = mp.split(':')
    return int(minutes) + (int(sec) / 60)

In [7]:
df['tot_time'] = df['mp'].apply(lambda row: convert_mp(row))

In [8]:
df.head()

Unnamed: 0,game_season,game_result,game_location,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,...,ast,stl,blk,pf,tov,pts,plus_minus,game_res_int,game_res_wl,tot_time
0,1,L (-1),home,35:39,7,12,0.583,0,1,0.0,...,7,4,0,2,2,14,7,-1,L,35.65
1,2,L (-22),away,39:05,5,9,0.556,2,3,0.667,...,4,1,0,5,3,12,-19,-22,L,39.083333
2,3,W (+8),home,28:27,3,6,0.5,1,2,0.5,...,9,2,0,4,1,7,-4,8,W,28.45
3,4,L (-28),home,21:32,1,5,0.2,0,1,0.0,...,3,0,0,6,0,5,-13,-28,L,21.533333
4,5,L (-13),away,31:15,4,8,0.5,1,2,0.5,...,6,0,0,4,5,9,-5,-13,L,31.25


In [9]:
df.drop(['mp', 'game_result'], axis=1, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 24 columns):
game_season      699 non-null int64
game_location    699 non-null object
fg               699 non-null int64
fga              699 non-null int64
fg_pct           697 non-null float64
fg3              699 non-null int64
fg3a             699 non-null int64
fg3_pct          695 non-null float64
ft               699 non-null int64
fta              699 non-null int64
ft_pct           603 non-null float64
orb              699 non-null int64
drb              699 non-null int64
trb              699 non-null int64
ast              699 non-null int64
stl              699 non-null int64
blk              699 non-null int64
pf               699 non-null int64
tov              699 non-null int64
pts              699 non-null int64
plus_minus       699 non-null int64
game_res_int     699 non-null int64
game_res_wl      699 non-null object
tot_time         699 non-null float64
dtypes: float64(4), i

In [11]:
df.isna().sum(axis=0)

game_season       0
game_location     0
fg                0
fga               0
fg_pct            2
fg3               0
fg3a              0
fg3_pct           4
ft                0
fta               0
ft_pct           96
orb               0
drb               0
trb               0
ast               0
stl               0
blk               0
pf                0
tov               0
pts               0
plus_minus        0
game_res_int      0
game_res_wl       0
tot_time          0
dtype: int64

In [12]:
imputer = SimpleImputer(strategy='constant', fill_value=0)

In [13]:
df = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns)

In [14]:
df.isna().sum(axis=0)

game_season      0
game_location    0
fg               0
fga              0
fg_pct           0
fg3              0
fg3a             0
fg3_pct          0
ft               0
fta              0
ft_pct           0
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
pf               0
tov              0
pts              0
plus_minus       0
game_res_int     0
game_res_wl      0
tot_time         0
dtype: int64

In [15]:
df.head()

Unnamed: 0,game_season,game_location,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,...,ast,stl,blk,pf,tov,pts,plus_minus,game_res_int,game_res_wl,tot_time
0,1,home,7,12,0.583,0,1,0.0,0,0,...,7,4,0,2,2,14,7,-1,L,35.65
1,2,away,5,9,0.556,2,3,0.667,0,0,...,4,1,0,5,3,12,-19,-22,L,39.0833
2,3,home,3,6,0.5,1,2,0.5,0,0,...,9,2,0,4,1,7,-4,8,W,28.45
3,4,home,1,5,0.2,0,1,0.0,3,4,...,3,0,0,6,0,5,-13,-28,L,21.5333
4,5,away,4,8,0.5,1,2,0.5,0,2,...,6,0,0,4,5,9,-5,-13,L,31.25


In [16]:
df_X = df.loc[:, 'game_location': 'tot_time']
df_X.drop(['plus_minus', 'game_res_wl', 'game_location'], axis=1, inplace=True)

df_y = df['plus_minus']

In [17]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,pf,tov,pts,game_res_int,tot_time
0,7,12,0.583,0,1,0.0,0,0,0.0,1,1,2,7,4,0,2,2,14,-1,35.65
1,5,9,0.556,2,3,0.667,0,0,0.0,0,2,2,4,1,0,5,3,12,-22,39.0833
2,3,6,0.5,1,2,0.5,0,0,0.0,0,5,5,9,2,0,4,1,7,8,28.45
3,1,5,0.2,0,1,0.0,3,4,0.75,0,1,1,3,0,0,6,0,5,-28,21.5333
4,4,8,0.5,1,2,0.5,0,2,0.0,1,3,4,6,0,0,4,5,9,-13,31.25


In [18]:
nums = df_X.columns.tolist()

In [19]:
nums

['fg',
 'fga',
 'fg_pct',
 'fg3',
 'fg3a',
 'fg3_pct',
 'ft',
 'fta',
 'ft_pct',
 'orb',
 'drb',
 'trb',
 'ast',
 'stl',
 'blk',
 'pf',
 'tov',
 'pts',
 'game_res_int',
 'tot_time']

# Data Science

### Numericals

In [20]:
ALPHA = 0.05

In [21]:
high_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

mid_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

high_corr_cutoff = 0.8
mid_corr_cutoff = 0.5

for i in range(len(nums) - 1):
    j = i + 1
    while j < len(nums):
        correl_signif = stats.pearsonr(df_X[nums[i]], df_X[nums[j]])
        
        if abs(correl_signif[0]) > high_corr_cutoff:
            high_info['feature1'].append(nums[i])
            high_info['feature2'].append(nums[j])
            high_info['r'].append(correl_signif[0])
        
        elif abs(correl_signif[0]) < high_corr_cutoff and abs(correl_signif[0]) > mid_corr_cutoff:
            mid_info['feature1'].append(nums[i])
            mid_info['feature2'].append(nums[j])
            mid_info['r'].append(correl_signif[0])
        
        j += 1
        
high_info_df = pd.DataFrame(data=high_info)
print(f'High Correlations:\n\n{high_info_df}')

print('\n')

mid_info_df = pd.DataFrame(data=mid_info)
print(f'Decent Correlations:\n\n{mid_info_df}')

High Correlations:

  feature1 feature2         r
0       fg      pts  0.944255
1      fg3      pts  0.808183
2       ft      fta  0.979005
3      drb      trb  0.931680


Decent Correlations:

   feature1  feature2         r
0        fg       fga  0.799839
1        fg    fg_pct  0.674855
2        fg       fg3  0.738251
3        fg      fg3a  0.625653
4       fga       fg3  0.569560
5       fga      fg3a  0.745016
6       fga       pts  0.766746
7       fga  tot_time  0.586639
8    fg_pct   fg3_pct  0.645504
9    fg_pct       pts  0.621922
10      fg3      fg3a  0.788780
11      fg3   fg3_pct  0.640131
12     fg3a       pts  0.695964
13       ft    ft_pct  0.561573
14       ft       pts  0.528208
15      fta       pts  0.527419


In [22]:
CORR_CUTOFF = 0.2

In [23]:
label_corr_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

top_corrs = []

for col in nums:
    correl_signif = stats.pearsonr(df_X[col], df_y)
    
    label_corr_info['feature1'].append(col)
    label_corr_info['feature2'].append('plus minus')
    label_corr_info['r'].append(correl_signif[0])
    
    if abs(correl_signif[0]) > CORR_CUTOFF:
        top_corrs.append(col)
    
label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

        feature1    feature2         r
0             fg  plus minus  0.222720
1            fga  plus minus  0.054447
2         fg_pct  plus minus  0.318048
3            fg3  plus minus  0.298575
4           fg3a  plus minus  0.229519
5        fg3_pct  plus minus  0.232160
6             ft  plus minus  0.158737
7            fta  plus minus  0.156827
8         ft_pct  plus minus  0.165189
9            orb  plus minus  0.005484
10           drb  plus minus  0.191611
11           trb  plus minus  0.178597
12           ast  plus minus  0.242444
13           stl  plus minus  0.162664
14           blk  plus minus  0.046458
15            pf  plus minus -0.124485
16           tov  plus minus -0.133273
17           pts  plus minus  0.272230
18  game_res_int  plus minus  0.856637
19      tot_time  plus minus -0.083537


In [24]:
top_corrs

['fg', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ast', 'pts', 'game_res_int']

# Feature Engineering

### New Columns

In [25]:
# Statistical formulas through research.
df_X['efficiency'] = df_X['pts'] + df_X['trb'] + df_X['ast'] + df_X['stl'] + df_X['blk'] - (df_X['fga'] - df_X['fg']) - (df_X['fta'] - df_X['ft']) - df_X['tov']
df_X['efg'] = (df_X['fg'] + (0.5 * df_X['fg3'])) / df_X['fga'].replace(0, np.inf)
df_X['tov%'] = 100 * df_X['tov'] / (df_X['fga'] + 0.44 * df_X['fta'] + df_X['tov']).replace(0, np.inf)
df_X['true_shooting%'] = df_X['pts'] / (2 * (df_X['fga'] + 0.44 * df_X['fta'])).replace(0, np.inf)
df_X['ppp'] = df_X['pts'] / (df_X['fga'] + 0.44 * df_X['fta'] + df_X['tov']).replace(0, np.inf)
df_X['gmsc'] = df_X['pts'] + 0.4 * df_X['fg'] - 0.7 * df_X['fga'] - 0.4 * (df_X['fta'] - df_X['ft']) + 0.7 * df_X['orb'] + 0.3 * df_X['drb'] + df_X['stl'] + 0.7 * df_X['ast'] + 0.7 * df_X['blk'] - 0.4 * df_X['pf'] - df_X['tov']
df_X['per'] = ( (df_X['fg'] * 85.910) + (df_X['stl'] * 53.897) + (df_X['fg3'] * 51.757) + (df_X['ft'] * 46.845) + (df_X['blk'] * 39.190) + (df_X['orb'] * 39.190) + (df_X['ast'] * 34.677) + (df_X['drb'] * 14.707) - (df_X['pf'] * 17.174) - ( (df_X['fta'] - df_X['ft'] ) * 20.091) - ( (df_X['fga'] - df_X['fg'] ) * 39.190) - (df_X['tov'] * 53.897) ) * (1 / df_X['tot_time'])

df_X['fg_part'] = df_X['fg'] * (1 - 0.5 * ((df_X['pts'] - df_X['ft']) / (2 * df_X['fga'].replace(0, np.inf))) * df_X['ast'])

df_X['stl_tov'] = df_X['stl'].div(df_X['tov'].replace(0, np.inf))
df_X['stops/tov'] = (df_X['stl'] + df_X['blk']).div(df_X['tov'].replace(0, np.inf))
df_X['3/pts'] = df_X['fg3'].div(df_X['fg'].replace(0, np.inf))
df_X['3a'] = df_X['fg3a'].div(df_X['fga'].replace(0, np.inf))
df_X['pts_per_min'] = (df_X['pts'] / df['tot_time']) * 60
df_X['net_pos_responsible_for'] = df_X['fg'] + df_X['ast'] + df_X['stl'] + df_X['blk'] - df['tov'] # possessions responsible for
df_X['ft+fg3'] = df_X['ft'] + df_X['fg3']
df_X['ast_ratio'] = df_X['ast'] / df_X['ast'].mean()

df_X['gmsc_per_pt'] = df_X['gmsc'].div(df['pts'].replace(0, np.inf))
df_X['gmsc_per_fg'] = df_X['gmsc'].div(df['fg'].replace(0, np.inf))
df_X['gmsc_per_fg_times_ppp'] = df_X['gmsc_per_fg'] * df_X['ppp']
df_X['gmsc_per_pt_times_ppp'] = df_X['gmsc_per_pt'] * df_X['ppp']
df_X['gmsc_per'] = df_X['gmsc'] * df_X['per']
df_X['gmsc_ppp_mul'] = df_X['gmsc'] * df_X['ppp']
df_X['gmsc_ppp_div'] = df_X['gmsc'].div(df_X['ppp'].replace(0, np.inf))
df_X['per_ppp'] = df_X['per'].div(df_X['ppp'].replace(0, np.inf))

df_X['per_per_pt'] = df_X['per'].div(df['pts'].replace(0, np.inf))
df_X['per_per_fg'] = df_X['per'].div(df['fg'].replace(0, np.inf))
df_X['per_per_fg_times_ppp'] = df_X['per_per_fg'] * df_X['ppp']
df_X['per_per_pt_times_ppp'] = df_X['per_per_pt'] * df_X['ppp']

df_X['game_result_per_pt'] = df_X['game_res_int'] / df_X['pts'].replace(0, np.inf)

In [26]:
new_cols = ['efficiency', 'efg', 'tov%', 'true_shooting%', 'ppp', 'gmsc', 'per', 'fg_part', 'stl_tov', 'stops/tov', '3/pts', '3a', 'pts_per_min', 'net_pos_responsible_for', 'ft+fg3', 'ast_ratio', 'gmsc_per_pt', 'gmsc_per_fg', 'gmsc_per_fg_times_ppp', 'gmsc_per_pt_times_ppp', 'gmsc_per', 'gmsc_ppp_div', 'gmsc_ppp_mul', 'per_ppp', 'per_per_pt', 'per_per_fg', 'per_per_fg_times_ppp', 'per_per_pt_times_ppp', 'game_result_per_pt']

In [27]:
label_corr_info = {
    'feature1': [],
    'feature2': [],
    'r': []
}

for col in new_cols:
    correl_signif = stats.pearsonr(df_X[col], df_y)
    
    label_corr_info['feature1'].append(col)
    label_corr_info['feature2'].append('Plus Minus')
    label_corr_info['r'].append(correl_signif[0])
    
    if abs(correl_signif[0]) > CORR_CUTOFF:
        top_corrs.append(col)
    
label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

                   feature1    feature2         r
0                efficiency  Plus Minus  0.437312
1                       efg  Plus Minus  0.367933
2                      tov%  Plus Minus -0.185552
3            true_shooting%  Plus Minus  0.379701
4                       ppp  Plus Minus  0.412332
5                      gmsc  Plus Minus  0.423458
6                       per  Plus Minus  0.507722
7                   fg_part  Plus Minus -0.307796
8                   stl_tov  Plus Minus  0.166153
9                 stops/tov  Plus Minus  0.173569
10                    3/pts  Plus Minus  0.278758
11                       3a  Plus Minus  0.320089
12              pts_per_min  Plus Minus  0.365820
13  net_pos_responsible_for  Plus Minus  0.377271
14                   ft+fg3  Plus Minus  0.285535
15                ast_ratio  Plus Minus  0.242444
16              gmsc_per_pt  Plus Minus  0.465094
17              gmsc_per_fg  Plus Minus  0.461675
18    gmsc_per_fg_times_ppp  Plus Minus  0.498642


In [28]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,...,gmsc_per_pt_times_ppp,gmsc_per,gmsc_ppp_mul,gmsc_ppp_div,per_ppp,per_per_pt,per_per_fg,per_per_fg_times_ppp,per_per_pt_times_ppp,game_result_per_pt
0,7,12,0.583,0,1,0.0,0,0,0.0,1,...,1.10714,337.175,15.5,15.5,21.7532,1.5538,3.1076,3.1076,1.5538,-0.0714286
1,5,9,0.556,2,3,0.667,0,0,0.0,0,...,0.591667,63.7205,7.1,7.1,8.97472,0.747893,1.79494,1.79494,0.747893,-1.83333
2,3,6,0.5,1,2,0.5,0,0,0.0,0,...,1.6,221.538,11.2,11.2,19.7802,2.82574,6.59339,6.59339,2.82574,1.14286
3,1,5,0.2,0,1,0.0,3,4,0.75,0,...,0.221893,4.54793,1.10947,2.028,4.0992,0.60639,3.03195,2.24257,0.448513,-5.6
4,4,8,0.5,1,2,0.5,0,2,0.0,1,...,0.244957,16.4992,2.20461,5.24356,7.48395,0.539189,1.21318,0.786641,0.349618,-1.44444


In [29]:
cols = df_X.columns
df_X = StandardScaler().fit_transform(df_X)
df_X = pd.DataFrame(data=df_X, columns=cols)

In [30]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,...,gmsc_per_pt_times_ppp,gmsc_per,gmsc_ppp_mul,gmsc_ppp_div,per_ppp,per_per_pt,per_per_fg,per_per_fg_times_ppp,per_per_pt_times_ppp,game_result_per_pt
0,-0.341324,-0.985658,0.872014,-1.52063,-1.920755,-2.148487,-1.282483,-1.329787,-2.187826,0.344512,...,0.702769,-0.503719,-0.432505,-0.221905,-0.18761,0.709977,-0.072135,-0.22736,0.383659,-0.265684
1,-0.946102,-1.569041,0.664808,-0.66859,-1.387974,1.2606,-1.282483,-1.329787,-2.187826,-0.807708,...,-0.684254,-1.022292,-1.001151,-1.625553,-1.703504,-0.59408,-0.959891,-0.831071,-0.606098,-2.123694
2,-1.550881,-2.152424,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-1.329787,-2.187826,-0.807708,...,2.02893,-0.723011,-0.723597,-0.940439,-0.421669,2.768128,2.28532,1.37581,1.945763,1.014836
3,-2.15566,-2.346885,-2.067237,-1.52063,-1.920755,-2.148487,-0.231949,-0.014582,-0.064429,-0.807708,...,-1.679225,-1.134506,-1.406686,-2.473089,-2.28188,-0.823049,-0.123299,-0.625203,-0.973775,-6.095818
4,-1.248492,-1.763502,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-0.672184,-2.187826,0.344512,...,-1.617167,-1.111842,-1.332549,-1.935767,-1.880352,-0.931788,-1.353344,-1.294805,-1.095231,-1.713593


## Categorical 

In [31]:
def convert_game_season(gs):
    if gs < 27:
        return 'first'
    elif gs >= 27 and gs < 55:
        return 'second'
    else:
        return 'third'

df['gs_label_thirds'] = df['game_season'].apply(lambda row: convert_game_season(row))

In [32]:
avg_3pt_shooting = df['fg3_pct'].mean()
avg_fg_shooting = df['fg_pct'].mean()
avg_ft_shooting = df['ft_pct'].mean()
avg_pts = df['pts'].mean()
avg_drb = df['drb'].mean()
avg_orb = df['orb'].mean()
tot_time_avg = df['tot_time'].mean()

In [33]:
def convert_pct(cur, avg):
    if cur < avg:
        return 'below'
    else:
        return 'above'

In [34]:
df['3pct_label'] = df['fg3_pct'].apply(lambda row: convert_pct(row, avg_3pt_shooting))
df['fg_pct_label'] = df['fg_pct'].apply(lambda row: convert_pct(row, avg_fg_shooting))
df['ft_pct_label'] = df['ft_pct'].apply(lambda row: convert_pct(row, avg_ft_shooting))
df['pts_label'] = df['pts'].apply(lambda row: convert_pct(row, avg_pts))
df['drb_label'] = df['drb'].apply(lambda row: convert_pct(row, avg_drb))
df['orb_label'] = df['orb'].apply(lambda row: convert_pct(row, avg_orb))
df['tot_time_label'] = df['tot_time'].apply(lambda row: convert_pct(row, tot_time_avg))

In [35]:
foul_threshold = 4

def in_foul_trouble(pf):
    if pf < foul_threshold:
        return 'no'
    else:
        return 'yes'

In [36]:
df['pf_label'] = df['pf'].apply(lambda row: in_foul_trouble(row))

In [37]:
def is_blk_greater_than_stl(diff):
    if diff < 0:
        return 'stl'
    else:
        return 'blk'

In [38]:
df['blk_stl_diff'] = df['blk'] - df['stl']
df['blk_stl_diff_label'] = df['blk_stl_diff'].apply(lambda row: is_blk_greater_than_stl(row))

In [39]:
def is_3s_greater_than_2s(diff):
    if diff < 0:
        return '2s'
    else:
        return '3s'

In [40]:
def is_assist_greater_than_tov(diff):
    if diff < 0:
        return 'tov'
    else:
        return 'ass'

In [41]:
df['3s_than_2s'] = df['fg3'] - (df['fg'] - df['fg3'])
df['ast_than_tov'] = df['ast'] - df['tov']

In [42]:
df['more_3s_than_2s'] = df['3s_than_2s'].apply(lambda row: is_3s_greater_than_2s(row))
df['more_ast_than_tov'] = df['ast_than_tov'].apply(lambda row: is_assist_greater_than_tov(row))

In [43]:
df.drop(['blk_stl_diff', '3s_than_2s', 'ast_than_tov'], axis=1, inplace=True)

In [44]:
cats = ['tot_time_label', 'orb_label', 'drb_label', 'more_3s_than_2s', 'more_ast_than_tov', 'pf_label', 'blk_stl_diff_label', 'pts_label', 'game_res_wl', 'gs_label_thirds', '3pct_label', 'fg_pct_label', 'game_location', 'ft_pct_label']

In [45]:
# Categoricals

data = []
cols = ['Feature', 'F', 'P', 'Rejected?']
statistically_sign_cols = []

def anova_helper(df, cat_col, num_col='plus_minus'):
    uniques = df[cat_col].unique()
    data = []
    for col in uniques:
        cur_df = (df[cat_col] == col)
        cur_df = df[cur_df][num_col]
        data.append(cur_df.tolist())
        
    f, p = stats.f_oneway(*data)
    return f, p

ALPHA = 0.05

for feature in cats:
    f, p = anova_helper(df, feature)
    row = [feature, f, p, p < ALPHA]
    
    if p < ALPHA:
        statistically_sign_cols.append(feature)
        
    data.append(row)

cats_df = pd.DataFrame(data=data, columns=cols)
cats_df

Unnamed: 0,Feature,F,P,Rejected?
0,tot_time_label,10.736022,0.001102766,True
1,orb_label,0.168578,0.6815053,False
2,drb_label,15.802213,7.765439e-05,True
3,more_3s_than_2s,28.184194,1.484461e-07,True
4,more_ast_than_tov,11.947589,0.0005803844,True
5,pf_label,15.518935,8.990416e-05,True
6,blk_stl_diff_label,12.458172,0.0004435211,True
7,pts_label,34.302074,7.271492e-09,True
8,game_res_wl,713.64811,8.351023e-109,True
9,gs_label_thirds,0.301792,0.7395883,False


In [46]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,...,gmsc_per_pt_times_ppp,gmsc_per,gmsc_ppp_mul,gmsc_ppp_div,per_ppp,per_per_pt,per_per_fg,per_per_fg_times_ppp,per_per_pt_times_ppp,game_result_per_pt
0,-0.341324,-0.985658,0.872014,-1.52063,-1.920755,-2.148487,-1.282483,-1.329787,-2.187826,0.344512,...,0.702769,-0.503719,-0.432505,-0.221905,-0.18761,0.709977,-0.072135,-0.22736,0.383659,-0.265684
1,-0.946102,-1.569041,0.664808,-0.66859,-1.387974,1.2606,-1.282483,-1.329787,-2.187826,-0.807708,...,-0.684254,-1.022292,-1.001151,-1.625553,-1.703504,-0.59408,-0.959891,-0.831071,-0.606098,-2.123694
2,-1.550881,-2.152424,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-1.329787,-2.187826,-0.807708,...,2.02893,-0.723011,-0.723597,-0.940439,-0.421669,2.768128,2.28532,1.37581,1.945763,1.014836
3,-2.15566,-2.346885,-2.067237,-1.52063,-1.920755,-2.148487,-0.231949,-0.014582,-0.064429,-0.807708,...,-1.679225,-1.134506,-1.406686,-2.473089,-2.28188,-0.823049,-0.123299,-0.625203,-0.973775,-6.095818
4,-1.248492,-1.763502,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-0.672184,-2.187826,0.344512,...,-1.617167,-1.111842,-1.332549,-1.935767,-1.880352,-0.931788,-1.353344,-1.294805,-1.095231,-1.713593


In [47]:
statistically_sign_cols

['tot_time_label',
 'drb_label',
 'more_3s_than_2s',
 'more_ast_than_tov',
 'pf_label',
 'blk_stl_diff_label',
 'pts_label',
 'game_res_wl',
 '3pct_label',
 'fg_pct_label',
 'game_location',
 'ft_pct_label']

In [48]:
for col in statistically_sign_cols:
    df_X[col] = df[col]

In [49]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,...,more_3s_than_2s,more_ast_than_tov,pf_label,blk_stl_diff_label,pts_label,game_res_wl,3pct_label,fg_pct_label,game_location,ft_pct_label
0,-0.341324,-0.985658,0.872014,-1.52063,-1.920755,-2.148487,-1.282483,-1.329787,-2.187826,0.344512,...,2s,ass,no,stl,below,L,below,above,home,below
1,-0.946102,-1.569041,0.664808,-0.66859,-1.387974,1.2606,-1.282483,-1.329787,-2.187826,-0.807708,...,2s,ass,yes,stl,below,L,above,above,away,below
2,-1.550881,-2.152424,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-1.329787,-2.187826,-0.807708,...,2s,ass,yes,stl,below,W,above,above,home,below
3,-2.15566,-2.346885,-2.067237,-1.52063,-1.920755,-2.148487,-0.231949,-0.014582,-0.064429,-0.807708,...,2s,ass,yes,blk,below,L,below,below,home,below
4,-1.248492,-1.763502,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-0.672184,-2.187826,0.344512,...,2s,ass,yes,blk,below,L,above,above,away,below


In [50]:
for cat in statistically_sign_cols:
    binarizer = LabelBinarizer()
    data = binarizer.fit_transform(df_X[cat].values)  
    df_X[cat] = data

In [51]:
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,...,more_3s_than_2s,more_ast_than_tov,pf_label,blk_stl_diff_label,pts_label,game_res_wl,3pct_label,fg_pct_label,game_location,ft_pct_label
0,-0.341324,-0.985658,0.872014,-1.52063,-1.920755,-2.148487,-1.282483,-1.329787,-2.187826,0.344512,...,0,0,0,1,1,0,1,0,1,1
1,-0.946102,-1.569041,0.664808,-0.66859,-1.387974,1.2606,-1.282483,-1.329787,-2.187826,-0.807708,...,0,0,1,1,1,0,0,0,0,1
2,-1.550881,-2.152424,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-1.329787,-2.187826,-0.807708,...,0,0,1,1,1,1,0,0,1,1
3,-2.15566,-2.346885,-2.067237,-1.52063,-1.920755,-2.148487,-0.231949,-0.014582,-0.064429,-0.807708,...,0,0,1,0,1,0,1,1,1,1
4,-1.248492,-1.763502,0.235048,-1.09461,-1.654364,0.407051,-1.282483,-0.672184,-2.187826,0.344512,...,0,0,1,0,1,0,0,0,0,1


# Testing

## LinearRegression

In [52]:
lin_reg = LinearRegression()

In [53]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, train_size=0.8)

In [54]:
X_train.head(), y_train.head()

(         fg       fga    fg_pct      fg3      fg3a   fg3_pct        ft  \
 0 -0.341324 -0.985658  0.872014 -1.52063 -1.920755 -2.148487 -1.282483   
 1 -0.946102 -1.569041  0.664808 -0.66859 -1.387974  1.260600 -1.282483   
 2 -1.550881 -2.152424  0.235048 -1.09461 -1.654364  0.407051 -1.282483   
 3 -2.155660 -2.346885 -2.067237 -1.52063 -1.920755 -2.148487 -0.231949   
 4 -1.248492 -1.763502  0.235048 -1.09461 -1.654364  0.407051 -1.282483   
 
         fta    ft_pct       orb  ...  more_3s_than_2s  more_ast_than_tov  \
 0 -1.329787 -2.187826  0.344512  ...                0                  0   
 1 -1.329787 -2.187826 -0.807708  ...                0                  0   
 2 -1.329787 -2.187826 -0.807708  ...                0                  0   
 3 -0.014582 -0.064429 -0.807708  ...                0                  0   
 4 -0.672184 -2.187826  0.344512  ...                0                  0   
 
    pf_label  blk_stl_diff_label  pts_label  game_res_wl  3pct_label  \
 0         0

In [55]:
def scores_info(scores):
    print(scores.mean())
    print(scores.std())
    print(scores)

In [56]:
scores = cross_val_score(lin_reg, df_X, df_y, cv=5)
scores_info(scores)

0.6670210723944485
0.05469703364337721
[0.58716216 0.74468611 0.67344932 0.70025383 0.62955395]


## SVR

In [57]:
svr = SVR()

In [58]:
param_grid = [
    {
        'C': [0.1, 0.5, 1, 2.5, 5],
        'kernel': ['poly', 'rbf', 'sigmoid', 'linear'],
        'degree': [3, 6, 9],
        'gamma': ['auto', 'scale'], 
        'epsilon': [0, 0.1, 0.5]
    }
]

In [59]:
fe_gscv_svr = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', refit='neg_mean_squared_error')

In [60]:
fe_gscv_svr.fit(df_X, df_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5], 'degree': [3, 6, 9],
                          'epsilon': [0, 0.1, 0.5], 'gamma': ['auto', 'scale'],
                          'kernel': ['poly', 'rbf', 'sigmoid', 'linear']}],
             pre_dispatch='2*n_jobs', refit='neg_mean_squared_error',
             return_train_score=False, scoring='neg_mean_squared_error',
             verbose=0)

In [61]:
fe_gscv_svr.best_params_

{'C': 0.5, 'degree': 3, 'epsilon': 0.5, 'gamma': 'auto', 'kernel': 'linear'}

In [62]:
fe_gscv_svr.best_score_

-51.776918316637705

### Random Forest Regressor

In [63]:
rfr = RandomForestRegressor(random_state=42)

In [64]:
param_grid = [
    {
        'n_estimators': [100, 300, 500, 750],
        'max_features': ['sqrt', 'log2'], 
        'min_impurity_split': [0.0, 1e-7]
    }
]

In [65]:
fe_gscv_rfr = GridSearchCV(rfr, param_grid, cv=5, scoring='neg_mean_squared_error', refit='neg_mean_squared_error')

In [66]:
fe_gscv_rfr.fit(df_X, df_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': ['sqrt', 'log2']

In [67]:
fe_gscv_rfr.best_params_

{'max_features': 'sqrt', 'min_impurity_split': 0.0, 'n_estimators': 500}

In [68]:
fe_gscv_rfr.best_score_

-57.16235799713877

### SGD

In [69]:
sgd = SGDRegressor(random_state=42)

In [70]:
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'epsilon': [0.0, 0.1, 0.5], 
        'learning_rate': ['constant', 'optimal'],
        'eta0': [0.001, 0.01, 0.025, 0.05]
    }
]

In [71]:
fe_gscv_sgd = GridSearchCV(sgd, param_grid, cv=5, scoring='neg_mean_squared_error', refit='neg_mean_squared_error')

In [72]:
fe_gscv_sgd.fit(df_X, df_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SGDRegressor(alpha=0.0001, average=False,
                                    early_stopping=False, epsilon=0.1,
                                    eta0=0.01, fit_intercept=True,
                                    l1_ratio=0.15, learning_rate='invscaling',
                                    loss='squared_loss', max_iter=1000,
                                    n_iter_no_change=5, penalty='l2',
                                    power_t=0.25, random_state=42, shuffle=True,
                                    tol=0.001, validation_fraction=0.1,
                                    verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'epsilon': [0.0, 0.1, 0.5],
                          'eta0': [0.001, 0.01, 0.025, 0.05],
                          'learning_rate': ['constant', 'optimal'],
                          'penalty': ['l1', 'l2']}],
             pre_dispatch='2*n

In [97]:
fe_gscv_sgd.best_params_

{'epsilon': 0.0, 'eta0': 0.001, 'learning_rate': 'constant', 'penalty': 'l1'}

In [74]:
fe_gscv_sgd.best_score_

-50.13032467492655

### Lasso Regression

In [75]:
lasso = Lasso(random_state=42)

In [76]:
scores = cross_val_score(lasso, df_X, df_y, cv=5)

In [77]:
preds = cross_val_predict(lasso, df_X, df_y, cv=5)
# preds

In [78]:
mean_squared_error(preds, df_y)

51.04541572745241

In [79]:
scores_info(scores)

0.6902301027549014
0.03447763773908558
[0.67111415 0.75897942 0.66927321 0.67508966 0.67669407]


### Ridge Regression

In [80]:
ridge = Ridge(random_state=42)

scores = cross_val_score(ridge, df_X, df_y, cv=5)
preds = cross_val_predict(ridge, df_X, df_y, cv=5)

print(mean_squared_error(preds, df_y))
scores_info(scores)

51.20324569956847
0.6907113755321346
0.038185899218611145
[0.65848606 0.76064765 0.67549762 0.70040006 0.65852549]


### ElasticNet Regression

In [81]:
elasticnet = ElasticNet(random_state=42)

scores = cross_val_score(elasticnet, df_X, df_y, cv=5)
preds = cross_val_predict(elasticnet, df_X, df_y, cv=5)

print(mean_squared_error(preds, df_y))
scores_info(scores)

60.506244471331016
0.6329083545995063
0.03553281529563392
[0.58647096 0.69590884 0.61959937 0.62970189 0.63286072]


# Storing the Best Model (SGDRegressor)

In [93]:
pickle.dump(fe_gscv_sgd.best_estimator_, open('best_model.pickle', 'wb'))

In [94]:
# model = pickle.load(open('best_model.pickle', 'rb'))