In [60]:
import pandas as pd
from scipy import stats
import numpy as np

from sklearn.impute import SimpleImputer

In [61]:
# Read sc_stats.csv (path is relative to the notebook's working directory).
df = pd.read_csv('sc_stats.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,pf,tov,pts,plus_minus
0,35:39,7,12,0.583,0,1,0.0,0,0,,1,1,2,7,4,0,2,2,14,7
1,39:05,5,9,0.556,2,3,0.667,0,0,,0,2,2,4,1,0,5,3,12,-19
2,28:27,3,6,0.5,1,2,0.5,0,0,,0,5,5,9,2,0,4,1,7,-4
3,21:32,1,5,0.2,0,1,0.0,3,4,0.75,0,1,1,3,0,0,6,0,5,-13
4,31:15,4,8,0.5,1,2,0.5,0,2,0.0,1,3,4,6,0,0,4,5,9,-5


In [62]:
# All column names except 'mp' (the only object-dtype column per df.info()).
# NOTE(review): `cols` is reassigned from df_X.columns in the "Original Analysis"
# cell before it is read anywhere in the visible notebook, so this cell looks
# unused -- confirm and remove if so.
cols = df.columns.tolist()
cols.remove('mp')

In [63]:
# Column dtypes and non-null counts, to spot missing data and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 20 columns):
mp            699 non-null object
fg            699 non-null int64
fga           699 non-null int64
fg_pct        697 non-null float64
fg3           699 non-null int64
fg3a          699 non-null int64
fg3_pct       695 non-null float64
ft            699 non-null int64
fta           699 non-null int64
ft_pct        603 non-null float64
orb           699 non-null int64
drb           699 non-null int64
trb           699 non-null int64
ast           699 non-null int64
stl           699 non-null int64
blk           699 non-null int64
pf            699 non-null int64
tov           699 non-null int64
pts           699 non-null int64
plus_minus    699 non-null int64
dtypes: float64(3), int64(16), object(1)
memory usage: 109.3+ KB


In [64]:
# Number of missing values in each column (axis=0 sums down the rows).
df.isna().sum(axis=0)

mp             0
fg             0
fga            0
fg_pct         2
fg3            0
fg3a           0
fg3_pct        4
ft             0
fta            0
ft_pct        96
orb            0
drb            0
trb            0
ast            0
stl            0
blk            0
pf             0
tov            0
pts            0
plus_minus     0
dtype: int64

In [65]:
# Imputer that replaces every missing value with the constant 0.
# NOTE(review): in the preview rows ft_pct is NaN where fta == 0, so a NaN
# percentage presumably means "no attempts"; filling with 0 records that as a
# 0% success rate -- confirm this is the intended meaning.
imputer = SimpleImputer(strategy='constant', fill_value=0)

In [66]:
# The imputer is fit on the whole frame, including the string 'mp' column, so
# the transformed data comes back as a single object-dtype array and every
# rebuilt column would be dtype `object`. infer_objects() restores int64 /
# float64 on the numeric columns (values unchanged, 'mp' stays object) so the
# later correlation and ratio cells work on real numeric Series.
# The original index is passed through so row labels are preserved.
df = pd.DataFrame(
    data=imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
).infer_objects()

In [67]:
# Re-check after imputation: every column should now report zero missing values.
df.isna().sum(axis=0)

mp            0
fg            0
fga           0
fg_pct        0
fg3           0
fg3a          0
fg3_pct       0
ft            0
fta           0
ft_pct        0
orb           0
drb           0
trb           0
ast           0
stl           0
blk           0
pf            0
tov           0
pts           0
plus_minus    0
dtype: int64

In [68]:
# Features: every column from 'fg' through 'pts' (label slices are inclusive),
# which leaves out 'mp' and the target.
# .copy() makes df_X an independent frame: new columns are assigned to it in the
# feature-engineering cell, and doing that on a bare .loc slice of df raises
# SettingWithCopyWarning.
df_X = df.loc[:, 'fg': 'pts'].copy()
# Target: the plus_minus column.
df_y = df['plus_minus']

In [69]:
# Preview the feature frame to confirm the column range selected above.
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,pf,tov,pts
0,7,12,0.583,0,1,0.0,0,0,0.0,1,1,2,7,4,0,2,2,14
1,5,9,0.556,2,3,0.667,0,0,0.0,0,2,2,4,1,0,5,3,12
2,3,6,0.5,1,2,0.5,0,0,0.0,0,5,5,9,2,0,4,1,7
3,1,5,0.2,0,1,0.0,3,4,0.75,0,1,1,3,0,0,6,0,5
4,4,8,0.5,1,2,0.5,0,2,0.0,1,3,4,6,0,0,4,5,9


# Original Analysis

In [70]:
# NOTE(review): presumably the significance level for the correlation p-values,
# but no visible cell reads ALPHA (only the r returned by pearsonr is used) --
# confirm whether a p-value filter was intended, or remove.
ALPHA = 0.05

In [71]:
# Pairwise Pearson correlation between feature columns.
# Each unordered pair is visited once and bucketed by |r|:
#   high: |r| >  high_corr_cutoff
#   mid:  mid_corr_cutoff < |r| <= high_corr_cutoff
# The mid band is closed at the top so that a pair with |r| exactly equal to
# high_corr_cutoff is reported instead of falling between the two buckets.
cols = df_X.columns.tolist()

high_corr_cutoff = 0.9
mid_corr_cutoff = 0.65

high_info = {'feature1': [], 'feature2': [], 'r': []}
mid_info = {'feature1': [], 'feature2': [], 'r': []}

for i, first in enumerate(cols):
    # Only pair with columns to the right, so (a, b) and (b, a) are not both tested.
    for second in cols[i + 1:]:
        r = stats.pearsonr(df_X[first], df_X[second])[0]
        strength = abs(r)

        if strength > high_corr_cutoff:
            bucket = high_info
        elif strength > mid_corr_cutoff:
            bucket = mid_info
        else:
            # Weak correlation (or NaN r, which fails both comparisons): not reported.
            continue

        bucket['feature1'].append(first)
        bucket['feature2'].append(second)
        bucket['r'].append(r)

high_info_df = pd.DataFrame(data=high_info)
print(f'Highly Correlated:\n\n{high_info_df}')

print('\n')

mid_info_df = pd.DataFrame(data=mid_info)
print('')
print(f'Mid Correlated:\n\n{mid_info_df}')

Highly Correlated:

  feature1 feature2         r
0       fg      pts  0.944255
1       ft      fta  0.979005
2      drb      trb  0.931680



Mid Correlated:

  feature1 feature2         r
0       fg      fga  0.799839
1       fg   fg_pct  0.674855
2       fg      fg3  0.738251
3      fga     fg3a  0.745016
4      fga      pts  0.766746
5      fg3     fg3a  0.788780
6      fg3      pts  0.808183
7     fg3a      pts  0.695964


In [72]:
# Pearson r between every feature column and the target, one report row per feature.
feature_names = list(df_X.columns)

label_corr_info = {
    'feature1': feature_names,
    'feature2': ['Plus Minus'] * len(feature_names),
    # pearsonr returns (r, p-value); only r is kept.
    'r': [stats.pearsonr(df_X[name], df_y)[0] for name in feature_names],
}

label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

   feature1    feature2         r
0        fg  Plus Minus  0.222720
1       fga  Plus Minus  0.054447
2    fg_pct  Plus Minus  0.318048
3       fg3  Plus Minus  0.298575
4      fg3a  Plus Minus  0.229519
5   fg3_pct  Plus Minus  0.232160
6        ft  Plus Minus  0.158737
7       fta  Plus Minus  0.156827
8    ft_pct  Plus Minus  0.165189
9       orb  Plus Minus  0.005484
10      drb  Plus Minus  0.191611
11      trb  Plus Minus  0.178597
12      ast  Plus Minus  0.242444
13      stl  Plus Minus  0.162664
14      blk  Plus Minus  0.046458
15       pf  Plus Minus -0.124485
16      tov  Plus Minus -0.133273
17      pts  Plus Minus  0.272230


# Feature Engineering

### New Columns

In [84]:
def _ratio(numerator, denominator):
    """Divide two Series element-wise, yielding 0 wherever the denominator is 0.

    Zeros in the denominator are replaced by +inf before dividing, so a finite
    numerator divided by a zero denominator gives 0.
    """
    return numerator.div(denominator.replace(0, np.inf))


turnovers = df_X['tov']
rebounds = df_X['trb']
points = df_X['pts']
fouls = df_X['pf']

# Ratios against turnovers and against total rebounds.
df_X['ast_tov'] = _ratio(df_X['ast'], turnovers)
df_X['stl_tov'] = _ratio(df_X['stl'], turnovers)
df_X['drb_trb'] = _ratio(df_X['drb'], rebounds)
df_X['orb_trb'] = _ratio(df_X['orb'], rebounds)

# Points scored through each shot type (ft * 1, fg3 * 3, (fg - fg3) * 2) over total points.
df_X['1/pts'] = _ratio(df_X['ft'], points)
df_X['3/pts'] = _ratio(df_X['fg3'] * 3, points)
df_X['2/pts'] = _ratio((df_X['fg'] - df_X['fg3']) * 2, points)

# Steals and blocks per personal foul.
df_X['stl_pf'] = _ratio(df_X['stl'], fouls)
df_X['blk_pf'] = _ratio(df_X['blk'], fouls)

In [87]:
# Names of the engineered ratio columns to correlate against the target below.
# The order here differs from the order the columns were created in; it only
# determines the row order of the printed report.
new_cols = ['ast_tov', 'drb_trb', 'orb_trb', '1/pts', '2/pts', '3/pts', 'stl_pf', 'blk_pf', 'stl_tov']

In [88]:
# Pearson r between each engineered column (in new_cols order) and the target.
label_corr_info = {'feature1': [], 'feature2': [], 'r': []}

for name in new_cols:
    # One report row: (feature, target label, r); pearsonr's p-value is discarded.
    row = (name, 'Plus Minus', stats.pearsonr(df_X[name], df_y)[0])
    # Dict keys iterate in insertion order, matching the row layout above.
    for key, value in zip(label_corr_info, row):
        label_corr_info[key].append(value)

label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

  feature1    feature2         r
0  ast_tov  Plus Minus  0.185919
1  drb_trb  Plus Minus  0.110192
2  orb_trb  Plus Minus -0.050412
3    1/pts  Plus Minus  0.018535
4    2/pts  Plus Minus -0.248048
5    3/pts  Plus Minus  0.251225
6   stl_pf  Plus Minus  0.123495
7   blk_pf  Plus Minus  0.021191
8  stl_tov  Plus Minus  0.166153
