In [1]:
import pandas as pd
from scipy import stats
import numpy as np

from sklearn.impute import SimpleImputer

In [2]:
# Read sc_stats.csv into a DataFrame and preview its first rows.
df = pd.read_csv('sc_stats.csv')
df.head()

Unnamed: 0,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,pf,tov,pts,plus_minus
0,35:39,7,12,0.583,0,1,0.0,0,0,,1,1,2,7,4,0,2,2,14,7
1,39:05,5,9,0.556,2,3,0.667,0,0,,0,2,2,4,1,0,5,3,12,-19
2,28:27,3,6,0.5,1,2,0.5,0,0,,0,5,5,9,2,0,4,1,7,-4
3,21:32,1,5,0.2,0,1,0.0,3,4,0.75,0,1,1,3,0,0,6,0,5,-13
4,31:15,4,8,0.5,1,2,0.5,0,2,0.0,1,3,4,6,0,0,4,5,9,-5


In [3]:
def convert_mp(mp):
    """Convert a ``'MM:SS'`` minutes-played string into fractional minutes.

    Args:
        mp: String holding two integer fields separated by a single colon,
            e.g. ``'35:39'``.

    Returns:
        Minutes as a float: the first field plus the second field / 60.

    Raises:
        ValueError: If the string does not split into exactly two integer fields.
    """
    whole_minutes, seconds = (int(part) for part in mp.split(':'))
    return whole_minutes + seconds / 60

In [4]:
# Minutes played as a float, derived from the 'MM:SS' strings in 'mp'.
df['tot_time'] = df['mp'].apply(convert_mp)

In [5]:
# Every column name except the raw 'mp' string column.
cols = [name for name in df.columns if name != 'mp']

In [6]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 21 columns):
mp            699 non-null object
fg            699 non-null int64
fga           699 non-null int64
fg_pct        697 non-null float64
fg3           699 non-null int64
fg3a          699 non-null int64
fg3_pct       695 non-null float64
ft            699 non-null int64
fta           699 non-null int64
ft_pct        603 non-null float64
orb           699 non-null int64
drb           699 non-null int64
trb           699 non-null int64
ast           699 non-null int64
stl           699 non-null int64
blk           699 non-null int64
pf            699 non-null int64
tov           699 non-null int64
pts           699 non-null int64
plus_minus    699 non-null int64
tot_time      699 non-null float64
dtypes: float64(4), int64(16), object(1)
memory usage: 114.8+ KB


In [7]:
# Missing-value count per column (column-wise is the default axis for sum).
df.isna().sum()

mp             0
fg             0
fga            0
fg_pct         2
fg3            0
fg3a           0
fg3_pct        4
ft             0
fta            0
ft_pct        96
orb            0
drb            0
trb            0
ast            0
stl            0
blk            0
pf             0
tov            0
pts            0
plus_minus     0
tot_time       0
dtype: int64

In [8]:
# Constant-fill imputer: every missing value is replaced with 0.
imputer = SimpleImputer(strategy='constant', fill_value=0)

In [9]:
# Impute only the non-'mp' columns. Feeding the whole frame (which includes the
# string 'mp' column) to the imputer makes it work on an object array, so every
# column would come back as object dtype and lose its numeric type.
numeric_cols = df.columns.drop('mp')
imputed_values = pd.DataFrame(
    data=imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index,
)

# Re-attach 'mp' untouched and restore the original column order.
df = pd.concat([df[['mp']], imputed_values], axis=1)[df.columns]

In [10]:
# Re-check the per-column missing-value counts after imputation.
df.isna().sum()

mp            0
fg            0
fga           0
fg_pct        0
fg3           0
fg3a          0
fg3_pct       0
ft            0
fta           0
ft_pct        0
orb           0
drb           0
trb           0
ast           0
stl           0
blk           0
pf            0
tov           0
pts           0
plus_minus    0
tot_time      0
dtype: int64

In [11]:
# Feature matrix: every column from 'fg' through 'tot_time' except the target.
# A non-inplace drop returns a new frame, so df_X is not a slice of df and
# modifying it later does not raise SettingWithCopyWarning.
df_X = df.loc[:, 'fg':'tot_time'].drop('plus_minus', axis=1)

# Target column.
df_y = df['plus_minus']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
# Preview the first rows of the feature matrix.
df_X.head()

Unnamed: 0,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,fta,ft_pct,orb,drb,trb,ast,stl,blk,pf,tov,pts,tot_time
0,7,12,0.583,0,1,0.0,0,0,0.0,1,1,2,7,4,0,2,2,14,35.65
1,5,9,0.556,2,3,0.667,0,0,0.0,0,2,2,4,1,0,5,3,12,39.0833
2,3,6,0.5,1,2,0.5,0,0,0.0,0,5,5,9,2,0,4,1,7,28.45
3,1,5,0.2,0,1,0.0,3,4,0.75,0,1,1,3,0,0,6,0,5,21.5333
4,4,8,0.5,1,2,0.5,0,2,0.0,1,3,4,6,0,0,4,5,9,31.25


# Original Analysis

In [13]:
# NOTE(review): presumably the significance level for the Pearson p-values;
# no cell visible in this section references it — confirm it is used.
ALPHA = 0.05

In [14]:
cols = df_X.columns.tolist()

# |r| thresholds: strictly above the high cutoff is "high"; strictly above the
# mid cutoff, up to and including the high cutoff, is "decent".
high_corr_cutoff = 0.9
mid_corr_cutoff = 0.65

high_info = {'feature1': [], 'feature2': [], 'r': []}
mid_info = {'feature1': [], 'feature2': [], 'r': []}

# Visit every unordered pair of feature columns exactly once.
for i, first in enumerate(cols):
    for second in cols[i + 1:]:
        correl_signif = stats.pearsonr(df_X[first], df_X[second])
        r = correl_signif[0]
        strength = abs(r)

        # The high cutoff is tested first, so the mid test only needs a lower
        # bound. A pair with |r| exactly equal to high_corr_cutoff therefore
        # lands in the "decent" table instead of being dropped from both.
        if strength > high_corr_cutoff:
            bucket = high_info
        elif strength > mid_corr_cutoff:
            bucket = mid_info
        else:
            # Weak (or undefined, i.e. NaN) correlation: not reported.
            continue

        bucket['feature1'].append(first)
        bucket['feature2'].append(second)
        bucket['r'].append(r)

high_info_df = pd.DataFrame(data=high_info)
print(f'High Correlations:\n\n{high_info_df}')

print('\n')

mid_info_df = pd.DataFrame(data=mid_info)
print('')
print(f'Decent Correlations:\n\n{mid_info_df}')

High Correlations:

  feature1 feature2         r
0       fg      pts  0.944255
1       ft      fta  0.979005
2      drb      trb  0.931680



Decent Correlations:

  feature1 feature2         r
0       fg      fga  0.799839
1       fg   fg_pct  0.674855
2       fg      fg3  0.738251
3      fga     fg3a  0.745016
4      fga      pts  0.766746
5      fg3     fg3a  0.788780
6      fg3      pts  0.808183
7     fg3a      pts  0.695964


In [15]:
# Pearson r between each feature column and the target.
feature_names = df_X.columns.tolist()

label_corr_info = {
    'feature1': feature_names,
    'feature2': ['plus minus'] * len(feature_names),
    # pearsonr returns (r, p-value); only r is kept.
    'r': [stats.pearsonr(df_X[name], df_y)[0] for name in feature_names],
}

label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

    feature1    feature2         r
0         fg  plus minus  0.222720
1        fga  plus minus  0.054447
2     fg_pct  plus minus  0.318048
3        fg3  plus minus  0.298575
4       fg3a  plus minus  0.229519
5    fg3_pct  plus minus  0.232160
6         ft  plus minus  0.158737
7        fta  plus minus  0.156827
8     ft_pct  plus minus  0.165189
9        orb  plus minus  0.005484
10       drb  plus minus  0.191611
11       trb  plus minus  0.178597
12       ast  plus minus  0.242444
13       stl  plus minus  0.162664
14       blk  plus minus  0.046458
15        pf  plus minus -0.124485
16       tov  plus minus -0.133273
17       pts  plus minus  0.272230
18  tot_time  plus minus -0.083537


# Feature Engineering

### New Columns

In [21]:
def _nonzero(series):
    """Return ``series`` with zeros replaced by ``np.inf``.

    Dividing a finite value by the result gives 0 where the original
    denominator was 0, instead of a division by zero.
    """
    return series.replace(0, np.inf)


# Every input is read from df_X (rather than a mix of df and df_X) so the
# engineered columns depend only on the feature matrix they are added to.

# Shared denominators: fga + 0.44 * fta, and the same plus turnovers.
shot_attempts = df_X['fga'] + 0.44 * df_X['fta']
attempts_plus_tov = shot_attempts + df_X['tov']

# Ratios; a zero denominator yields 0.
df_X['stl_tov'] = df_X['stl'].div(_nonzero(df_X['tov']))
df_X['stops/tov'] = (df_X['stl'] + df_X['blk']).div(_nonzero(df_X['tov']))
df_X['3/pts'] = df_X['fg3'].div(_nonzero(df_X['fg']))
df_X['3a'] = df_X['fg3a'].div(_nonzero(df_X['fga']))

# Each row's value relative to the column mean.
df_X['fg3avg'] = df_X['fg3'] / df_X['fg3'].mean()
df_X['pts_avg'] = df_X['pts'] / df_X['pts'].mean()
df_X['fg_avg'] = df_X['fg'] / df_X['fg'].mean()
df_X['fg_pct_avg'] = df_X['fg_pct'] / df_X['fg_pct'].mean()

# tot_time is in minutes, so despite the column name this is points per 60
# minutes played. The scale is kept as-is (Pearson r is scale-invariant).
# tot_time now gets the same zero guard as every other denominator.
df_X['pts_per_min'] = (df_X['pts'] / _nonzero(df_X['tot_time'])) * 60

# possessions responsible for
df_X['pos_responsible_for'] = df_X['fg'] + df_X['ast'] + df_X['stl'] + df_X['blk'] - df_X['tov']

df_X['efg'] = (df_X['fg'] + (0.5 * df_X['fg3'])) / _nonzero(df_X['fga'])
df_X['tov%'] = 100 * df_X['tov'] / _nonzero(attempts_plus_tov)
df_X['true_shooting%'] = df_X['pts'] / _nonzero(2 * shot_attempts)
df_X['ppp'] = df_X['pts'] / _nonzero(attempts_plus_tov)

In [22]:
# Engineered columns to correlate against the target.
# NOTE(review): 'net_pos' is not created by the feature-engineering cell visible
# in this notebook — confirm the cell that defines it still exists, otherwise
# looking it up in df_X will fail on a fresh kernel.
new_cols = [
    'net_pos',
    'pos_responsible_for',
    'pts_per_min',
    'fg_avg',
    'fg_pct_avg',
    'pts_avg',
    'ppp',
    'true_shooting%',
    'tov%',
    'efg',
    'fg3avg',
    '3a',
    'stops/tov',
    '3/pts',
    'stl_tov',
]

In [23]:
# Pearson r between each engineered column and the target; pearsonr returns
# (r, p-value) and only r is kept.
pearson_rs = [stats.pearsonr(df_X[name], df_y)[0] for name in new_cols]

label_corr_info = {
    'feature1': list(new_cols),
    'feature2': ['Plus Minus' for _ in new_cols],
    'r': pearson_rs,
}

label_info_df = pd.DataFrame(label_corr_info)
print(label_info_df)

               feature1    feature2         r
0               net_pos  Plus Minus  0.246228
1   pos_responsible_for  Plus Minus  0.377271
2           pts_per_min  Plus Minus  0.365820
3                fg_avg  Plus Minus  0.222720
4            fg_pct_avg  Plus Minus  0.318048
5               pts_avg  Plus Minus  0.272230
6                   ppp  Plus Minus  0.412332
7        true_shooting%  Plus Minus  0.379701
8                  tov%  Plus Minus -0.185552
9                   efg  Plus Minus  0.367933
10               fg3avg  Plus Minus  0.298575
11                   3a  Plus Minus  0.320089
12            stops/tov  Plus Minus  0.173569
13                3/pts  Plus Minus  0.278758
14              stl_tov  Plus Minus  0.166153
