In [136]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from matplotlib import colors
from matplotlib.ticker import PercentFormatter
# regression analysis

import stargazer
import sys
import os
from pathlib import Path

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\utils")

from players_season_builder import *


For the independent variables, we have:
    Player Quality – Estimated as being the constant in the regression model
    Home Court Advantage – A binary variable that indicates if the player is at home or on the road
    Opp. Quality – The opponent’s quality will be estimated by 2 variables: its offensive and its defensive efficiency (points allowed/scored per 100 possessions)
    Rest Level – To estimate a player’s rest level, we create variables to reflect the activity for the past 3 days. There are two approaches for these variables:
    Minutes – There are 3 variables (non-dummy) that reflect how many minutes the player was on court for each of the past 3 days
    Days-off – The variables reflect if the player entered a game, no matter how much he played (e.g., the dummy “011”, means that the team has played 2 and 3 days ago, and has not played yesterday)

This results in the following regression model (using the “minutes” approach for rest level):

    PER= α + C * H + ( C * OO + C * OD ) + ( C * M1 + C * M2 + C * M3 ) + nu + err

Where:
    PER = Player Efficiency Rating (for the game)
    α = Constant (that can be interpreted as a player’s baseline performance)
    H = Binary variable that is 1 if the player is playing at home, and 0 if he is playing on the road
    OO = Opponent’s offensive efficiency during the season
    OD = Opponent’s team defensive efficiency during the season
    M1/M2/M3 = Number of minutes the player has played 1, 2, and 3 days before the game
    nu = PER for all other games in the season
    


In [8]:
player_path = r"\Users\sebas\Desktop\UChicago - Q6\Sports Analytics\sports_analytics_project\data\BDB_Player.xlsx"
df_player = pd.read_excel(player_path)

In [9]:
df_player.columns

Index(['DATASET', 'DATE', 'PLAYER FULL NAME', 'POSITION', 'OWN TEAM',
       'OPP TEAM', 'VENUE', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR',
       'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'PER', 'DATE-DIFF',
       'RR VAL', 'RR SERIES', 'AVG_PER', 'OMIT_PER', 'SAME CITY', 'TRAVEL',
       '1.0 days', '10.0 days', '11.0 days', '12.0 days', '13.0 days',
       '14+ days', '14.0 days', '2.0 days', '3.0 days', '4.0 days', '5.0 days',
       '6.0 days', '7.0 days', '8.0 days', '9.0 days', 'Season Start days',
       'H', 'R', 'H-M1', 'H-M2', 'H-M3', 'R-M1', 'R-M2', 'R-M3', 'M1', 'M2',
       'M3', 'OEFF - All games', 'DEFF - All games', 'OEFF - Not included',
       'DEFF - Not included', 'Bubble'],
      dtype='object')

In [12]:
df_player.columns = ['DATASET', 'DATE', 'PLAYER FULL NAME', 'POSITION', 'OWN TEAM',
       'OPP TEAM', 'VENUE', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR',
       'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'PER', 'DATE-DIFF',
       'RR VAL', 'RR SERIES', 'S_PER', 'I_PER', 'SAME CITY', 'TRAVEL',
       '1.0 days', '10.0 days', '11.0 days', '12.0 days', '13.0 days',
       '14+ days', '14.0 days', '2.0 days', '3.0 days', '4.0 days', '5.0 days',
       '6.0 days', '7.0 days', '8.0 days', '9.0 days', 'Season Start days',
       'H', 'R', 'H-M1', 'H-M2', 'H-M3', 'R-M1', 'R-M2', 'R-M3', 'M1', 'M2',
       'M3', 'S_OEFF', 'S_DEFF', 'I_OEFF', 'I_DEFF','Bubble']
print(df_player['DATASET'].unique())

['2006-2007 Regular Season' '2007-2008 Regular Season'
 '2008-2009 Regular Season' '2009-2010 Regular Season'
 '2010-2011 Regular Season' '2011-2012 Regular Season'
 '2012-2013 Regular Season' '2013-2014 Regular Season'
 '2014-2015 Regular Season' '2015-2016 Regular Season'
 '2016-2017 Regular Season' '2017-2018 Regular Season'
 '2018-2019 Regular Season' '2019-2020 Regular Season']


In [24]:
# Remove 2011-2012 Season
df_clean = df_player[df_player['DATASET'] != '2011-2012 Regular Season']
# Remove Bubble Seasons (2019-2020)
df_clean = df_player[df_player['DATASET'] != '2019-2020 Regular Season']
# Create PER Difference Column
df_clean['PER_DIFF'] = df_clean['PER'] - df_clean['I_PER']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['PER_DIFF'] = df_clean['PER'] - df_clean['I_PER']


In [120]:
# LHS
y = np.array(df_clean[['PER_DIFF']])
y_per = np.array(df_clean[['PER']])
# Define RHS
x = np.array(
    df_clean[[
       'H', 'TRAVEL', 'I_OEFF','I_DEFF', 'M1', 'M2', 'M3']]
)
x_glp = np.array(
    df_clean[[
       'H', 'TRAVEL', 'I_OEFF','I_DEFF', '1.0 days', '2.0 days', '3.0 days', '4.0 days']]
)
x_combo = np.array(
    df_clean[[
       'H', 'TRAVEL', 'I_OEFF','I_DEFF', 'M1', 'M2', 'M3','1.0 days', '2.0 days', '3.0 days', '4.0 days'
       ]]
)

x2 = np.array(
    df_clean[[
       'H', 'TRAVEL', 'I_OEFF','I_DEFF', 'I_PER','M1', 'M2', 'M3']]
)
x2_glp = np.array(
    df_clean[[
       'H', 'TRAVEL', 'I_OEFF','I_DEFF','I_PER', '1.0 days', '2.0 days', '3.0 days', '4.0 days']]
)
x2_combo = np.array(
    df_clean[[
       'H', 'TRAVEL', 'I_OEFF','I_DEFF', 'I_PER','M1', 'M2', 'M3','1.0 days', '2.0 days', '3.0 days', '4.0 days'
       ]]
)


#x = sm.add_constant(x)
#x_glp = sm.add_constant(x_glp)
#x_combo = sm.add_constant(x_combo)

x2 = sm.add_constant(x2)
x2_glp = sm.add_constant(x2_glp)
x2_combo = sm.add_constant(x2_combo)

In [121]:
player_per_model = sm.OLS(y , x)
results = player_per_model.fit()

player_lgp_mdel = sm.OLS(y, x_glp)
results_lgp = player_lgp_mdel.fit()

player_combo_mdel = sm.OLS(y, x_combo)
results_combo = player_combo_mdel.fit()

df_model = pd.DataFrame(
    { 
      'coeff' : ['H', 'TRAVEL', 'I_OEFF','I_DEFF', 'M1', 'M2', 'M3'],
      'values' : results.params,
      't_value' : results.tvalues
    })

df_lgp = pd.DataFrame(
  { 
      'coeff' : ['H', 'TRAVEL', 'I_OEFF','I_DEFF', '1.0 days', '2.0 days', '3.0 days', '4.0 days'],
      'values' : results_lgp.params,
      't_value' : results_lgp.tvalues
    }
)

df_combo = pd.DataFrame(
  { 
      'coeff' : ['H', 'TRAVEL', 'I_OEFF','I_DEFF', 'M1', 'M2', 'M3','1.0 days', '2.0 days', '3.0 days', '4.0 days'],
      'values' : results_combo.params,
      't_value' : results_combo.tvalues
    }
)



In [122]:
df_model['model'] = 'Minutes'
df_model['R_Squared'] = results.rsquared

df_lgp['model'] = 'LGP'
df_model['R_Squared'] = results_lgp.rsquared

df_combo['model'] = 'Minutes_LGP'
df_model['R_Squared'] = results_combo.rsquared


In [123]:
values_res = pd.concat(
    [df_model,
    df_lgp,
    df_combo],
    axis=0
).pivot(
    index = 'model',
    columns = 'coeff'
)['values']
tval_res = pd.concat(
    [df_model,
    df_lgp,
    df_combo],
    axis=0
).pivot(
    index = 'model',
    columns = 'coeff'
)['t_value']

In [124]:
values_res.fillna('-',inplace=True)
values_res = values_res.reindex(columns = ['H','TRAVEL','I_OEFF','I_DEFF','M1','M2','M3','1.0 days','2.0 days','3.0 days','4.0 days'])
tval_res.fillna('-',inplace=True)
tval_res = tval_res.reindex(columns = ['H','TRAVEL','I_OEFF','I_DEFF','M1','M2','M3','1.0 days','2.0 days','3.0 days','4.0 days'])


In [125]:
results.rsquared

0.006929444261609352

In [126]:
print()
values_res




coeff,H,TRAVEL,I_OEFF,I_DEFF,M1,M2,M3,1.0 days,2.0 days,3.0 days,4.0 days
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LGP,0.640571,-0.025992,-0.094503,0.091208,-,-,-,-0.065751,0.122708,0.026035,0.124408
Minutes,0.639434,-0.028695,-0.094354,0.091327,-0.002488,0.002958,-0.000183,-,-,-,-
Minutes_LGP,0.639869,-0.026301,-0.094546,0.091084,-0.000763,0.003633,-0.000142,-0.019104,0.019587,0.044616,0.118117


In [116]:
tval_res

coeff,Intercept,H,TRAVEL,I_OEFF,I_DEFF,M1,M2,M3,1.0 days,2.0 days,3.0 days,4.0 days
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LGP,-14.161372,14.867854,0.454785,-8.027648,26.564057,-,-,-,0.117995,2.865778,1.035504,1.903069
Minutes,-14.144014,14.799669,0.312921,-7.988741,26.581656,-0.625585,3.012341,0.625145,-,-,-,-
Minutes_LGP,-14.214121,14.797988,0.390837,-7.985118,26.571383,0.27317,1.644778,0.473074,-0.084553,0.286637,0.910724,1.849732


In [101]:
player_per_model2 = sm.OLS(y_per , x2)
results2 = player_per_model2.fit()

player_lgp_mdel2 = sm.OLS(y_per, x2_glp)
results_lgp2 = player_lgp_mdel2.fit()

player_combo_mdel2 = sm.OLS(y_per, x2_combo)
results_combo2 = player_combo_mdel2.fit()

df_model2 = pd.DataFrame(
    { 
      'coeff' : ['Intercept','H', 'TRAVEL', 'I_OEFF','I_DEFF','I_PER', 'M1', 'M2', 'M3'],
      'values' : results2.params,
      't_value' : results2.tvalues
    })

df_lgp2 = pd.DataFrame(
  { 
      'coeff' : ['Intercept','H', 'TRAVEL', 'I_OEFF','I_DEFF', 'I_PER','1.0 days', '2.0 days', '3.0 days', '4.0 days'],
      'values' : results_lgp2.params,
      't_value' : results_lgp2.tvalues
    }
)

df_combo2 = pd.DataFrame(
  { 
      'coeff' : ['Intercept','H', 'TRAVEL', 'I_OEFF','I_DEFF', 'I_PER', 'M1', 'M2', 'M3','1.0 days', '2.0 days', '3.0 days', '4.0 days'],
      'values' : results_combo2.params,
      't_value' : results_combo2.tvalues
    }
)

In [102]:
df_model2['model'] = 'Minutes'
df_model2['R_Squared'] = results2.rsquared

df_lgp2['model'] = 'LGP'
df_model2['R_Squared'] = results_lgp2.rsquared

df_combo2['model'] = 'Minutes_LGP'
df_model2['R_Squared'] = results_combo2.rsquared

In [106]:
values_res2 = pd.concat(
    [df_model2,
    df_lgp2,
    df_combo2],
    axis=0
).pivot(
    index = 'model',
    columns = 'coeff'
)['values']
tval_res2 = pd.concat(
    [df_model2,
    df_lgp2,
    df_combo2],
    axis=0
).pivot(
    index = 'model',
    columns = 'coeff'
)['t_value']

values_res2.fillna('-',inplace=True)
values_res2 = values_res2.reindex(columns = ['Intercept','H','TRAVEL','I_OEFF','I_DEFF','I_PER','M1','M2','M3','1.0 days','2.0 days','3.0 days','4.0 days'])
tval_res2.fillna('-',inplace=True)
tval_res2 = tval_res2.reindex(columns = ['Intercept','H','TRAVEL','I_OEFF','I_DEFF','I_PER','M1','M2','M3','1.0 days','2.0 days','3.0 days','4.0 days'])


In [109]:
print(    [results2.rsquared,
    results_lgp2.rsquared,
    results_combo2.rsquared])
values_res2

[0.20325828861597173, 0.20325537774445634, 0.20329295114039359]


coeff,Intercept,H,TRAVEL,I_OEFF,I_DEFF,I_PER,M1,M2,M3,1.0 days,2.0 days,3.0 days,4.0 days
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LGP,-11.681857,0.682442,0.023812,-0.040695,0.151684,0.934486,-,-,-,0.082947,0.253428,0.14017,0.203151
Minutes,-11.698864,0.678572,0.010885,-0.040335,0.151999,0.934135,0.000256,0.005921,0.00186,-,-,-,-
Minutes_LGP,-11.784737,0.680808,0.017809,-0.040338,0.151919,0.933654,0.002516,0.007276,0.001361,0.023071,0.05019,0.132932,0.199098


In [108]:
tval_res2

coeff,Intercept,H,TRAVEL,I_OEFF,I_DEFF,I_PER,M1,M2,M3,1.0 days,2.0 days,3.0 days,4.0 days
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
LGP,-13.762888,14.909923,0.446227,-7.683488,26.882742,178.507484,-,-,-,1.172938,4.09529,1.920929,2.224374
Minutes,-13.771703,14.803958,0.204044,-7.611682,26.929397,178.299463,0.153337,4.293433,1.523525,-,-,-,-
Minutes_LGP,-13.860529,14.826736,0.332094,-7.611661,26.910877,177.966044,0.544441,2.303916,0.934083,0.142784,0.466107,1.56051,2.174001


In [152]:

#test

AttributeError: 'numpy.ndarray' object has no attribute 'index'