In [127]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from matplotlib import colors
from matplotlib.ticker import PercentFormatter
# regression analysis
from linearmodels import IV2SLS 

import sys
import os
from pathlib import Path

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\utils")

from players_season_builder import *


In [105]:
player_path = r"\Users\sebas\Desktop\UChicago - Q6\Sports Analytics\sports_analytics_project\data\BDB_Player.xlsx"
team_path = r"\Users\sebas\Desktop\UChicago - Q6\Sports Analytics\sports_analytics_project\data\BDB_Combo.xlsx"

df_player = pd.read_excel(player_path)
df_team = pd.read_excel(team_path)

In [106]:
df_team.columns
df_team = df_team[['DATASET', 'DATE', 'TEAMS', 'VENUE','OEFF', 'DEFF', 'REST DAYS', 'DATE-DIFF']]
df_team.rename(columns = {
    'TEAMS':'OWN TEAM',
    'OEFF': 'TEAM_OEFF',
    'DEFF':'TEAM_DEFF',
    'REST DAYS': 'TEAM_REST_DAYS',
    'DATE-DIFF' : 'TEAM_DATE_DIFF'
}, inplace = True )


In [122]:
t_df = pd.merge(
    df_player,
    df_team,
    on = ['DATASET', 'DATE','OWN TEAM','VENUE']
)

In [123]:
# Add Team Dummies for TEAM_DATE_DIFF
t_df = pd.concat([
    t_df,
    pd.get_dummies(
        t_df['TEAM_DATE_DIFF'],
        prefix = "T_REST"
        )
], axis=1)
t_df.columns

Index(['DATASET', 'DATE', 'PLAYER FULL NAME', 'POSITION', 'OWN TEAM',
       'OPP TEAM', 'VENUE', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR',
       'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'PER', 'DATE-DIFF',
       'RR VAL', 'RR SERIES', 'AVG_PER', 'OMIT_PER', 'SAME CITY', 'TRAVEL',
       '1.0 days', '10.0 days', '11.0 days', '12.0 days', '13.0 days',
       '14+ days', '14.0 days', '2.0 days', '3.0 days', '4.0 days', '5.0 days',
       '6.0 days', '7.0 days', '8.0 days', '9.0 days', 'Season Start days',
       'H', 'R', 'H-M1', 'H-M2', 'H-M3', 'R-M1', 'R-M2', 'R-M3', 'M1', 'M2',
       'M3', 'OEFF - All games', 'DEFF - All games', 'OEFF - Not included',
       'DEFF - Not included', 'Bubble', 'TEAM_OEFF', 'TEAM_DEFF',
       'TEAM_REST_DAYS', 'TEAM_DATE_DIFF', 'T_REST_1.0 days',
       'T_REST_10.0 days', 'T_REST_11.0 days', 'T_REST_14+ days',
       'T_REST_2.0 days', 'T_REST_3.0 days', 'T_REST_4.0 days',
       'T_REST_5.0 days', 'T_REST_6.0 days', 'T_REST_7.0 day

In [124]:
t_df['PER_DIFF'] = t_df['PER'] - t_df['OMIT_PER']

In [130]:
y_m1 = np.array(t_df[['M1']])
y_m2 = np.array(t_df[['M2']])
y_m3 = np.array(t_df[['M3']])

x = np.array(
   t_df.loc[:, t_df.columns.str.startswith("T_REST")]
)

# Gross
#iv_reg = IV2SLS.from_formula(
#   "PER_DIFF ~ 1 + H + TRAVEL + 'OEFF - Not included' +'DEFF - Not included' + 'OMIT_PER'+'1.0 days'+'2.0 days'+'3.0 days'+'4.0 days'+'[M1 ~ 'T_REST_1.0 days' + 'T_REST_2.0 days' + 'T_REST_3.0 days' + 'T_REST_4.0 days']'+'[M2 ~ 'T_REST_1.0 days' + 'T_REST_2.0 days' + 'T_REST_3.0 days' + 'T_REST_4.0 days']'+'[M3 ~ 'T_REST_1.0 days' + 'T_REST_2.0 days' + 'T_REST_3.0 days' + 'T_REST_4.0 days']'", t_df
#   ).fit()
#iv_reg.summary

#+ exper + black + south + married + smsa + 


In [114]:
# 
m1_model = sm.OLS(y_m1 , x)
m2_model = sm.OLS(y_m2 , x)
m3_model = sm.OLS(y_m3 , x)


In [115]:
m1_results = m1_model.fit()
m2_results = m2_model.fit()
m3_results = m3_model.fit()

In [121]:
np.array(m1_results.predict())

array([-5.62699532e-15, -5.62699532e-15, -5.62699532e-15, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [116]:
t_df['Pred_M1'] = m1_results.predict
t_df['Pred_M2'] = m2_results.predict
t_df['Pred_M3'] = m3_results.predict

In [118]:
t_df[['Pred_M1']]

Unnamed: 0,Pred_M1
0,<bound method Results.predict of <statsmodels....
1,<bound method Results.predict of <statsmodels....
2,<bound method Results.predict of <statsmodels....
3,<bound method Results.predict of <statsmodels....
4,<bound method Results.predict of <statsmodels....
...,...
140364,<bound method Results.predict of <statsmodels....
140365,<bound method Results.predict of <statsmodels....
140366,<bound method Results.predict of <statsmodels....
140367,<bound method Results.predict of <statsmodels....


In [88]:
y = np.array( t_df[['PER_DIFF']] )
x = np.array(
   t_df[[ 
      'H', 'TRAVEL', 'OEFF - Not included','DEFF - Not included', 'OMIT_PER', '1.0 days', '2.0 days', '3.0 days', '4.0 days', 'Pred_M1', 'Pred_M2', 'Pred_M3'
   ]]
)

In [93]:
y

array([[ -9.36991762],
       [ -1.08424995],
       [ 11.34427899],
       ...,
       [ -4.26208511],
       [-10.35267142],
       [  0.64030858]])

In [91]:
model = sm.OLS(y, x)
results = model.fit()

results

TypeError: '>=' not supported between instances of 'method' and 'method'