In [18]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from matplotlib import colors
from matplotlib.ticker import PercentFormatter
# regression analysis

from stargazer.stargazer import Stargazer
import sys
import os
from pathlib import Path

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\utils")

from players_season_builder import *
from modeling import *


For the independent variables, we have:
    Player Quality – Estimated as being the constant in the regression model
    Home Court Advantage – A binary variable that indicates if the player is at home or on the road
    Opp. Quality – The opponent’s quality will be estimated by 2 variables: its offensive and its defensive efficiency (points allowed/scored per 100 possessions)
    Rest Level – To estimate a player’s rest level, we create variables to reflect the activity for the past 3 days. There are two approaches for these variables:
    Minutes – There are 3 variables (non-dummy) that reflect how many minutes the player was on court for each of the past 3 days
    Days-off – The variables reflect if the player entered a game, no matter how much he played (e.g., the dummy “011”, means that the team has played 2 and 3 days ago, and has not played yesterday)

This results in the following regression model (using the “minutes” approach for rest level):

    PER= α + C * H + ( C * OO + C * OD ) + ( C * M1 + C * M2 + C * M3 ) + nu + err

Where:
    PER = Player Efficiency Rating (for the game)
    α = Constant (that can be interpreted as a player’s baseline performance)
    H = Binary variable that is 1 if the player is playing at home, and 0 if he is playing on the road
    OO = Opponent’s offensive efficiency during the season
    OD = Opponent’s team defensive efficiency during the season
    M1/M2/M3 = Number of minutes the player has played 1, 2, and 3 days before the game
    nu = PER for all other games in the season
    


In [2]:
player_path = r"\Users\sebas\Desktop\UChicago - Q6\Sports Analytics\sports_analytics_project\data\BDB_Player.xlsx"
df_player = pd.read_excel(player_path)

In [3]:
df_player.columns = ['DATASET', 'DATE', 'PLAYER FULL NAME', 'POSITION', 'OWN TEAM',
       'OPP TEAM', 'VENUE', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR',
       'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'PER', 'DATE-DIFF',
       'RR VAL', 'RR SERIES', 'S_PER', 'I_PER', 'SAME_CITY', 'TRAVEL',
       '1_days', '10_days', '11_days', '12_days', '13_days',
       '14+_days', '14_days', '2_days', '3_days', '4_days', '5_days',
       '6_days', '7_days', '8_days', '9_days', 'Season_Start',
       'H', 'R', 'H-M1', 'H-M2', 'H-M3', 'R-M1', 'R-M2', 'R-M3', 'M1', 'M2',
       'M3', 'S_OEFF', 'S_DEFF', 'I_OEFF', 'I_DEFF','Bubble']


In [4]:
# Remove 'Bad' Seasons

# Remove 2011-2012 Season
df_clean = df_player[df_player['DATASET'] != '2011-2012 Regular Season']
# Remove Bubble Seasons (2019-2020)
df_clean = df_player[df_player['DATASET'] != '2019-2020 Regular Season']
# Create PER Difference Column
df_clean['PER_DIFF'] = df_clean['PER'] - df_clean['I_PER']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['PER_DIFF'] = df_clean['PER'] - df_clean['I_PER']


In [28]:
rhs = [ 'H', 'TRAVEL', 'I_OEFF','I_DEFF','I_PER', 'M1', 'M2', 'M3','1_days', '2_days', '3_days', '4_days']
tbl = ['constant']
tbl = tbl + rhs
res = ols_base(
        data = df_clean, 
        y = 'PER', 
        x = rhs, 
        constant = True)


In [50]:
# Wald Test
# res.wald_test_terms()
hypothesis = 'M1 = 0, M2 = 0, M3 = 0'
res.wald_test(hypothesis)


PatsyError: unrecognized token in constraint
    M1 = 0, M2 = 0, M3 = 0
    ^

In [19]:
formula = "PER ~ 1 + H + TRAVEL + I_OEFF + I_DEFF + 1_days + 2_days + 3_days + 4_days + [M1 ~ ]"

TypeError: object of type 'RegressionResultsWrapper' has no len()