In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from matplotlib import colors
from matplotlib.ticker import PercentFormatter
# regression analysis


import sys
import os
from pathlib import Path

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\utils")

from players_season_builder import *


For the independent variables, we have:
    Player Quality – Estimated as being the constant in the regression model
    Home Court Advantage – A binary variable that indicates if the player is at home or on the road
    Opp. Quality – The opponent’s quality will be estimated by 2 variables: its offensive and its defensive efficiency (points allowed/scored per 100 possessions)
    Rest Level – To estimate a player’s rest level, we create variables to reflect the activity for the past 3 days. There are two approaches for these variables:
    Minutes – There are 3 variables (non-dummy) that reflect how many minutes the player was on court for each of the past 3 days
    Days-off – The variables reflect if the player entered a game, no matter how much he played (e.g., the dummy “011”, means that the team has played 2 and 3 days ago, and has not played yesterday)

This results in the following regression model (using the “minutes” approach for rest level):

    PER= α + C * H + ( C * OO + C * OD ) + ( C * M1 + C * M2 + C * M3 ) + nu + err

Where:
    PER = Player Efficiency Rating (for the game)
    α = Constant (that can be interpreted as a player’s baseline performance)
    H = Binary variable that is 1 if the player is playing at home, and 0 if he is playing on the road
    OO = Opponent’s offensive efficiency during the season
    OD = Opponent’s team defensive efficiency during the season
    M1/M2/M3 = Number of minutes the player has played 1, 2, and 3 days before the game
    nu = PER for all other games in the season
    


In [15]:
player_path = r"\Users\sebas\Desktop\UChicago - Q6\Sports Analytics\sports_analytics_project\data\BDB_Player.xlsx"

df_player = pd.read_excel(player_path)


In [25]:
df_player.columns = ['DATASET', 'DATE', 'PLAYER FULL NAME', 'POSITION', 'OWN TEAM',
       'OPP TEAM', 'VENUE', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'OR',
       'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS', 'PER', 'DATE-DIFF',
       'RR VAL', 'RR SERIES', 'S_PER', 'I_PER', 'SAME CITY', 'TRAVEL',
       '1.0 days', '10.0 days', '11.0 days', '12.0 days', '13.0 days',
       '14+ days', '14.0 days', '2.0 days', '3.0 days', '4.0 days', '5.0 days',
       '6.0 days', '7.0 days', '8.0 days', '9.0 days', 'Season Start days',
       'H', 'R', 'H-M1', 'H-M2', 'H-M3', 'R-M1', 'R-M2', 'R-M3', 'M1', 'M2',
       'M3', 'S_OEFF', 'S_DEFF', 'I_OEFF', 'I_DEFF']
print(df_player.shape)

(173750, 62)


In [31]:


df_filter = df_player[df_player['I_PER'].notnull()]
# Define LHS
y = np.array(df_filter[['PER']])

# Define RHS
# Missing - 'ACCURATE' Travel 
# Missing - OPP Offensive efficiency
# Missing - OPP Defensive efficiency
# Missing - Am I at Home?
# Missing - Minutes played in last 3 days?

x = np.array(
    df_filter[[
       'H', 'I_OEFF','I_DEFF', 'I_PER', 'M1', 'M2', 'M3', '1.0 days', '2.0 days', '3.0 days', '4.0 days', '5.0 days', '6.0 days', '7.0 days']]
)
# '8.0 days', '9.0 days', '10.0 days', '11.0 days', '12.0 days', '13.0 days','14.0 days', '14+ days'
x = sm.add_constant(x)

In [43]:
player_per_model = sm.OLS(y , x)
results = player_per_model.fit()
results.params

array([-1.01617527e+01,  6.03034103e-01, -4.04786768e-02,  1.41621478e-01,
        8.95822986e-01,  5.90245733e-03,  7.80308455e-03,  1.35402987e-03,
       -1.41956283e-01, -2.31273259e-02,  1.17779590e-01,  7.83426624e-02,
        5.50130352e-02, -2.90476557e-01, -2.22152203e-02])

In [51]:
results.summary()
# pd.DataFrame(
#    results.tvalues, results.params, ['Intercept','H', 'I_OEFF','I_DEFF', 'I_PER', 'M1', 'M2', 'M3', '1.0 days', '2.0 days', '3.0 days', '4.0 days', '5.0 days', '6.0 days', '7.0 days'])

0,1,2,3
Dep. Variable:,y,R-squared:,0.2
Model:,OLS,Adj. R-squared:,0.2
Method:,Least Squares,F-statistic:,3086.0
Date:,"Sun, 01 May 2022",Prob (F-statistic):,0.0
Time:,21:58:30,Log-Likelihood:,-575640.0
No. Observations:,173289,AIC:,1151000.0
Df Residuals:,173274,BIC:,1151000.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-10.1618,0.702,-14.475,0.000,-11.538,-8.786
x1,0.6030,0.033,18.425,0.000,0.539,0.667
x2,-0.0405,0.004,-9.022,0.000,-0.049,-0.032
x3,0.1416,0.005,29.905,0.000,0.132,0.151
x4,0.8958,0.004,199.622,0.000,0.887,0.905
x5,0.0059,0.004,1.617,0.106,-0.001,0.013
x6,0.0078,0.003,3.048,0.002,0.003,0.013
x7,0.0014,0.001,1.036,0.300,-0.001,0.004
x8,-0.1420,0.127,-1.120,0.263,-0.390,0.107

0,1,2,3
Omnibus:,4105.58,Durbin-Watson:,2.023
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4737.664
Skew:,0.339,Prob(JB):,0.0
Kurtosis:,3.444,Cond. No.,6730.0
