In [3]:
import pandas as pd
import polars as pl
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import polars.selectors as cs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

df = pl.read_csv('../input_data/model_data.csv')
print(df.shape)
display(df.head())

(748, 21)


G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,SEED,POSTSEASON_WINS
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
40,33,123.3,94.9,0.9531,52.6,48.1,15.4,18.2,40.7,30.0,32.3,30.4,53.9,44.6,32.7,36.2,71.7,8.6,1,5
40,36,129.1,93.6,0.9758,54.8,47.7,12.4,15.8,32.1,23.7,36.2,22.4,54.8,44.7,36.5,37.5,59.3,11.3,1,5
40,33,114.4,90.4,0.9375,53.9,47.7,14.0,19.5,25.5,24.9,30.7,30.0,54.7,46.8,35.2,33.2,65.9,6.9,3,5
38,31,115.2,85.2,0.9696,53.5,43.0,17.7,22.8,27.4,28.7,32.9,36.6,52.8,41.9,36.5,29.7,67.5,7.0,3,5
39,37,117.8,86.3,0.9728,56.6,41.1,16.2,17.1,30.0,26.2,39.0,26.9,56.3,40.0,38.2,29.0,71.5,7.7,1,5


## Create a bunch of univarirate regression models and see which is best


https://www.datacamp.com/tutorial/sklearn-linear-regression

In [4]:
def linear_reg_r2(df, x_var):

    ## create a pipeline to automatically do a standard scaler and linear regression
    linear_reg_pipe = Pipeline([('scaler', StandardScaler()), ('linreg', LinearRegression())])

    x = df.select(x_var)
    y = df.select('POSTSEASON_WINS')

    ## Standard 80/20 test/train split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=84305)

    ## call the pipeline to get the score
    r2 = linear_reg_pipe.fit(x_train, y_train).score(x_test, y_test)

    return r2

In [5]:
test_cols_r2 = {}
test_cols = df.select(pl.exclude('POSTSEASON_WINS')).columns
for x_var in test_cols:
    r2 = linear_reg_r2(df, x_var)
    test_cols_r2[x_var] = round(r2, 3)

test_cols_r2

{'G': 0.215,
 'W': 0.224,
 'ADJOE': 0.088,
 'ADJDE': 0.154,
 'BARTHAG': 0.151,
 'EFG_O': -0.12,
 'EFG_D': 0.069,
 'TOR': -0.064,
 'TORD': -0.004,
 'ORB': 0.057,
 'DRB': -0.018,
 'FTR': -0.01,
 'FTRD': -0.022,
 '2P_O': -0.118,
 '2P_D': 0.042,
 '3P_O': -0.031,
 '3P_D': 0.03,
 'ADJ_T': -0.003,
 'WAB': 0.222,
 'SEED': 0.195}

Things we learn:
* Top performers: 
    * G: games, positive correlation (makes sense, more games = made it further in conf tourney) 
    * W: wins, makes sense, more regular season wins are better
    * ADJDE: adjusted defense, perhaps defense does win championships?
    * BARTHAG: Power rating - should be a good indicator if its accurate
    * WAB: wins above bubble - makes sense, if you are beating teams that barely missed the tourney you should beat a decent amount that did make it
    * SEED: we already saw top seeds are going to win more

* Note: there might be some collinearity going on when dive deeper - won't good seeds be good at all this other stuff?

* Poor performers: TORD, FTR, ADJ_T, Year
    * TORD: turnover percentage committed (steal rate)
    * FTR: free throw rate - how often you get to the line - officiating can be different
    * ADJ_T: adjusted tempo - different teams have different styles, no style is superior overall

* A lot of the O stats are negatively correlated with wins, thats a bit unexpected

More variables that could be helpful in future analysis now that we are thinking about some of these:
* Win percentage (W/G)
* Strength of Schedule
* Strength of Victory
* Quadrant 1-4 wins and losses (this is used as part of the criteria for picking tournament teams, would be intersting to see if it actually resulted in picking teams that do better in the tournament)

## Multivariate Linear Regression Model

In [6]:
## set up the model and all the variables:

x = df.select(pl.all().exclude('POSTSEASON_WINS'))
y = df.select('POSTSEASON_WINS')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=84305)

scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

multi_lin_reg_model = LinearRegression()
multi_lin_reg_model.fit(x_train_s, y_train)

y_pred = multi_lin_reg_model.predict(x_test_s)

In [7]:
def model_coefficients(x, model):

    cols = x.columns
    coefs = model.coef_.flatten().tolist()

    df = pl.DataFrame({'Vars':cols, 'Coef': coefs})

    return df

In [10]:
r2 = r2_score(y_test, y_pred)
intercept = multi_lin_reg_model.intercept_

n= len(x_train)
p= len(x.columns)
adj_r2 = 1- ((1-r2) * (n-1)/(n-p-1))

rmse = mean_squared_error(y_test, y_pred)

print(f"R2: {r2}")
print(f"Adj R2: {adj_r2}")
print(f"Intercept: {intercept}")
print(f"RMSE: {rmse}")
model_coefficients(x, multi_lin_reg_model)

R2: 0.4262298601758958
Adj R2: 0.40634181373485234
Intercept: [0.94481605]
RMSE: 0.9209903275505485


Vars,Coef
str,f64
"""G""",-0.218592
"""W""",0.832233
"""ADJOE""",1.858793
"""ADJDE""",-1.692908
"""BARTHAG""",-1.453127
…,…
"""3P_O""",-0.302339
"""3P_D""",0.382467
"""ADJ_T""",-0.023519
"""WAB""",-0.824328


## Based on these results, we can explain about 42% of the variance in post season games won with the variables used in the model. Not too bad for something that should be difficult to predict.

## The RMSE is about 1, so we will be off by about a game on average which isn't terrible

### We can do one more model using the important features we saw in the univariate example and compare to see if they actually do close to as good

### There is definitely too many variables here and we could do some more for feature importance testing. However, the goal for now is just to set things up. Create some models and use many different techniques, so let's do some other models first

In [11]:
## set up the model and all the variables:
x = df.select(['G', 'W', 'ADJDE', 'BARTHAG', 'WAB', 'SEED'])
y = df.select('POSTSEASON_WINS')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=84305)

scaler = StandardScaler()
x_train_s = scaler.fit_transform(x_train)
x_test_s = scaler.transform(x_test)

multi_lin_reg_model = LinearRegression()
multi_lin_reg_model.fit(x_train_s, y_train)

y_pred = multi_lin_reg_model.predict(x_test_s)

In [12]:
r2 = r2_score(y_test, y_pred)
intercept = multi_lin_reg_model.intercept_

n= len(x_train)
p= len(x.columns)
adj_r2 = 1- ((1-r2) * (n-1)/(n-p-1))

print(f"R2: {r2}")
print(f"Adj R2: {adj_r2}")
print(f"Intercept: {intercept}")
model_coefficients(x, multi_lin_reg_model)

R2: 0.2965820405822863
Adj R2: 0.28944074150190346
Intercept: [0.94481605]


Vars,Coef
str,f64
"""G""",0.078634
"""W""",0.445253
"""ADJDE""",-0.061
"""BARTHAG""",0.065414
"""WAB""",-0.388223
"""SEED""",-0.760608


In [None]:
## This model does not do as well