# Linear Regression to get Relative Metric Weights


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import altair as alt
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Select Altair's 'notebook' renderer for any charts displayed below
alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

Read in game data

In [2]:
# Load the regular-season detailed results; the path is relative, so it
# resolves against the kernel's working directory.
raw_data = pd.read_csv('../data/Stage2DataFiles/RegularSeasonDetailedResults.csv')


Let's make a copy of the raw data so we can always come back and rerun from here

In [3]:
# Work on a copy so raw_data stays untouched and this cell can be re-run
# to reset df without re-reading the CSV.
df = raw_data.copy()
print(f"df columns: {df.columns}")
print(f"df shape: {df.shape}")

df columns: Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')
df shape: (87504, 34)


We have 87504 games of detailed game data. Let's rename some columns to match the naming conventions used elsewhere.

In [4]:
# Raw per-team stat suffix -> project suffix. In the raw file each suffix is
# prefixed with 'W' (winner) or 'L' (loser); they become 'Tm' and 'Opp'.
# Suffixes that are not present in the frame are simply ignored by rename().
stat_suffixes = {
    'TeamID': 'ID',
    'Score': 'PF',
    'FGM': 'FGM',
    'FGA': 'FGA',
    'FGM2': 'FG2M',
    'FGA2': 'FG2A',
    'FGM3': 'FG3M',
    'FGA3': 'FG3A',
    'FTM': 'FTM',
    'FTA': 'FTA',
    'OR': 'ORB',
    'DR': 'DRB',
    'TRB': 'TRB',
    'Ast': 'Ast',
    'Stl': 'Stl',
    'Blk': 'Blk',
    'TO': 'TO',
    'PF': 'Foul',
    'Loc': 'Loc',
}

# Game-level columns that carry no W/L prefix
column_map = {'GameDate': 'GameDate', 'NumOT': 'GameOT'}
for raw_prefix, new_prefix in (('W', 'Tm'), ('L', 'Opp')):
    for raw_suffix, new_suffix in stat_suffixes.items():
        column_map[raw_prefix + raw_suffix] = new_prefix + new_suffix

df = df.rename(columns=column_map)
print(f"df columns: {df.columns}")
print(f"df shape: {df.shape}")


df columns: Index(['Season', 'DayNum', 'TmID', 'TmPF', 'OppID', 'OppPF', 'TmLoc', 'GameOT',
       'TmFGM', 'TmFGA', 'TmFG3M', 'TmFG3A', 'TmFTM', 'TmFTA', 'TmORB',
       'TmDRB', 'TmAst', 'TmTO', 'TmStl', 'TmBlk', 'TmFoul', 'OppFGM',
       'OppFGA', 'OppFG3M', 'OppFG3A', 'OppFTM', 'OppFTA', 'OppORB', 'OppDRB',
       'OppAst', 'OppTO', 'OppStl', 'OppBlk', 'OppFoul'],
      dtype='object')
df shape: (87504, 34)


Let's copy the dataframe and get the loser's perspective

In [5]:
# Build the mirrored (loser's-perspective) rows: copy the games, swap the
# Tm/Opp column prefixes, flip the game location, and stack the result under df.
ldf = df.copy()
newnames = pd.DataFrame(list(ldf), columns=['OldName'])
newnames['NewName'] = newnames['OldName']
# Both masks test OldName (not NewName), so the second swap cannot undo the first.
newnames.loc[newnames['OldName'].str[0:3] == 'Opp', 'NewName'] = 'Tm' + newnames['OldName'].str[3:]
newnames.loc[newnames['OldName'].str[0:2] == 'Tm', 'NewName'] = 'Opp' + newnames['OldName'].str[2:]
newnames = newnames.set_index('OldName')['NewName']
ldf = ldf.rename(columns=newnames)
# After the swap, the original team's location lives in OppLoc; the mirrored
# team's location is its opposite (neutral stays neutral).
ldf['TmLoc'] = 'N'
ldf.loc[ldf['OppLoc'] == 'H', 'TmLoc'] = 'A'
ldf.loc[ldf['OppLoc'] == 'A', 'TmLoc'] = 'H'
del ldf['OppLoc']
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. Columns are sorted so the layout matches what append produced
# for these two differently-ordered frames.
df = pd.concat([df, ldf], sort=False).sort_index(axis=1)
del ldf, newnames
print(f"df columns: {df.columns}")
print(f"df shape: {df.shape}")

df columns: Index(['DayNum', 'GameOT', 'OppAst', 'OppBlk', 'OppDRB', 'OppFG3A', 'OppFG3M',
       'OppFGA', 'OppFGM', 'OppFTA', 'OppFTM', 'OppFoul', 'OppID', 'OppORB',
       'OppPF', 'OppStl', 'OppTO', 'Season', 'TmAst', 'TmBlk', 'TmDRB',
       'TmFG3A', 'TmFG3M', 'TmFGA', 'TmFGM', 'TmFTA', 'TmFTM', 'TmFoul',
       'TmID', 'TmLoc', 'TmORB', 'TmPF', 'TmStl', 'TmTO'],
      dtype='object')
df shape: (175008, 34)


Now let's add the results fields and any other derived fields, then remove all Opp columns, since we only consider a team's own stats when looking at player stat contributions. We also remove the fields that are not used.

In [6]:
# Derived features, currently disabled:
# df['TmFG2A'] = df['TmFGA'] - df['TmFG3A']
# df['TmFG2M'] = df['TmFGM'] - df['TmFG3M']
# df['TmTRB'] = df['TmORB'] + df['TmDRB']
# df['TmFGPct'] = df['TmFGM'] / df['TmFGA']
# df['TmFG3Pct'] = df['TmFG3M'] / df['TmFG3A']
# df['TmFG2Pct'] = df['TmFG2M'] / df['TmFG2A']
# df['TmFTPct'] = df['TmFTM'] / df['TmFTA']

# Calculate game margin (needs OppPF, so it must run before the Opp columns go)
df['TmMargin'] = df['TmPF'] - df['OppPF']

# Remove every opponent stat and the unused bookkeeping fields in a single
# drop, instead of deleting columns one by one while looping over df.columns.
opp_cols = [col for col in df.columns if col[0:3] == 'Opp']
unused_cols = ['DayNum', 'Season', 'GameOT', 'TmID', 'TmLoc']
df = df.drop(columns=opp_cols + unused_cols)

print(f"df columns: {df.columns}")
print(f"df shape: {df.shape}")

df columns: Index(['TmAst', 'TmBlk', 'TmDRB', 'TmFG3A', 'TmFG3M', 'TmFGA', 'TmFGM',
       'TmFTA', 'TmFTM', 'TmFoul', 'TmORB', 'TmPF', 'TmStl', 'TmTO',
       'TmMargin'],
      dtype='object')
df shape: (175008, 15)


In [7]:
# Summary statistics of the regression target
df[['TmMargin']].describe()

Unnamed: 0,TmMargin
count,175008.0
mean,0.0
std,15.112976
min,-75.0
25%,-10.0
50%,0.0
75%,10.0
max,75.0


As expected, the margin is centered around 0, with symmetric quantiles and min/max.

Let's build a function that will take any number of predictors and make a linear regression from it, and output some key metrics

In [8]:
def run_regression(data, predictor_cols, y_col = 'TmMargin'):
    """Fit a cross-validated elastic-net regression and print its key metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the predictor columns and the target column.
    predictor_cols : list of str
        Names of the columns in ``data`` to use as predictors.
    y_col : str, default 'TmMargin'
        Name of the target column in ``data``.

    Returns
    -------
    ElasticNetCV
        The fitted estimator.

    Notes
    -----
    The printed score is computed on the same rows the model was fitted on,
    so it is an in-sample figure, not a held-out estimate.
    """
    # Read from the `data` argument rather than the notebook-level `df`, so the
    # function really fits whatever frame the caller passes in.
    X = data[predictor_cols]
    # Pass the target as a 1-D Series; a single-column frame makes scikit-learn
    # emit a column-vector conversion warning before flattening it anyway.
    Y = data[y_col]

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2. It is kept so results are unchanged on the version this notebook
    # was written for; on newer versions scale the features explicitly instead.
    regressor = ElasticNetCV(
        random_state=92,
        cv=10,
        l1_ratio=[.01, .05, .1, .2, .3, .4, .5, .7, .9, .95, .99, 1],
        fit_intercept=True,
        normalize=True,
        verbose=True,
        n_jobs=-1,
    )
    regressor.fit(X, Y)  # training the algorithm

    print(f"alpha_: {regressor.alpha_}")
    print(f"intercept_: {regressor.intercept_}")
    print(f"score: {regressor.score(X,Y)}")
    print(f"l1_ratio_: {regressor.l1_ratio_}")
    print("COEFFICIENTS")
    for a, b in zip(predictor_cols,regressor.coef_):
        print(f"{a}: {b}")

    return regressor

A quick test run of our function

In [9]:
# Smoke test: fit with the team's own score as the only predictor
out = run_regression(df, ['TmPF'])


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Let's expand to the remaining columns in the dataframe and see what we get!

In [10]:
# Use every remaining column except the target itself as a predictor
out = run_regression(df,[x for x in df.columns if x != 'TmMargin'])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
.......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................