In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [2]:
def brier_score(pred_prob, ytrue):
  return np.mean((pred_prob - ytrue)**2)

In [21]:
# Read in data and clean it up for testing (i.e. use season averages)
DATA_PATH = 'drive/MyDrive/march_madness_2023/march-machine-learning-mania-2023/'

df = pd.read_csv(DATA_PATH + 'Mfinal_data_2003-2022.csv')

print(df.columns)
print(df.shape)
df.head()

Index(['Season', 'TeamIDA', 'TeamIDB', 'SeedA', 'SeedB', 'OffEffA', 'DefEffA',
       'EFTA', 'OffEff14A', 'DefEff14A', 'EFT14A', 'WinRatioA', 'GapAvgA',
       'WinRatio14A', 'GapAvg14A', 'AvgRankA', 'OffEffB', 'DefEffB', 'EFTB',
       'OffEff14B', 'DefEff14B', 'EFT14B', 'WinRatioB', 'GapAvgB',
       'WinRatio14B', 'GapAvg14B', 'AvgRankB', 'ScoreDiff', 'WinA'],
      dtype='object')
(2496, 29)


Unnamed: 0,Season,TeamIDA,TeamIDB,SeedA,SeedB,OffEffA,DefEffA,EFTA,OffEff14A,DefEff14A,...,OffEff14B,DefEff14B,EFT14B,WinRatioB,GapAvgB,WinRatio14B,GapAvg14B,AvgRankB,ScoreDiff,WinA
0,2003,1421,1411,16,16,105.315164,115.456256,0.4898,111.471468,103.355312,...,110.312541,102.655767,0.534748,0.6,1.966667,0.833333,5.333333,259.4,8,1
1,2003,1112,1436,1,16,115.232944,95.117513,0.517632,108.373796,101.570461,...,113.140017,94.689354,0.528786,0.655172,4.655172,1.0,11.25,159.6,29,1
2,2003,1113,1272,10,7,113.782349,103.406731,0.517334,116.514025,111.310574,...,114.772311,103.328175,0.520348,0.793103,8.689655,0.75,7.25,21.8,13,1
3,2003,1141,1166,11,6,114.122499,105.421379,0.572835,125.801372,100.232903,...,111.983454,93.372035,0.516155,0.878788,14.909091,1.0,13.0,23.0,6,1
4,2003,1143,1301,8,9,109.611481,102.725168,0.524098,106.408611,105.008149,...,115.09971,115.859787,0.551947,0.6,4.4,0.5,-0.5,45.0,2,1


In [30]:
def run_log_reg(X, Y):
  # Split into training and testing
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

  # Use standard scaler on features
  scaler = StandardScaler()
  scaler.fit(X_train)
  X_train_scaled = scaler.transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  model = LogisticRegression()
  model.fit(X_train_scaled, Y_train)
  print('Training accuracy: {:.4f}'.format(model.score(X_train_scaled, Y_train)))
  print('Testing accuracy: {:.4f}'.format(model.score(X_test_scaled, Y_test)))

  train_prob = model.predict_proba(X_train_scaled)
  test_prob = model.predict_proba(X_test_scaled)

  print('Training Brier Score: {:.4f}'.format(brier_score(train_prob[:,0], Y_train)))
  print('Testing Brier Score: {:.4f}'.format(brier_score(test_prob[:,0], Y_test)))

  return model

### Use Logistic Regression to test which features to use

#### Base Case with all features

In [32]:
# Set up X and Y
to_drop = ['Season', 'TeamIDA', 'TeamIDB', 'ScoreDiff', 'WinA']
X = df.drop(to_drop, axis=1)
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.7104
Testing accuracy: 0.7023
Training Brier Score: 0.4422
Testing Brier Score: 0.4517


#### No last 14 days features

In [33]:
# Set up X and Y
to_drop = [col for col in df.columns if '14' in col]
to_drop += ['Season', 'TeamIDA', 'TeamIDB', 'ScoreDiff', 'WinA']
X = df.drop(to_drop, axis=1)
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.7104
Testing accuracy: 0.7089
Training Brier Score: 0.4507
Testing Brier Score: 0.4385


#### No season averages, only last 14 days

In [34]:
# Set up X and Y
to_drop = [col for col in df.columns if '14' not in col]
[to_drop.remove(col) for col in ['SeedA', 'SeedB', 'AvgRankA', 'AvgRankB']]
X = df.drop(to_drop, axis=1)
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.7069
Testing accuracy: 0.6969
Training Brier Score: 0.4377
Testing Brier Score: 0.4328


#### Differences instead of both values for A and B

In [41]:
# Set up X and Y
to_drop = [col for col in df.columns if '14' in col]
to_drop += ['Season', 'TeamIDA', 'TeamIDB', 'ScoreDiff', 'WinA']
X = df.drop(to_drop, axis=1)
Y = df['WinA']

cols_to_diff = [
    'Seed', 'WinRatio', 'GapAvg', 'OffEff', 'DefEff', 'EFT', 'AvgRank'
]

for col in cols_to_diff:
    X[col + 'Diff'] = df[col + 'A'] - df[col + 'B']
    X.drop([col+'A', col+'B'], axis=1, inplace=True)

model = run_log_reg(X, Y)

Training accuracy: 0.7201
Testing accuracy: 0.6889
Training Brier Score: 0.4549
Testing Brier Score: 0.4327


#### Seeds only

In [43]:
features = ['SeedA', 'SeedB']
X = df[features]
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.6995
Testing accuracy: 0.6969
Training Brier Score: 0.4297
Testing Brier Score: 0.4151


#### Average Massey Rankings only

In [44]:
features = ['AvgRankA', 'AvgRankB']
X = df[features]
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.7132
Testing accuracy: 0.7143
Training Brier Score: 0.4213
Testing Brier Score: 0.4288


#### GapAvg only


In [45]:
features = ['GapAvgA', 'GapAvgB']
X = df[features]
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.6440
Testing accuracy: 0.6756
Training Brier Score: 0.3639
Testing Brier Score: 0.3784


#### WinRatio only

In [46]:
features = ['WinRatioA', 'WinRatioB']
X = df[features]
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.6188
Testing accuracy: 0.6195
Training Brier Score: 0.3353
Testing Brier Score: 0.3343


#### No Advanced Metrics

In [48]:
features = ['SeedA', 'SeedB', 'AvgRankA', 'AvgRankB', 'GapAvgA', 'GapAvgB', 'WinRatioA', 'WinRatioB']
X = df[features]
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.7178
Testing accuracy: 0.7076
Training Brier Score: 0.4442
Testing Brier Score: 0.4404


#### WinRatio last 14 days only

In [49]:
features = ['WinRatio14A', 'WinRatio14B']
X = df[features]
Y = df['WinA']

model = run_log_reg(X, Y)

Training accuracy: 0.5426
Testing accuracy: 0.5327
Training Brier Score: 0.2544
Testing Brier Score: 0.2517
