In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
def brier_score(pred_prob, ytrue):
  """Return the Brier score: the mean squared gap between predictions and outcomes.

  Both inputs are converted to float NumPy arrays first, so plain lists,
  boolean arrays and pandas Series are all accepted. Values are compared
  positionally (any pandas index is ignored).

  Args:
    pred_prob: array-like of predicted probabilities.
    ytrue: array-like of observed outcomes; must be broadcastable against
      `pred_prob`.

  Returns:
    The mean of (pred_prob - ytrue)**2 as a float.
  """
  # Coercing to float avoids `list - list` TypeErrors and NumPy's refusal to
  # subtract boolean arrays.
  pred_prob = np.asarray(pred_prob, dtype=float)
  ytrue = np.asarray(ytrue, dtype=float)
  return np.mean((pred_prob - ytrue)**2)

### Test how to predict probabilities with regression

Use linear regression to test two ways of predicting win probabilities:

1) Regress on a "probability" target that is 1 when team A wins and 0 when team A loses

2) Predict ScoreDiff so that positive --> A wins and negative --> B wins

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Load the 2003-2022 matchup table, then discard every column whose name
# contains '14' so that only the season-average features remain for testing.
# NOTE(review): the '14' columns are presumably the short-window variants — confirm.
DATA_PATH = 'drive/MyDrive/march_madness_2023/march-machine-learning-mania-2023/'

df = pd.read_csv(DATA_PATH + 'Mfinal_data_2003-2022.csv')
to_drop = list(filter(lambda name: '14' in name, df.columns))
df = df.drop(columns=to_drop)

print(df.shape)
df.head()

(2496, 19)


Unnamed: 0,Season,TeamIDA,TeamIDB,SeedA,SeedB,OffEffA,DefEffA,EFTA,WinRatioA,GapAvgA,AvgRankA,OffEffB,DefEffB,EFTB,WinRatioB,GapAvgB,AvgRankB,ScoreDiff,WinA
0,2003,1421,1411,16,16,105.315164,115.456256,0.4898,0.448276,-7.241379,259.2,107.187241,105.071425,0.503036,0.6,1.966667,259.4,8,1
1,2003,1112,1436,1,16,115.232944,95.117513,0.517632,0.892857,14.964286,2.6,106.911164,99.247639,0.494732,0.655172,4.655172,159.6,29,1
2,2003,1113,1272,10,7,113.782349,103.406731,0.517334,0.62069,6.793103,33.6,109.889666,97.004974,0.498337,0.793103,8.689655,21.8,13,1
3,2003,1141,1166,11,6,114.122499,105.421379,0.572835,0.793103,6.103448,43.4,118.566346,96.739284,0.567455,0.878788,14.909091,23.0,6,1
4,2003,1143,1301,8,9,109.611481,102.725168,0.524098,0.724138,4.724138,36.2,111.669631,105.001178,0.534189,0.6,4.4,45.0,2,1


#### 1) "Probability" regression

In [None]:
# Set up X and Y: features are every column except the identifiers and the two
# outcome columns; the target for this experiment is the WinA column.
to_drop = ['Season', 'TeamIDA', 'TeamIDB', 'ScoreDiff', 'WinA']
X = df.drop(to_drop, axis=1)
Y = df['WinA']
print(X.shape)
print(Y.shape)

# Split into training and testing.
# A fixed random_state keeps the split, and therefore every score reported
# below, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Use standard scaler on features. The scaler is fitted on the training split
# only and then applied to the test split, so no test statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(2496, 14)
(2496,)


In [None]:
# Fit ordinary least squares on the scaled training features, then treat the
# raw regression outputs as "probabilities" for both splits.
reg = LinearRegression().fit(X_train_scaled, Y_train)
train_prob, test_prob = (
    reg.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Score each split with the Brier score (mean squared error against the target).
train_brier = brier_score(train_prob, Y_train)
test_brier = brier_score(test_prob, Y_test)
print('Training Brier Score: {:.4f}'.format(train_brier))
print('Testing Brier Score: {:.4f}'.format(test_brier))

Training Brier Score: 0.1890
Testing Brier Score: 0.1871


In [None]:
# Summarise the range of the raw regression outputs on the test split; a linear
# model is unbounded, so this shows whether values stray outside [0, 1].
prob_mean = test_prob.mean()
prob_min = test_prob.min()
prob_max = test_prob.max()

print("Testing 'probabilities':")
print('\tmean:', prob_mean)
print('\tmin:', prob_min)
print('\tmax:', prob_max)

Testing 'probabilities':
	mean: 0.49856918145363666
	min: -0.1293127434493928
	max: 1.1974845308824646


#### 2) ScoreDiff regression

In [None]:
# Set up X and Y: same feature set as before (identifiers and outcome columns
# removed), but the target is now the ScoreDiff column.
to_drop = ['Season', 'TeamIDA', 'TeamIDB', 'ScoreDiff', 'WinA']
X = df.drop(to_drop, axis=1)
Y = df['ScoreDiff']
print(X.shape)
print(Y.shape)

# Split into training and testing.
# A fixed random_state keeps the split, and therefore every score reported
# below, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Use standard scaler on features. The scaler is fitted on the training split
# only and then applied to the test split, so no test statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(2496, 14)
(2496,)


In [None]:
# Fit a linear model that predicts the score differential directly.
reg = LinearRegression()
reg.fit(X_train_scaled, Y_train)
train_score_diff = reg.predict(X_train_scaled)
test_score_diff = reg.predict(X_test_scaled)

# A positive predicted differential means "team A wins", so threshold at zero
# to obtain hard 0/1 win predictions.
train_pred = (train_score_diff > 0).astype(int)
test_pred = (test_score_diff > 0).astype(int)

# Y_train / Y_test hold actual score differentials, not outcomes. Convert them
# to the same 0/1 "A wins" encoding before scoring; comparing 0/1 predictions
# against raw point margins produces a meaningless value in the hundreds.
train_win = (Y_train > 0).astype(int)
test_win = (Y_test > 0).astype(int)
print('Training Brier Score: {:.4f}'.format(brier_score(train_pred, train_win)))
print('Testing Brier Score: {:.4f}'.format(brier_score(test_pred, test_win)))

Training Brier Score: 206.7871
Testing Brier Score: 188.4179


In [None]:
# Summarise the predicted score differentials on the test split.
# Uses test_score_diff (this section's predictions) rather than test_prob,
# which still holds the section-1 "probability" outputs on a fresh run.
print("Testing 'score differential':")
print('\tmean:', test_score_diff.mean())
print('\tmin:', test_score_diff.min())
print('\tmax:', test_score_diff.max())

Testing 'score differential':
	mean: -0.3163282978447495
	min: -36.37099714751128
	max: 35.48953610906112
