### Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Normalization
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import ReduceLROnPlateau

### Data Input and Processing

In [2]:
# Read in dataframe
df = pd.read_csv('../combined data.csv')
df

Unnamed: 0,game_id,Datetime,Team1 ID,Team2 ID,Team1 Score,Team2 Score,Point Differential,Team1 Home,Team2 Home,Team1 team_id,...,Team2 KenPom Def Eff,Team2 KenPom AdjTempo,Team2 KenPom SOS,Team2 KenPom Off SOS,Team2 KenPom Def SOS,Team2 BT Eff Height,Team2 BT WAB,Team2 BT Power,Team2 BT Experience,Team2 BT Talent
0,514736,11/14/2014 0:00,Alabama A&M_2015,Dayton_2015,52,76,-24,0,1,Alabama A&M_2015,...,95.0,63.4,3.39,105.4,102.0,78.250,0.938,0.798,1.540,46.592
1,514738,11/14/2014 0:00,Alcorn St._2015,California_2015,57,91,-34,0,1,Alcorn St._2015,...,99.0,65.3,5.58,107.8,102.2,80.886,-3.497,0.641,1.637,52.204
2,514740,11/14/2014 0:00,American_2015,Temple_2015,37,40,-3,0,1,American_2015,...,94.2,65.4,2.21,104.8,102.6,80.368,-0.276,0.794,1.941,49.622
3,514742,11/14/2014 0:00,Ball St._2015,Utah_2015,72,90,-18,0,1,Ball St._2015,...,89.2,60.5,8.19,108.8,100.6,81.657,3.720,0.945,1.554,27.543
4,514743,11/14/2014 0:00,Charleston Southern_2015,Mississippi_2015,66,65,1,0,1,Charleston Southern_2015,...,100.6,66.2,7.10,107.3,100.2,80.963,-0.661,0.787,2.280,46.963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52033,5254271,4/2/2024 0:00,Utah_2024,Indiana St._2024,90,100,-10,0,0,Utah_2024,...,103.2,70.9,3.07,107.7,104.7,80.221,1.928,0.852,2.079,0.200
52034,5254269,4/4/2024 0:00,Indiana St._2024,Seton Hall_2024,77,79,-2,0,0,Indiana St._2024,...,98.1,66.8,10.10,111.8,101.7,79.886,0.943,0.856,2.484,62.979
52035,5254134,4/6/2024 0:00,Alabama_2024,Connecticut_2024,72,86,-14,0,0,Alabama_2024,...,91.1,64.6,12.42,113.2,100.8,81.643,11.210,0.980,1.710,58.255
52036,5254135,4/6/2024 0:00,N.C. State_2024,Purdue_2024,50,63,-13,0,0,N.C. State_2024,...,94.6,67.0,14.65,114.4,99.8,83.272,10.620,0.965,1.854,52.251


In [3]:
# Drop metadata
df = df.drop(columns = ['game_id', 'Datetime', 'Team1 ID', 'Team2 ID', 'Team1 Score', 'Team2 Score', 'Team1 team_id', 'Team1 Team1 team_name', 'Team1 year', 'Team2 team_id', 'Team2 team_name', 'Team2 year'])

# Drop NA (some D1->D3 transfers missing kenpom data)
df = df.dropna()
df = df.reset_index(drop=True)

df

Unnamed: 0,Point Differential,Team1 Home,Team2 Home,Team1 Pts/Gm,Team1 Opp. Pts/Gm,Team1 FG%,Team1 Opp. FG%,Team1 2Pt%,Team1 Opp. 2Pt%,Team1 3Pt%,...,Team2 KenPom Def Eff,Team2 KenPom AdjTempo,Team2 KenPom SOS,Team2 KenPom Off SOS,Team2 KenPom Def SOS,Team2 BT Eff Height,Team2 BT WAB,Team2 BT Power,Team2 BT Experience,Team2 BT Talent
0,-24,0,1,61.3,68.7,39.3,45.6,45.4,50.7,29.1,...,95.0,63.4,3.39,105.4,102.0,78.250,0.938,0.798,1.540,46.592
1,-34,0,1,63.4,74.2,38.8,47.2,43.3,51.0,28.4,...,99.0,65.3,5.58,107.8,102.2,80.886,-3.497,0.641,1.637,52.204
2,-3,0,1,58.7,58.6,46.2,45.5,51.7,50.9,37.4,...,94.2,65.4,2.21,104.8,102.6,80.368,-0.276,0.794,1.941,49.622
3,-18,0,1,64.1,69.2,41.6,46.5,47.7,51.8,33.6,...,89.2,60.5,8.19,108.8,100.6,81.657,3.720,0.945,1.554,27.543
4,1,0,1,71.4,70.6,40.6,44.9,46.7,50.5,33.7,...,100.6,66.2,7.10,107.3,100.2,80.963,-0.661,0.787,2.280,46.963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,-10,0,0,78.8,73.2,46.9,42.7,53.6,47.9,37.0,...,103.2,70.9,3.07,107.7,104.7,80.221,1.928,0.852,2.079,0.200
48972,-2,0,0,84.7,73.5,50.4,44.0,62.4,48.8,38.1,...,98.1,66.8,10.10,111.8,101.7,79.886,0.943,0.856,2.484,62.979
48973,-14,0,0,90.1,81.2,47.6,43.7,56.6,50.8,37.3,...,91.1,64.6,12.42,113.2,100.8,81.643,11.210,0.980,1.710,58.255
48974,-13,0,0,75.6,71.8,45.0,43.3,50.2,49.0,34.5,...,94.6,67.0,14.65,114.4,99.8,83.272,10.620,0.965,1.854,52.251


In [4]:
# Split into x & y variables
x = df.drop(columns = 'Point Differential')
y = df['Point Differential']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25)

In [5]:
# Verify x_train looks as expected
x_train.head()

Unnamed: 0,Team1 Home,Team2 Home,Team1 Pts/Gm,Team1 Opp. Pts/Gm,Team1 FG%,Team1 Opp. FG%,Team1 2Pt%,Team1 Opp. 2Pt%,Team1 3Pt%,Team1 Opp. 3Pt%,...,Team2 KenPom Def Eff,Team2 KenPom AdjTempo,Team2 KenPom SOS,Team2 KenPom Off SOS,Team2 KenPom Def SOS,Team2 BT Eff Height,Team2 BT WAB,Team2 BT Power,Team2 BT Experience,Team2 BT Talent
48548,0,1,85.6,79.2,45.8,44.0,51.8,49.5,34.5,32.6,...,107.4,65.7,10.3,112.2,101.9,79.636,-10.622,0.444,1.679,43.652
40621,0,1,63.5,66.0,43.0,44.2,51.2,48.3,31.1,37.4,...,102.6,62.6,5.1,107.5,102.4,81.111,-2.187,0.677,1.798,9.032
6844,0,1,70.7,75.2,41.5,46.1,45.7,51.6,33.8,37.8,...,95.5,67.3,3.51,107.3,103.8,82.216,2.231,0.921,1.788,58.0
29063,0,0,66.5,72.3,40.8,44.9,46.3,50.6,35.2,36.9,...,105.5,67.5,-3.09,102.2,105.3,81.762,-5.213,0.481,2.114,0.2
38275,0,1,68.2,75.6,40.8,44.1,46.5,50.7,32.5,33.6,...,104.6,69.5,9.75,108.8,99.1,80.612,0.355,0.839,2.022,48.548


In [6]:
# Verify y_train looks as expected
y_train.head()

48548    -1
40621   -10
6844    -23
29063     5
38275   -41
Name: Point Differential, dtype: int64

### Model Setup

In [59]:
# Define the model
model = Sequential()
normalizer = Normalization()
normalizer.adapt(x_train) # TODO: Investigate

# Input_shape here is the number of columns in x_train
model.add(normalizer)
model.add(Dropout(0.25))
model.add(Dense(units=128, activation='relu', input_shape=[x_train.shape[1]]))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=1))

model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_13 (Normaliza  (None, 102)              205       
 tion)                                                           
                                                                 
 dropout_13 (Dropout)        (None, 102)               0         
                                                                 
 dense_99 (Dense)            (None, 128)               13184     
                                                                 
 dense_100 (Dense)           (None, 128)               16512     
                                                                 
 dense_101 (Dense)           (None, 128)               16512     
                                                                 
 dense_102 (Dense)           (None, 128)               16512     
                                                     

In [60]:
# Compile the model
model.compile(optimizer=Nadam(learning_rate=1e-2), loss = 'mse')

### Train Model

In [61]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5,min_lr=1e-4)
model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs=100, batch_size=1024, callbacks=[reduce_lr])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1d801293820>

In [62]:
corr = 0
wrong = 0
difs = []
for pred, act in zip(model.predict(x_test), y_test):
    print(pred[0], act)
    difs.append(pred - act)
    if (pred < 0 and act < 0 or pred > 0 and act > 0):
        corr += 1
    else:
        wrong += 1

print(corr, wrong, corr/(wrong + corr))


-14.886955 -3
5.561391 -3
-4.12988 -16
2.136126 -2
-7.2516174 6
-14.077793 -4
1.8660848 10
-0.8580051 13
3.4405062 -6
-1.3970271 -17
0.7439619 1
-6.9637547 -13
2.743891 1
1.2735388 13
-18.473673 -14
-6.7043047 -4
13.423483 5
-21.487673 -9
-2.3129478 8
9.082668 14
-6.9493895 -24
7.0799417 11
1.0139165 4
-18.879688 -23
23.802267 3
-6.2971535 -19
1.1792808 -9
12.2115345 19
0.91997445 10
-3.8806744 17
0.53361315 14
-6.8361197 -3
-10.011553 -1
-10.642559 -27
-9.883662 -38
-7.690694 -19
-1.6898627 -1
-5.5633736 -5
-2.1582437 -6
-5.8739185 -4
-17.357382 -14
3.306604 18
9.558567 5
-24.962996 -20
-15.705024 -12
7.861676 6
-4.6147375 7
-7.2626133 -15
1.5420605 9
-7.7201138 -11
18.115997 28
2.7657564 -3
10.597401 14
-7.8110676 -3
-6.9187126 -22
-5.8844433 11
1.2726543 4
-10.4602165 -14
23.941622 32
8.618936 -18
-0.579834 -6
2.351764 -4
0.6152574 -12
9.865191 21
-5.9547243 -18
-8.330141 -15
1.6974074 26
4.8473473 2
-7.2426844 -9
5.9821005 9
-17.426275 -31
-14.821061 -6
1.0436026 1
12.3139305 9
8.1