In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from keras import layers, callbacks, losses, regularizers, Input, Model
from keras.initializers.initializers_v2 import LecunNormal
import random
import os
import matplotlib.pyplot as plt
from keras.optimizer_v2.adam import Adam # Machine learning

# Pin the hash seed variable and the stdlib / NumPy / TensorFlow generators to 0
# so that repeated runs of the notebook are reproducible.
os.environ['PYTHONHASHSEED'] = str(0)
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(0)

# Run configuration.
start_year = 1985   # first season included in the rating history
max_days = 155      # size of the day axis of the per-season history arrays
sex = "men"         # selects the competition folder and the file-name prefix

# The data files are prefixed with the upper-cased first letter of `sex`.
_file_prefix = sex[0].upper()

# games1: regular-season results, games2: NCAA tournament results.
games1 = pd.read_csv(
    "../input/{}s-march-mania-2022/{}DataFiles_Stage2/{}RegularSeasonCompactResults.csv".format(sex, _file_prefix, _file_prefix)
)
games2 = pd.read_csv(
    "../input/{}s-march-mania-2022/{}DataFiles_Stage2/{}NCAATourneyCompactResults.csv".format(sex, _file_prefix, _file_prefix)
)

# Plain NumPy views of both tables for row-wise processing further down.
games1_np, games2_np = games1.to_numpy(), games2.to_numpy()

In [None]:
# Every team id that appears as a winner or loser in either results table
# (np.unique returns them sorted and de-duplicated).
allTeams = np.unique(np.concatenate([games1['WTeamID'], games1['LTeamID'], games2['WTeamID'], games2['LTeamID']]))

tcount = len(allTeams)

# Lookup table from raw team id to a dense index in [0, tcount).
# Ids that never appear in the data keep the sentinel value -1.
tidx = np.full(np.max(allTeams) + 1, -1, dtype=int)
tidx[allTeams] = np.arange(tcount)

# Season column of each table, cast once (instead of once per loop iteration)
# and in the same way for both tables.
_regular_seasons = games1_np[:, 0].astype(int)
_tourney_seasons = games2_np[:, 0].astype(int)

# Build the combined game list season by season: within each season the
# regular-season rows come first, followed by that season's tournament rows.
# The chunks are gathered in a list and concatenated once, so the accumulated
# array is not re-copied on every iteration. The leading empty array keeps the
# result well-defined (and of the same dtype as before) even if no season matches.
_season_chunks = [np.zeros((0, games1_np.shape[1]))]

for season in range(start_year, 2023):
    _season_chunks.append(games1_np[_regular_seasons == season])
    _season_chunks.append(games2_np[_tourney_seasons == season])

games_np = np.concatenate(_season_chunks)

In [None]:
def elo_E(a, b):
    """Return the expected score of rating `a` against rating `b`.

    Uses the logistic Elo curve on a 400-point scale: equal ratings give 0.5,
    and the value approaches 1 as `a` exceeds `b`.
    """
    rating_gap = (b - a) / 400
    return 1 / (1 + 10 ** rating_gap)

def elo_delta(expected, result, k):
    """Return the rating change for one game.

    The change is the surprise (`result` minus `expected`) scaled by the
    K-factor `k`; it is positive when the result beat the expectation.
    """
    surprise = result - expected
    return surprise * k

def elo_update(a, b, result, k):
    """Return the rating changes `(delta_a, delta_b)` for a game between `a` and `b`.

    `result` is the score achieved by side `a` (1 for a win, or a fractional
    score); side `b` is credited with `1 - result`. `k` is the K-factor
    applied to both sides.
    """
    expected_a = elo_E(a, b)
    expected_b = elo_E(b, a)

    return elo_delta(expected_a, result, k), elo_delta(expected_b, 1 - result, k)

In [None]:
# Per-game training samples, filled in chronological order by the loop below.
data_x = []     # static rating features of both teams (stronger-rated team first)
data_xh1 = []   # recent rating-change sequence of the first-listed team
data_xh2 = []   # recent rating-change sequence of the second-listed team
data_xe = []    # extra features: games played by each team, season, day, tournament flag
data_y = []     # [first-listed team won, first-listed team's score, other team's score]

# K-factors of the Elo ratings that are tracked in parallel.
ks = [10, 15, 20, 30, 50]

# Numeric encoding of the game-location codes (kept for reference; the location
# is currently not used as a feature).
loc_map = {'H' : 1, 'N' : 0, 'A' : -1}

# Number of rating-change steps in each team's history sequence.
seq_len = 10

# Number of seasons covered by the game list (38 for 1985..2022).
n_seasons = int(games_np[:, 0].max()) - start_year + 1

# Per-season, per-day snapshots taken after each game. Only entries with
# hist_count > 0 are meaningful; the arrays are zero-initialised so that no
# uninitialised memory can ever be read.
hist_elos = np.zeros((n_seasons, max_days, tcount, len(ks)))
hist_score_elos = np.zeros((n_seasons, max_days, tcount, len(ks)))
hist_total = np.zeros((n_seasons, max_days, tcount))
hist_count = np.zeros((n_seasons, max_days, tcount))

# Running state: win/loss Elo and score-share Elo per K-factor (both start at
# 1500) and the number of games each team has played so far.
team_elos = np.zeros((tcount, len(ks))) + 1500
team_score_elos = np.zeros((tcount, len(ks))) + 1500
team_total = np.zeros(tcount)


def _team_snapshot(season, day, team):
    """Build one team's pre-game features from the current rating state.

    Reads the module-level rating/history arrays, so it must be called before
    the game's rating updates are applied.

    Returns a tuple `(static, seq)`:
      * static: current Elo, current score Elo, and the mean of each over the
        team's game days in the 30 days before `day` (falling back to the
        current ratings when the team has no game in that window).
      * seq: array of shape (seq_len, 2 * len(ks)) holding the most recent
        day-to-day rating changes within that window, right-aligned and
        zero-padded at the front.
    """
    window = slice(max(day - 30, 0), day)
    played = hist_count[season, window, team] > 0
    elo_hist = hist_elos[season, window, team][played]
    score_hist = hist_score_elos[season, window, team][played]

    # Only average when there is something to average; np.mean over an empty
    # selection would return NaN and raise a RuntimeWarning.
    if played.any():
        recent = [np.mean(elo_hist, axis=0), np.mean(score_hist, axis=0)]
    else:
        recent = [team_elos[team], team_score_elos[team]]
    static = np.concatenate([team_elos[team], team_score_elos[team]] + recent)

    # Differences between consecutive recorded game days, newest last.
    deltas = np.diff(np.concatenate([elo_hist, score_hist], axis=1), axis=0)[-seq_len:]
    seq = np.zeros((seq_len, len(ks) * 2))
    if len(deltas) > 0:
        seq[-len(deltas):] = deltas

    return static, seq


for i in range(len(games_np)):
    # Assumes the CompactResults column order:
    # Season, DayNum, WTeamID, WScore, LTeamID, LScore, WLoc, ... (TODO confirm)
    season = games_np[i, 0] - start_year
    day = games_np[i, 1]

    # t1 is treated as the winner: its win/loss Elo update uses result = 1.
    t1 = tidx[games_np[i, 2]]
    t2 = tidx[games_np[i, 4]]

    t1_score = games_np[i, 3]
    t2_score = games_np[i, 5]

    total_score = t1_score + t2_score

    # Rating changes caused by this game, one entry per K-factor. They are
    # applied only after the features have been recorded, so every sample is
    # built from pre-game information.
    t1_delta = np.zeros(len(ks))
    t2_delta = np.zeros(len(ks))
    t1_score_delta = np.zeros(len(ks))
    t2_score_delta = np.zeros(len(ks))

    for m, k in enumerate(ks):
        t1_delta[m], t2_delta[m] = elo_update(team_elos[t1, m], team_elos[t2, m], 1, k)
        # The score Elo uses t1's share of the total points as the result.
        t1_score_delta[m], t2_score_delta[m] = elo_update(team_score_elos[t1, m], team_score_elos[t2, m], t1_score / total_score, k)

    data_t1, seq_t1 = _team_snapshot(season, day, t1)
    data_t2, seq_t2 = _team_snapshot(season, day, t2)

    is_tournament = 1 if day > 132 else 0

    # The team with the larger summed rating is always listed first; the label
    # says whether that first-listed team won.
    if np.sum(team_elos[t1] + team_score_elos[t1]) > np.sum(team_elos[t2] + team_score_elos[t2]):
        data_x.append(np.concatenate([data_t1, data_t2]))
        data_xh1.append(seq_t1)
        data_xh2.append(seq_t2)
        data_xe.append([np.log(team_total[t1] + 1) / 6, np.log(team_total[t2] + 1) / 6, season / 20, day/max_days, is_tournament])
        data_y.append([1, t1_score, t2_score])
    else:
        data_x.append(np.concatenate([data_t2, data_t1]))
        data_xh1.append(seq_t2)
        data_xh2.append(seq_t1)
        data_xe.append([np.log(team_total[t2] + 1) / 6, np.log(team_total[t1] + 1) / 6, season / 20, day/max_days, is_tournament])
        data_y.append([0, t2_score, t1_score])

    # Apply the game's outcome to the running state ...
    team_elos[t1] += t1_delta
    team_score_elos[t1] += t1_score_delta

    team_elos[t2] += t2_delta
    team_score_elos[t2] += t2_score_delta

    team_total[t1] += 1
    team_total[t2] += 1

    # ... and store the post-game snapshot for both teams on this day.
    for team in (t1, t2):
        hist_elos[season, day, team] = team_elos[team].copy()
        hist_score_elos[season, day, team] = team_score_elos[team].copy()
        hist_total[season, day, team] = team_total[team]
        hist_count[season, day, team] += 1

data_x = np.array(data_x)
data_xh1 = np.array(data_xh1)
data_xh2 = np.array(data_xh2)

data_xe = np.array(data_xe)
data_y = np.array(data_y)

In [None]:
# Number of static feature columns per team (first team's block comes first).
_team_cols = len(ks)*4

# Keep the first team's columns and replace the second team's columns with the
# difference (first team - second team).
_first_team = data_x[:, :_team_cols]
_second_team = data_x[:, _team_cols:]
data_x_norm = np.concatenate([_first_team, _first_team - _second_team], axis=1)

# Column-wise standardisation statistics; reused later to scale the test features.
x_mean = np.mean(data_x_norm, axis=0)
x_std = np.std(data_x_norm, axis=0)

# One scale per sequence feature, pooled over both teams' sequences and all time steps.
xh_std = np.std(np.concatenate([data_xh1, data_xh2]), axis=(0, 1))

# Static features are centred and scaled; the sequences are only scaled.
data_x_norm = (data_x_norm - x_mean) / x_std

data_xh1_norm = data_xh1 / xh_std
data_xh2_norm = data_xh2 / xh_std

In [None]:
# Sanity check: histogram (100 bins) of all values in each model input, one figure per input.
for _values in (data_x_norm, data_xh1_norm, data_xh2_norm, data_xe):
    plt.hist(_values.ravel(), 100)
    plt.show()

# Raw (unscaled) history sequences of the last ten samples.
print(data_xh1[-10:])


In [None]:
def get_model(seed=0):
    """Build and compile the win-probability network.

    Inputs, in order: the static rating features, the two teams' rating-change
    sequences (encoded by one shared LSTM), and the five extra features. The
    concatenated features pass through two ELU dense layers into a single
    sigmoid unit. The model is compiled with binary cross-entropy and Adam
    at a learning rate of 1e-4.

    `seed` is passed to `tf.random.set_seed` before any layer is created.
    """
    tf.random.set_seed(seed)

    n_ratings = len(ks)

    input_x = keras.layers.Input(shape=(n_ratings * 8,))
    input_xh1 = keras.layers.Input(shape=(seq_len, n_ratings * 2))
    input_xh2 = keras.layers.Input(shape=(seq_len, n_ratings * 2))
    input_xe = keras.layers.Input(shape=(5,))

    # A single LSTM instance is applied to both sequences, so both teams share its weights.
    lstm = keras.layers.LSTM(64)
    encoded_h1 = lstm(input_xh1)
    encoded_h2 = lstm(input_xh2)
    features = keras.layers.Concatenate()([input_x, encoded_h1, encoded_h2, input_xe])

    hidden = features
    for units in (256, 64):
        hidden = keras.layers.Dense(units, activation='elu')(hidden)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = keras.Model([input_x, input_xh1, input_xh2, input_xe], output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-4))

    return model

# Re-seed so the shuffle below is the same no matter which cells ran before.
np.random.seed(0)

# One shared random permutation keeps all inputs and labels aligned.
order = np.arange(len(data_x_norm))
np.random.shuffle(order)

data_x_fit, data_xh1_fit, data_xh2_fit, data_xe_fit, data_y_fit = (
    arr[order]
    for arr in (data_x_norm, data_xh1_norm, data_xh2_norm, data_xe, data_y)
)

model = get_model()

# Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
    restore_best_weights=True,
)

# Column 0 of the labels is the binary "first-listed team won" target.
# validation_split holds out the trailing 10% of the (already shuffled) samples.
model.fit(
    x=[data_x_fit, data_xh1_fit, data_xh2_fit, data_xe_fit],
    y=data_y_fit[:, 0],
    batch_size=1024,
    epochs=200,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
submission = pd.read_csv("../input/{}s-march-mania-2022/{}DataFiles_Stage2/{}SampleSubmissionStage2.csv".format(sex, sex[0].upper(), sex[0].upper()))

ids = submission['ID']

# Day number assumed for every predicted matchup. It defines the end of the
# trailing 30-day history window and is the day feature fed to the model.
# (Previously the window relied on a `day` variable left over from the
# training loop, which did not match the day feature.)
pred_day = 134

# Ratings are read from each team's last recorded game on or before this day.
last_rating_day = 132

test_x = np.zeros((len(ids), len(ks)*8))
test_xh1 = np.zeros((len(ids), seq_len, len(ks)*2))
test_xh2 = np.zeros((len(ids), seq_len, len(ks)*2))
test_xe = np.zeros((len(ids), 5))

# True where the second team in the ID is rated higher and therefore had to be
# fed to the model as the first team.
switched = np.zeros(len(ids), dtype=bool)


def _last_game_day(season, team):
    """Return the latest day <= last_rating_day on which `team` has a recorded game in `season`."""
    played_days = np.flatnonzero(hist_count[season, :last_rating_day + 1, team])
    if played_days.size == 0:
        raise ValueError("team index {} has no game up to day {} in season index {}".format(team, last_rating_day, season))
    return int(played_days[-1])


def _matchup_team_features(season, team, last_day):
    """Build one team's model inputs for a matchup on `pred_day`.

    Returns `(static, seq, games_played)`:
      * static: the ratings stored at `last_day`, followed by their mean over
        the team's game days in the 30 days before `pred_day` (or the
        `last_day` ratings again when there is no game in that window).
      * seq: the most recent day-to-day rating changes within that window,
        right-aligned and zero-padded to `seq_len` rows.
      * games_played: the game counter stored at `last_day`.
    """
    window = slice(max(pred_day - 30, 0), pred_day)
    played = hist_count[season, window, team] > 0
    elo_hist = hist_elos[season, window, team][played]
    score_hist = hist_score_elos[season, window, team][played]

    current = [hist_elos[season, last_day, team], hist_score_elos[season, last_day, team]]
    # Only average when there is something to average; np.mean over an empty
    # selection would return NaN and raise a RuntimeWarning.
    if played.any():
        recent = [np.mean(elo_hist, axis=0), np.mean(score_hist, axis=0)]
    else:
        recent = current
    static = np.concatenate(current + recent)

    deltas = np.diff(np.concatenate([elo_hist, score_hist], axis=1), axis=0)[-seq_len:]
    seq = np.zeros((seq_len, len(ks)*2))
    if len(deltas) > 0:
        seq[-len(deltas):] = deltas

    return static, seq, hist_total[season, last_day, team]


for i, v in enumerate(ids):
    # IDs have the form "<season>_<team id>_<team id>".
    ids_split = v.split('_')

    season = int(ids_split[0]) - start_year
    t1 = tidx[int(ids_split[1])]
    t2 = tidx[int(ids_split[2])]

    # -1 marks a team id without any game in the data; indexing with it would
    # silently pick up the last team's ratings.
    if t1 < 0 or t2 < 0:
        raise ValueError("submission ID {} references a team without game history".format(v))

    # t1 stays the first team of the ID: the prediction written to the file is
    # the probability that this team wins. No reordering by team id is done
    # here, because that would flip the meaning without being undone below.
    lg1 = _last_game_day(season, t1)
    lg2 = _last_game_day(season, t2)

    elo_sum_t1 = np.sum(hist_elos[season, lg1, t1] + hist_score_elos[season, lg1, t1])
    elo_sum_t2 = np.sum(hist_elos[season, lg2, t2] + hist_score_elos[season, lg2, t2])

    # The model was trained with the higher-rated team listed first, so swap
    # when needed and remember to invert the prediction afterwards.
    if elo_sum_t1 < elo_sum_t2:
        switched[i] = True
        t1, t2 = t2, t1
        lg1, lg2 = lg2, lg1

    data_t1, seq_t1, total_t1 = _matchup_team_features(season, t1, lg1)
    data_t2, seq_t2, total_t2 = _matchup_team_features(season, t2, lg2)

    test_x[i] = np.concatenate([data_t1, data_t2])
    test_xh1[i] = seq_t1
    test_xh2[i] = seq_t2
    test_xe[i] = [np.log(total_t1 + 1) / 6, np.log(total_t2 + 1) / 6, season / 20, pred_day / max_days, 1]

# Same transformation as for the training features: the second team's columns
# become (first - second), then everything is scaled with the training statistics.
test_x[:, len(ks)*4:] = test_x[:, :len(ks)*4] - test_x[:, len(ks)*4:]
test_x -= x_mean
test_x /= x_std

test_xh1 /= xh_std
test_xh2 /= xh_std

# Flatten the model output to one value per row before touching it.
preds = model.predict([test_x, test_xh1, test_xh2, test_xe], batch_size=2048, verbose=1).reshape(-1)

# Convert "first-listed (higher-rated) team wins" back to "first team of the ID wins".
preds[switched] = 1 - preds[switched]

submission['Pred'] = preds

submission.to_csv('submission.csv', index = False)
    