# Tournament Simulation

- Robert Shaw
- Data Driven March Madness

In this notebook, we test the code written in `march_madness_classes`. The code allows us to train a head-to-head model that predicts the probability of team 1 beating team 2 in a college basketball game, based on a logistic regression model. We run the tournament n times, counting the expected score of each team over the iterations. We then take the total score over the n iterations and predict each head-to-head matchup as arg_max(points_1, points_2).

In [9]:
import march_madness_classes as mmc
import pandas as pd

In [18]:
# Read in the raw Kaggle tournament tables: seeds, bracket slots, and
# compact game results.
seeds = pd.read_csv("datasets/kaggle_data/TourneySeeds.csv")
slots = pd.read_csv("datasets/kaggle_data/TourneySlots.csv")
games = pd.read_csv("datasets/kaggle_data/TourneyCompactResults.csv")

# Split each table with mmc.filter_into_seasons. The resulting sequences are
# indexed later in this notebook as `<name>_arr[year - 1985]`.
seeds_arr = mmc.filter_into_seasons(seeds)
slots_arr = mmc.filter_into_seasons(slots)
# Fixed: this call previously passed the undefined name `resul`, which raises
# NameError on a fresh kernel. The results table is bound to `games` above.
games_arr = mmc.filter_into_seasons(games)

In [19]:
# extract predictors
def _read_indexed(path):
    """Read a CSV file, using its first column as the DataFrame index."""
    return pd.read_csv(path, index_col=0)


# One table per candidate predictor, all read the same way.
markov_data = _read_indexed("datasets/our_data/stationary")
consistency = _read_indexed("datasets/our_data/consistency")
dominance = _read_indexed("datasets/our_data/dominance")
past_resul = _read_indexed("datasets/our_data/past_results")
rpi = _read_indexed("datasets/our_data/rpi")
bad_losses = _read_indexed("datasets/our_data/bad_losses")
tough_wins = _read_indexed("datasets/our_data/tough_wins")
close_games = _read_indexed("datasets/our_data/close_games")
close_wins = _read_indexed("datasets/our_data/close_wins")
close_wins_perc = _read_indexed("datasets/our_data/close_wins_perc")
momentum = _read_indexed("datasets/our_data/momentum")
weighted_wins = _read_indexed("datasets/our_data/weighted_wins")
seed_matrix_df = _read_indexed("datasets/our_data/team_summary_data/seeds_matrix")

In [20]:
# get data into correct format
# predictor_names holds two leading id entries followed by one name per table
# in predictor_dfs, in the same order.
predictor_names = [
    "min_index_id",
    "max_index_id",
    "markov",
    "dominance",
    "rpi",
    "bad_losses",
    "tough_wins",
    "close_wins",
    "close_wins_perc",
    "weighted_wins",
    "past_resul",
    "momentum",
]
predictor_dfs = [
    markov_data,
    dominance,
    rpi,
    bad_losses,
    tough_wins,
    close_wins,
    close_wins_perc,
    weighted_wins,
    past_resul,
    momentum,
]

In [53]:
# Re-import march_madness_classes so that edits made to the module since the
# first import are picked up without restarting the kernel.
# NOTE(review): `reload` is a builtin only on Python 2.
reload(mmc)

<module 'march_madness_classes' from 'march_madness_classes.py'>

In [87]:
# Build the training predictors/responses from the 1987-1998 tournaments
# (the end of the range is exclusive, so 1999 is held out).
train_seasons = range(1987, 1999)
train_columns = [
    "min_index_id", "max_index_id",
    "markov", "dominance", "rpi", "bad_losses", "tough_wins",
    "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum",
]
train_tables = [
    markov_data, dominance, rpi, bad_losses, tough_wins,
    close_wins, close_wins_perc, weighted_wins, past_resul, momentum,
]

pred, resp = mmc.generate_multiple_years_of_games(
    train_seasons,
    seeds_arr,
    slots_arr,
    games_arr,
    train_columns,
    train_tables,
    scoring_dif = False)


In [89]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LogReg

In [90]:
# Positional columns of `pred` used as model features.
# NOTE(review): assuming `pred` keeps the column order of the names passed to
# generate_multiple_years_of_games, these are markov, rpi, bad_losses,
# tough_wins, close_wins and past_resul — TODO confirm.
feature_cols = [2, 4, 5, 6, 7, 10]
train_features = pred.iloc[:, feature_cols]
train_labels = resp.values.T[0]

# Standardise the selected features; the fitted scaler is reused at
# prediction time.
scaler = StandardScaler()
scaler.fit(train_features)

# Logistic regression on the scaled features. The fit call is left as the last
# expression so the fitted estimator is displayed as the cell output.
log_reg_model = LogReg(C = 10)
log_reg_model.fit(scaler.transform(train_features), train_labels)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Test on 1999

In [91]:
# get data needed
# Subtracting 1985 turns a season year into a position in the *_arr sequences.
season_1999 = 1999 - 1985
seeds_1999 = seeds_arr[season_1999]
slots_1999 = slots_arr[season_1999]
games_1999 = games_arr[season_1999]

predictors = [
    "min_index_team",
    "max_index_team",
    "markov_data",
    "rpi",
    "bad_losses",
    "tough_wins",
    "close_wins",
    "past_resul",
]
# NOTE: this rebinds `predictor_dfs`, which an earlier cell defined with ten
# tables; from here on it holds only these six.
predictor_dfs = [
    markov_data,
    rpi,
    bad_losses,
    tough_wins,
    close_wins,
    past_resul,
]

In [279]:
# Re-import march_madness_classes again to pick up further module edits.
# NOTE(review): `reload` is a builtin only on Python 2.
reload(mmc)

<module 'march_madness_classes' from 'march_madness_classes.py'>

In [284]:
# Wrap the fitted logistic model and its scaler as a head-to-head predictor
# for the 1999 season, using the six predictor tables in `predictor_dfs`.
# NOTE(review): `simulation=True` presumably makes each game a random draw from
# the predicted probability rather than a deterministic pick — confirm in
# march_madness_classes.
head_to_head_model_1999 = mmc.ModelPredictor(log_reg_model, scaler, predictor_dfs, 1999, simulation=True)

# Simulator over the 1999 seeds and bracket slots, driven by the predictor.
simulator_1999 = mmc.Simulator(seeds_1999, slots_1999, head_to_head_model_1999)

# Run the tournament 100 times.
# NOTE(review): no random seed is set in this notebook, so results may differ
# between runs if the simulation is stochastic — confirm.
points = simulator_1999.simulate_tournament(100)

# Produce the predicted bracket; must be called after simulate_tournament
# (presumably it relies on the accumulated simulation scores — verify).
bracket = simulator_1999.predict_tournament()

In [303]:
# Bracket built from the recorded 1999 games; the predicted brackets below are
# scored against it. `games_1999` is the same object as games_arr[1999-1985].
actual_model = mmc.ActualTournament(games_1999)
actual_tourney = mmc.Tournament(
    seeds_1999,
    slots_1999,
    actual_model,
    include_scoring_dif=False,
)

In [286]:
# Score the simulated bracket against the actual 1999 tournament. The call
# prints a per-round accuracy breakdown and returns (total points, accuracy).
simulator_1999.score_tournament(actual_tourney)

Total Points  : 1120

Total Accuracy: 40 / 63 = 0.634920634921
R1    Accuracy: 22 / 32 = 0.6875
R2    Accuracy: 9 / 16 = 0.5625
R3    Accuracy: 4 / 8 = 0.5
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 2 / 2 = 1.0
R6    Accuracy: 0 / 1 = 0.0


(1120, 0.6349206349206349)

In [304]:
# Comparison: the same fitted model and scaler, but with simulation=False, run
# once through a plain Tournament instead of the Simulator.
# NOTE(review): presumably simulation=False picks the more likely team in each
# game deterministically — confirm in march_madness_classes.
simple_model = mmc.ModelPredictor(log_reg_model, scaler, predictor_dfs, 1999, simulation=False)
simple_tourney = mmc.Tournament(seeds_1999, slots_1999, simple_model, include_scoring_dif=False)

# Score against the actual 1999 bracket; print_res=True prints the breakdown.
simple_tourney.score_tournament(actual_tourney, print_res=True)

Total Points  : 1100

Total Accuracy: 39 / 63 = 0.619047619048
R1    Accuracy: 22 / 32 = 0.6875
R2    Accuracy: 8 / 16 = 0.5
R3    Accuracy: 4 / 8 = 0.5
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 2 / 2 = 1.0
R6    Accuracy: 0 / 1 = 0.0


(1100, 0.6190476190476191)

In [305]:
# Baseline: a bracket filled in by mmc.BasicPredictor, which takes no fitted
# model or predictor tables.
# NOTE(review): presumably this picks winners from seeding alone — confirm in
# march_madness_classes.
basic_model = mmc.BasicPredictor()
basic_tourney = mmc.Tournament(seeds_1999, slots_1999, basic_model, include_scoring_dif=False)

# Score against the actual 1999 bracket; print_res=True prints the breakdown.
basic_tourney.score_tournament(actual_tourney, print_res=True)

Total Points  : 880

Total Accuracy: 35 / 63 = 0.555555555556
R1    Accuracy: 20 / 32 = 0.625
R2    Accuracy: 8 / 16 = 0.5
R3    Accuracy: 3 / 8 = 0.375
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 1 / 2 = 0.5
R6    Accuracy: 0 / 1 = 0.0


(880, 0.5555555555555556)