# Testing Code For Tuning Number of Upsets
- Robert Shaw
- CS109a Project: Data Driven March Madness

In this file, we test the code that allows us to tune the number of predicted upsets. The actual code that allows this to work is found in march_madness_models.py. The ModelPredictor class was updated to accept arguments that tune the prediction by adding bias to the modeled probability.

---

#### 1) Load in our data 

In [9]:
import march_madness_classes as mmc
import march_madness_games as mmg
import march_madness_models as mmm
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as LogReg

In [4]:
# load the raw Kaggle tournament tables: seeds, bracket slots and game results
seeds = pd.read_csv("datasets/kaggle_data/TourneySeeds.csv")
slots = pd.read_csv("datasets/kaggle_data/TourneySlots.csv")
games = pd.read_csv("datasets/kaggle_data/TourneyCompactResults.csv")

# run each table through the project's per-season filter (same order as loaded);
# the results are looked up later in this notebook as `*_arr[year - 1985]`
seeds_arr, slots_arr, games_arr = map(mmg.filter_into_seasons, (seeds, slots, games))

In [5]:
# extract predictors
def read_indexed(path):
    """Read the CSV at `path`, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# one table per predictor computed elsewhere in the project
markov_data = read_indexed("datasets/our_data/stationary")
consistency = read_indexed("datasets/our_data/consistency")
dominance = read_indexed("datasets/our_data/dominance")
past_resul = read_indexed("datasets/our_data/past_results")
rpi = read_indexed("datasets/our_data/rpi")
bad_losses = read_indexed("datasets/our_data/bad_losses")
tough_wins = read_indexed("datasets/our_data/tough_wins")
close_games = read_indexed("datasets/our_data/close_games")
close_wins = read_indexed("datasets/our_data/close_wins")
close_wins_perc = read_indexed("datasets/our_data/close_wins_perc")
momentum = read_indexed("datasets/our_data/momentum")
weighted_wins = read_indexed("datasets/our_data/weighted_wins")

# seed summary table (lives in a separate sub-directory)
seed_matrix_df = read_indexed("datasets/our_data/team_summary_data/seeds_matrix")

In [6]:
# get data into correct format
# NOTE: there are 12 names but only 10 tables -- the two leading "*_index_id"
# names have no matching DataFrame (presumably id columns produced by the
# game generator; confirm in march_madness_games.py).
predictor_names = [
    "min_index_id", "max_index_id",
    "markov", "dominance", "rpi", "bad_losses", "tough_wins",
    "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum",
]

# NOTE: `predictor_dfs` is rebound to a shorter list in section 3 of this notebook.
predictor_dfs = [
    markov_data, dominance, rpi, bad_losses, tough_wins,
    close_wins, close_wins_perc, weighted_wins, past_resul, momentum,
]

---

#### 2) Train a Basic Model for Testing Our Code

In [7]:
# pred and resp to build a model
# training seasons: range() excludes its stop value, so this is 1987 through 2000
train_years = range(1987, 2001)

# column names and the predictor tables they are built from
train_predictor_names = [
    "min_index_id", "max_index_id",
    "markov", "dominance", "rpi", "bad_losses", "tough_wins",
    "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum",
]
train_predictor_dfs = [
    markov_data, dominance, rpi, bad_losses, tough_wins,
    close_wins, close_wins_perc, weighted_wins, past_resul, momentum,
]

pred, resp = mmg.generate_multiple_years_of_games(
    train_years,
    seeds_arr,
    slots_arr,
    games_arr,
    train_predictor_names,
    train_predictor_dfs,
    scoring_dif=False
)

In [10]:
# build a simple model for testing our adjusted model
# Positional indices of the `pred` columns used as model inputs. If `pred` keeps
# the name order passed to generate_multiple_years_of_games, these would be
# markov, rpi, bad_losses, tough_wins, close_wins, past_resul -- TODO confirm.
model_cols = [2, 4, 5, 6, 7, 10]
train_x = pred.iloc[:, model_cols]
train_y = resp.values.T[0]  # first column of the response as a 1-D array

# the scaler is fit on the training games and reused by the predictors later on
scaler = StandardScaler().fit(train_x)

log_reg_model = LogReg(C=10)
log_reg_model.fit(scaler.transform(train_x), train_y)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

---

#### 3) Test Functionality of the Code on 2002
Note: the code that we are testing is in march_madness_models.py

In [190]:
# get data needed
predictors = [
    "min_index_team", "max_index_team",
    "markov_data", "rpi", "bad_losses", "tough_wins", "close_wins", "past_resul",
]

# tables handed to ModelPredictor below; this rebinds the `predictor_dfs`
# name that section 1 assigned a longer list to
predictor_dfs = [markov_data, rpi, bad_losses, tough_wins, close_wins, past_resul]

In [203]:
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
# Import it when available so this cell runs on either interpreter.
try:
    from importlib import reload
except ImportError:
    pass  # Python 2: importlib has no reload, fall back to the builtin

# re-import march_madness_models so edits to the file take effect in this kernel
reload(mmm)

<module 'march_madness_models' from 'march_madness_models.py'>

In [215]:
year = 2002

# the per-season lists are looked up by offset from 1985
season_idx = year - 1985
seeds = seeds_arr[season_idx]
slots = slots_arr[season_idx]
resul = games_arr[season_idx]

# keyword arguments shared by all three head-to-head predictors
common_kwargs = dict(simulation=False, seeds_df=seeds)

# biased model ---> higher-seed bias switched on with a delta of .05; per the
# author's description the log reg probability p becomes p + .05 for upsets
# (the bias itself is implemented in march_madness_models.py)
head_to_head_biased_model = mmm.ModelPredictor(
    log_reg_model,
    scaler,
    predictor_dfs,
    year,
    higher_seed_bias=True,
    higher_seed_bias_delta=.05,
    **common_kwargs
)

# biased model with cooling ----> same bias mechanism, but with a delta of .01
# that is scaled by a per-key cooling factor (p + .01 * cooling_factor).
# NOTE(review): the cooling keys 1..6 presumably are tournament rounds -- confirm.
head_to_head_biased_model_cooling = mmm.ModelPredictor(
    log_reg_model,
    scaler,
    predictor_dfs,
    year,
    higher_seed_bias=True,
    higher_seed_bias_delta=.01,
    cooling={6: 10, 5: 10, 4: 10, 3: 10, 2: -5, 1: -5},
    **common_kwargs
)

# unbiased model ----> predicts straight from the head to head log reg model
head_to_head_unbiased_model = mmm.ModelPredictor(
    log_reg_model,
    scaler,
    predictor_dfs,
    year,
    higher_seed_bias=False,
    **common_kwargs
)

# reference brackets for comparison: the real results and the BasicPredictor baseline
tourney_actual = mmc.Tournament(seeds, slots, mmm.ActualTournament(resul))
tourney_top_seed = mmc.Tournament(seeds, slots, mmm.BasicPredictor())

# predict the tournament with each of the three models
tourney_biased_model = mmc.Tournament(seeds, slots, head_to_head_biased_model)
tourney_biased_model_cooling = mmc.Tournament(seeds, slots, head_to_head_biased_model_cooling)
tourney_unbiased_model = mmc.Tournament(seeds, slots, head_to_head_unbiased_model)

----

#### 4) Results

In [216]:
# predicting the higher seed each game
# Baseline: score the top-seed bracket against the actual 2002 results.
# print_res=True prints the per-round breakdown; the call returns (points, accuracy).
tourney_top_seed.score_tournament(tourney_actual, print_res=True)

Total Points  : 1290

Total Accuracy: 44 / 63 = 0.698412698413
R1    Accuracy: 25 / 32 = 0.78125
R2    Accuracy: 10 / 16 = 0.625
R3    Accuracy: 5 / 8 = 0.625
R4    Accuracy: 2 / 4 = 0.5
R5    Accuracy: 1 / 2 = 0.5
R6    Accuracy: 1 / 1 = 1.0


(1290, 0.6984126984126984)

In [217]:
# predicting with the biased model
# Score the biased-model bracket (delta = .05, no cooling) against the actual results.
tourney_biased_model.score_tournament(tourney_actual, print_res=True)

Total Points  : 740

Total Accuracy: 39 / 63 = 0.619047619048
R1    Accuracy: 24 / 32 = 0.75
R2    Accuracy: 9 / 16 = 0.5625
R3    Accuracy: 4 / 8 = 0.5
R4    Accuracy: 2 / 4 = 0.5
R5    Accuracy: 0 / 2 = 0.0
R6    Accuracy: 0 / 1 = 0.0


(740, 0.6190476190476191)

In [218]:
# predicting with the biased model with cooling
# Score the biased-with-cooling bracket (delta = .01) against the actual results.
tourney_biased_model_cooling.score_tournament(tourney_actual, print_res=True)

Total Points  : 800

Total Accuracy: 40 / 63 = 0.634920634921
R1    Accuracy: 24 / 32 = 0.75
R2    Accuracy: 10 / 16 = 0.625
R3    Accuracy: 3 / 8 = 0.375
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 0 / 2 = 0.0
R6    Accuracy: 0 / 1 = 0.0


(800, 0.6349206349206349)

In [219]:
# predicting with the unbiased model
# Score the plain head-to-head log reg bracket (no bias) against the actual results.
tourney_unbiased_model.score_tournament(tourney_actual, print_res=True)

Total Points  : 720

Total Accuracy: 38 / 63 = 0.603174603175
R1    Accuracy: 24 / 32 = 0.75
R2    Accuracy: 8 / 16 = 0.5
R3    Accuracy: 4 / 8 = 0.5
R4    Accuracy: 2 / 4 = 0.5
R5    Accuracy: 0 / 2 = 0.0
R6    Accuracy: 0 / 1 = 0.0


(720, 0.6031746031746031)

---

#### 5) Compare Models

In [220]:
# check differences with the unbiased model
# Biased bracket scored with the unbiased bracket as the reference, so the
# reported accuracy is the share of games on which the two brackets agree.
tourney_biased_model.score_tournament(tourney_unbiased_model, print_res=True)

Total Points  : 1820

Total Accuracy: 58 / 63 = 0.920634920635
R1    Accuracy: 30 / 32 = 0.9375
R2    Accuracy: 14 / 16 = 0.875
R3    Accuracy: 7 / 8 = 0.875
R4    Accuracy: 4 / 4 = 1.0
R5    Accuracy: 2 / 2 = 1.0
R6    Accuracy: 1 / 1 = 1.0


(1820, 0.9206349206349206)

In [221]:
# check differences between the biased model with cooling and the unbiased model
# The unbiased bracket is the reference here, so accuracy measures agreement.
tourney_biased_model_cooling.score_tournament(tourney_unbiased_model, print_res=True)

Total Points  : 1700

Total Accuracy: 57 / 63 = 0.904761904762
R1    Accuracy: 32 / 32 = 1.0
R2    Accuracy: 13 / 16 = 0.8125
R3    Accuracy: 6 / 8 = 0.75
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 2 / 2 = 1.0
R6    Accuracy: 1 / 1 = 1.0


(1700, 0.9047619047619048)

In [225]:
# check differences between the biased model with cooling and the biased model
# The biased (no cooling) bracket is the reference here, so accuracy measures agreement.
tourney_biased_model_cooling.score_tournament(tourney_biased_model, print_res=True)

Total Points  : 1680

Total Accuracy: 54 / 63 = 0.857142857143
R1    Accuracy: 30 / 32 = 0.9375
R2    Accuracy: 11 / 16 = 0.6875
R3    Accuracy: 7 / 8 = 0.875
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 2 / 2 = 1.0
R6    Accuracy: 1 / 1 = 1.0


(1680, 0.8571428571428571)

In [222]:
# check differences between the biased model and the top seed model
# The top-seed bracket is the reference here, so accuracy measures agreement.
tourney_biased_model.score_tournament(tourney_top_seed, print_res=True)

Total Points  : 1230

Total Accuracy: 51 / 63 = 0.809523809524
R1    Accuracy: 29 / 32 = 0.90625
R2    Accuracy: 11 / 16 = 0.6875
R3    Accuracy: 6 / 8 = 0.75
R4    Accuracy: 4 / 4 = 1.0
R5    Accuracy: 1 / 2 = 0.5
R6    Accuracy: 0 / 1 = 0.0


(1230, 0.8095238095238095)

In [223]:
# check differences between the biased model with cooling and the top seed model
# The top-seed bracket is the reference here, so accuracy measures agreement.
tourney_biased_model_cooling.score_tournament(tourney_top_seed, print_res=True)

Total Points  : 1230

Total Accuracy: 56 / 63 = 0.888888888889
R1    Accuracy: 31 / 32 = 0.96875
R2    Accuracy: 16 / 16 = 1.0
R3    Accuracy: 5 / 8 = 0.625
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 1 / 2 = 0.5
R6    Accuracy: 0 / 1 = 0.0


(1230, 0.8888888888888888)

In [224]:
# check differences between the unbiased model and the top seed model
# The top-seed bracket is the reference here, so accuracy measures agreement.
tourney_unbiased_model.score_tournament(tourney_top_seed, print_res=True)

Total Points  : 1330

Total Accuracy: 56 / 63 = 0.888888888889
R1    Accuracy: 31 / 32 = 0.96875
R2    Accuracy: 13 / 16 = 0.8125
R3    Accuracy: 7 / 8 = 0.875
R4    Accuracy: 4 / 4 = 1.0
R5    Accuracy: 1 / 2 = 0.5
R6    Accuracy: 0 / 1 = 0.0


(1330, 0.8888888888888888)