# Baseline Modeling
#### Robert Shaw
#### CS109a Project: Data Driven March Madness

---

### Load in Our Datasets and Functions

In [1]:
import march_madness_classes as mmc

# import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
teams = pd.read_csv("datasets/kaggle_data/Teams.csv")
seeds = pd.read_csv("datasets/kaggle_data/TourneySeeds.csv")
slots = pd.read_csv("datasets/kaggle_data/TourneySlots.csv")
tourney_data = pd.read_csv("datasets/kaggle_data/TourneyCompactResults.csv")
regular_data = pd.read_csv("datasets/kaggle_data/RegularSeasonCompactResults.csv")

In [3]:
tourney_arr = mmc.filter_into_seasons(tourney_data)
regular_arr = mmc.filter_into_seasons(regular_data)
seeds_arr = mmc.filter_into_seasons(seeds)
slots_arr = mmc.filter_into_seasons(slots)

seeds_1985 = seeds_arr[0]
slots_1985 = slots_arr[0]
tourney_1985 = tourney_arr[0]
regular_1985 = regular_arr[0]

---

### Transform the Stationary Distibution Data Into One DataFrame/CSV File

In [4]:
stationary_arr = []
for i in range(1985, 2017):
    df = pd.read_csv("datasets/our_data/markov_data/{}_stationary_distribution".format(i), index_col = 0)
    df.columns = [i]
    stationary_arr.append(df.transpose()) 

In [5]:
# concat into a 
stationary_df = pd.concat(stationary_arr)

In [6]:
# save as a csv
stationary_df.to_csv("datasets/our_data/stationary")

In [7]:
# everything appears to be working if they sum to one
stationary_df.sum(axis=1)

1985    1.0
1986    1.0
1987    1.0
1988    1.0
1989    1.0
1990    1.0
1991    1.0
1992    1.0
1993    1.0
1994    1.0
1995    1.0
1996    1.0
1997    1.0
1998    1.0
1999    1.0
2000    1.0
2001    1.0
2002    1.0
2003    1.0
2004    1.0
2005    1.0
2006    1.0
2007    1.0
2008    1.0
2009    1.0
2010    1.0
2011    1.0
2012    1.0
2013    1.0
2014    1.0
2015    1.0
2016    1.0
dtype: float64

---

### Get Predictors For Our Baseline Model

In [8]:
stationary = pd.read_csv("datasets/our_data/stationary", index_col =0)
avg_points_against =  pd.read_csv("datasets/our_data/avg_points_against", index_col =0)
avg_points_for = pd.read_csv("datasets/our_data/avg_points_for", index_col =0)
away_wins = pd.read_csv("datasets/our_data/away_wins", index_col =0)
bad_losses = pd.read_csv("datasets/our_data/bad_losses", index_col =0)
consistency= pd.read_csv("datasets/our_data/consistency", index_col =0)
dominance= pd.read_csv("datasets/our_data/dominance", index_col =0)
good_wins_matrix= pd.read_csv("datasets/our_data/good_wins_matrix", index_col =0)
rpi= pd.read_csv("datasets/our_data/rpi", index_col =0)
tough_wins= pd.read_csv("datasets/our_data/tough_wins", index_col =0)
win_percentage= pd.read_csv("datasets/our_data/win_percentage", index_col =0)
win_percentage_vs_tourney_teams_matrix= pd.read_csv("datasets/our_data/win_percentage_vs_tourney_teams_matrix", index_col =0)
wins_vs_tourney_teams= pd.read_csv("datasets/our_data/wins_vs_tourney_teams", index_col =0)

---

### Functions to go into our March Madness Classes Module

In [9]:
# extracting from the dataframe
def get_predictor(team_id, year, df):
    return df.loc[year, str(team_id)]

def get_predictor_dif(team_id_1, team_id_2, year, df):
    return df.loc[year, str(team_id_1)] - df.loc[year, str(team_id_2)]

def get_predictors(team_id, year, df_arr):
    row = np.zeros(len(df_arr))
    i = 0
    for df in df_arr:
        row[i] = get_predictor(team_id, year, df)
        i = i + 1
    return row

def get_predictors_dif(team_id_1, team_id_2, year, df_arr):
    row = np.zeros(len(df_arr))
    i = 0
    for df in df_arr:
        row[i] = float(get_predictor_dif(team_id_1, team_id_2, year, df))
        i = i + 1
    return row

In [10]:
# function to extract the y_values of the team with the min index winning
def extract_response(tourney_game_df):
    
    # response for a given year
    min_index_win = np.zeros(tourney_game_df.shape[0])
    i = 0
    for index, game in tourney_game_df.iterrows():
        if int(game["Prediction"]) == min(int(game["Strongseed Team"]), int(game["Weakseed Team"])):
            min_index_win[i] = 1 
        i = i + 1
        
    
    return min_index_win

# function to extract the y_values of the team with the min index winning
def extract_predictors(tourney_game_df, predictor_list, predictor_dfs, year):
    # buffer to hold our values
    pred_matrix = np.zeros((tourney_game_df.shape[0], len(predictor_list)))
    
    # fill predictor matrix
    for i in range(tourney_game_df.shape[0]):   
        # min and max index teams
        min_index_team = min(int(tourney_game_df.loc[i, "Strongseed Team"]), int(tourney_game_df.loc[i, "Weakseed Team"]))
        max_index_team = max(int(tourney_game_df.loc[i, "Strongseed Team"]), int(tourney_game_df.loc[i, "Weakseed Team"]))                  

        # fill matrix
        pred_matrix[i,  0] = min_index_team
        pred_matrix[i,  1] = max_index_team
        pred_matrix[i, 2:] = get_predictors_dif(min_index_team, max_index_team, year, predictor_dfs)

    # gen dataframe                       
    pred_df = pd.DataFrame(data = pred_matrix, columns = predictor_list)
                           
    return pred_df

In [11]:
def get_tourney_results(seeds, slots, raw_data):
    tourney = mmc.Tournament(seeds, slots, mmc.ActualTournament(raw_data))
    tourney.simulate_tournament()
    return tourney.entire_bracket

# get single years worth of games
def generate_single_year_of_games(year, seed_list, slot_list, tourney_data, predictors, predictor_dfs):
    # get results of the games
    tourney_results = get_tourney_results(seed_list, slot_list, tourney_data)
    
    # get predictors
    pred_df = extract_predictors(tourney_results, predictors, predictor_dfs, year)
    
    # get response
    resp_arr = extract_response(tourney_results)
    
    return pred_df, resp_arr

def generate_multiple_years_of_games(years, seed_list_arr, slot_list_arr, tourney_data_arr, predictors, predictor_dfs):
    min_year = 1985

    preds = pd.DataFrame({})
    resps = np.array([])
    
    for year in years:
        year_index = int(year) - min_year
        # generate 1 year of data
        pred_df, resp_arr = generate_single_year_of_games(year, 
                                                          seed_list_arr[year_index], 
                                                          slot_list_arr[year_index], 
                                                          tourney_data_arr[year_index],
                                                          predictors,
                                                          predictor_dfs)
        print year
        # add to list we are keeping 
        preds = pd.concat([preds, pred_df])
        resps = np.concatenate((resps, resp_arr))
        
    return preds, resps
        

In [12]:
column_names = ["min_index_id", "max_index_id",
"bad_losses dif",
"consistency dif",
"dominance dif",
"rpi dif",
"stationary dif",
"wins_vs_tourney_teams dif"]

In [13]:
tourney_arr = mmc.filter_into_seasons(tourney_data)
regular_arr = mmc.filter_into_seasons(regular_data)
seeds_arr = mmc.filter_into_seasons(seeds)
slots_arr = mmc.filter_into_seasons(slots)

In [14]:
predictor_dfs = [bad_losses, consistency, dominance, rpi, stationary, wins_vs_tourney_teams]

In [15]:
pred, resp = generate_multiple_years_of_games(range(1985, 2001), seeds_arr, slots_arr, tourney_arr, column_names, predictor_dfs)

1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000


In [16]:
pred_df = pred.reset_index(drop=True)

In [17]:
resp_df = pd.DataFrame(data=resp, columns=["min_index_win"])

In [18]:
print pred_df.index
print resp_df.index

RangeIndex(start=0, stop=1008, step=1)
RangeIndex(start=0, stop=1008, step=1)


In [19]:
from sklearn.cross_validation import train_test_split

In [20]:
train_index, cross_index = train_test_split(pred_df.index, test_size = .25)

In [21]:
train_x = pred_df.loc[train_index]
train_y = resp_df.loc[train_index]
cross_x = pred_df.loc[cross_index]
cross_y = resp_df.loc[cross_index]

In [22]:
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

In [23]:
logistic_model = LogReg(C=100)
lda_model = LDA()
qda_model = QDA()

In [24]:
model = logistic_model
model.fit(train_x.iloc[:, 2:], train_y.values.T[0])
model.score(cross_x.iloc[:, 2:], cross_y)

0.74206349206349209

In [25]:
model = lda_model
model.fit(train_x.iloc[:, 2:], train_y.values.T[0])
model.score(cross_x.iloc[:, 2:], cross_y)

0.76190476190476186

### Package Into a Model

In [26]:
class ModelPredictor(object):
    # init function
    def __init__(self, model, dfs_arr, year):
        self.model = model
        self.dfs_arr = dfs_arr
        self.year = year
        return
    
    # head to head predicitons
    def predict(self, team_1, team_2):
        team_1 = int(team_1)
        team_2 = int(team_2)
        
        # min and max index
        min_index_team = min(team_1, team_2)
        max_index_team = max(team_1, team_2)
        
        # get the x values
        row = get_predictors_dif(min_index_team, max_index_team, self.year, self.dfs_arr)

        # predict under model
        y_hat = model.predict(row.reshape(1,-1))
        
        if y_hat == 1:
            return min_index_team
        else:
            return max_index_team
        

In [42]:
year = 1997

seeds = seeds_arr[year - 1985]
slots = slots_arr[year - 1985]
resul = tourney_arr[year - 1985]

tourney_actual = mmc.Tournament(seeds, slots, mmc.ActualTournament(resul))
tourney_actual.simulate_tournament()

tourney_top_seed = mmc.Tournament(seeds, slots, mmc.BasicPredictor())
tourney_top_seed.simulate_tournament()

tourney_model = mmc.Tournament(seeds, slots, ModelPredictor(logistic_model, predictor_dfs, year))
tourney_model.simulate_tournament()

In [43]:
tourney_model.score_model(tourney_actual, print_res=True)

Total Points  : 960

Total Accuracy: 41 / 63 = 0.650793650794
R1    Accuracy: 24 / 32 = 0.75
R2    Accuracy: 10 / 16 = 0.625
R3    Accuracy: 3 / 8 = 0.375
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 1 / 2 = 0.5
R6    Accuracy: 0 / 1 = 0.0


(960, 0.6507936507936508)

In [44]:
tourney_top_seed.score_model(tourney_actual, print_res=True)

Total Points  : 870

Total Accuracy: 42 / 63 = 0.666666666667
R1    Accuracy: 25 / 32 = 0.78125
R2    Accuracy: 9 / 16 = 0.5625
R3    Accuracy: 5 / 8 = 0.625
R4    Accuracy: 3 / 4 = 0.75
R5    Accuracy: 0 / 2 = 0.0
R6    Accuracy: 0 / 1 = 0.0


(870, 0.6666666666666666)

In [46]:
score_mod = np.zeros(len(range(1985, 2001)))
accur_mod = np.zeros(len(range(1985, 2001)))

score_top = np.zeros(len(range(1985, 2001)))
accur_top = np.zeros(len(range(1985, 2001)))

In [53]:
i = 0
for year in range(1985, 2001):
    seeds = seeds_arr[year - 1985]
    slots = slots_arr[year - 1985]
    resul = tourney_arr[year - 1985]

    tourney_actual = mmc.Tournament(seeds, slots, mmc.ActualTournament(resul))
    tourney_actual.simulate_tournament()

    tourney_top_seed = mmc.Tournament(seeds, slots, mmc.BasicPredictor())
    tourney_top_seed.simulate_tournament()

    tourney_model = mmc.Tournament(seeds, slots, ModelPredictor(lda_model, predictor_dfs, year))
    tourney_model.simulate_tournament()
    
    score_mod[i], accur_mod[i] = tourney_model.score_model(tourney_actual, print_res=False)
    score_top[i], accur_top[i] = tourney_top_seed.score_model(tourney_actual, print_res=False)
    
    i = i + 1

In [54]:
print np.mean(score_mod)
print np.mean(accur_mod)

913.75
0.64880952381


In [55]:
print np.mean(score_top)
print np.mean(accur_top)

890.0
0.64880952381
