In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.preprocessing import StandardScaler

import march_madness_classes as mmc

## 1) Confirm Baseline Model is Working

In [2]:
# load the four Kaggle tournament CSVs (results, teams, seeds, slots) in one pass
tournament_data, teams, seeds, slots = map(pd.read_csv, [
    "datasets/kaggle_data/TourneyCompactResults.csv",
    "datasets/kaggle_data/Teams.csv",
    "datasets/kaggle_data/TourneySeeds.csv",
    "datasets/kaggle_data/TourneySlots.csv",
])

In [3]:
# pass each tournament table through mmc.filter_into_seasons; later cells index
# the results by season position (e.g. seeds_arr[i])
games_arr, seeds_arr, slots_arr = [
    mmc.filter_into_seasons(table) for table in (tournament_data, seeds, slots)
]

In [None]:
# load the cached tournament-wins matrix; the first CSV column becomes the row index
tourney_summary = pd.read_csv("datasets/our_data/team_summary_data/tourney_wins_matrix", index_col=0)

In [None]:
# float array shaped like tourney_summary; rows 0 and 1 have no two-row history
# behind them, so they are marked as missing
wins_in_prev_2_year = np.zeros_like(tourney_summary.values, dtype=float)
wins_in_prev_2_year[:2] = np.nan

In [None]:
# every row from index 2 onward is the sum of the two tourney_summary rows directly above it
summary_vals = tourney_summary.values
wins_in_prev_2_year[2:, :] = summary_vals[1:-1, :] + summary_vals[:-2, :]

In [None]:
# wrap the array in a frame that carries tourney_summary's row and column labels
past_resul = pd.DataFrame(
    wins_in_prev_2_year,
    index=tourney_summary.index,
    columns=tourney_summary.columns,
)

In [None]:
# cache the table on disk; a later cell reloads it from this same path
past_resul.to_csv("datasets/our_data/past_results")

In [None]:
# preview the first rows (the first two are NaN by construction)
past_resul.head()

In [None]:
# Build the baseline predictors / response for the 1987-2000 tournaments.
# Presumably the first two column names label the two team-id columns and the
# remaining names pair up, in order, with the data frames in baseline_frames
# (TODO confirm against mmc.generate_multiple_years_of_games).
# NOTE: markov_data, dominance and consistency are only read from disk in the
# "Generate Some New Predictors" section further down; on a fresh kernel that
# loading cell has to run before this one.
baseline_columns = ["min_index_id", "max_index_id", "markov", "dominance", "consistency", "prev_resul"]
# the previous-results predictor is past_resul (built above); no variable named
# wins_in_prev_year_df exists in this notebook
baseline_frames = [markov_data, dominance, consistency, past_resul]
pred, resp = mmc.generate_multiple_years_of_games(range(1987, 2001), seeds_arr, slots_arr, games_arr, baseline_columns, baseline_frames)

In [None]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18 and
# the old sklearn.cross_validation module was removed in 0.20; prefer the new
# location and fall back only on very old installations
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# hold out a random 25% of the row labels for validation
train_index, cross_index = train_test_split(pred.index, test_size = .25)

# slice predictors and responses with the same two sets of row labels
train_x, cross_x = pred.loc[train_index], pred.loc[cross_index]
train_y, cross_y = resp.loc[train_index], resp.loc[cross_index]

In [None]:
# Baseline logistic fit on a single predictor: positional column 2 of the
# design matrix (presumably "markov", going by the column-name list used to build it).
# Series.reshape was deprecated and then removed from pandas, so reshape the
# underlying numpy array to the (n_samples, 1) layout scikit-learn expects.
train_col = train_x.iloc[:, 2].values.reshape(-1, 1)
cross_col = cross_x.iloc[:, 2].values.reshape(-1, 1)

# standardise with statistics from the training split only
scaler = StandardScaler().fit(train_col)

model = LogReg(C = 1)
model.fit(scaler.transform(train_col), train_y.values.T[0])

# hold-out accuracy (displayed as the cell output)
model.score(scaler.transform(cross_col), cross_y)

In [None]:
# same baseline fit, now on every column after the first two
train_features = train_x.iloc[:, 2:]
cross_features = cross_x.iloc[:, 2:]

# scaling statistics come from the training split only
scaler = StandardScaler().fit(train_features)

model = LogReg(C = 1)
model.fit(scaler.transform(train_features), train_y.values.T[0])

# hold-out score is the cell's displayed value
model.score(scaler.transform(cross_features), cross_y)

In [None]:
# coefficients of the most recently fitted model (inputs were standardised before fitting)
model.coef_

---

## 2) Generate Some New Predictors

In [None]:
# load the Kaggle regular-season compact results
regular_seasons = pd.read_csv("datasets/kaggle_data/RegularSeasonCompactResults.csv")

In [None]:
# split via mmc.filter_into_seasons; later cells index the result by season position (regular_data[i])
regular_data = mmc.filter_into_seasons(regular_seasons)

In [4]:
# reload previously computed predictor tables; the first CSV column is the row index
markov_data, consistency, dominance, past_resul = [
    pd.read_csv(cache_path, index_col=0)
    for cache_path in (
        "datasets/our_data/stationary",
        "datasets/our_data/consistency",
        "datasets/our_data/dominance",
        "datasets/our_data/past_results",
    )
]

---

### Convert Seeds to Numeric Matrix

In [None]:
# Convert the per-season seed tables into a numeric (season x team) matrix.
# Rows follow seasons 1985..2016 and columns follow the order of teams["Team_Id"].
# Entries stay NaN for teams with no seed row in that season.
seed_matrix = np.full((2016 - 1985 + 1, teams.shape[0]), np.nan)

for i, year in enumerate(range(1985, 2017)):
    # the season's seed table is the same for every team, so fetch it once per year
    seeds_in_year_i = seeds_arr[i]

    for j, team in enumerate(teams["Team_Id"]):
        team_seed_in_year_i = seeds_in_year_i.loc[seeds_in_year_i["Team"] == team, "Seed"]

        if len(team_seed_in_year_i.values) != 0:
            # characters 1-2 of the seed string hold the numeric seed (presumably
            # a region letter followed by two digits - TODO confirm); convert
            # explicitly instead of relying on numpy's string-to-float cast
            seed_matrix[i, j] = int(team_seed_in_year_i.values[0][1:3])

In [None]:
# label the seed matrix with past_resul's row and column labels, then cache it to disk
seed_matrix_df = pd.DataFrame(
    seed_matrix,
    index=past_resul.index,
    columns=past_resul.columns,
)
seed_matrix_df.to_csv("datasets/our_data/team_summary_data/seeds_matrix")

In [5]:
# reload the cached seed matrix (first CSV column is the row index)
seed_matrix_df = pd.read_csv("datasets/our_data/team_summary_data/seeds_matrix", index_col=0)

---

### Weighted Wins

In [None]:
# weighted wins = \sum (1 / seed of the beaten team), summed over a team's
# regular-season wins against opponents that have a (non-NaN) seed that year
weighted_wins_np = np.zeros(seed_matrix_df.shape)

# iterate years
for i, year in enumerate(range(1985, 2017)):
    # the season's game table does not depend on the team, so fetch it once
    season = regular_data[i]

    # iterate teams
    for j, team in enumerate(teams["Team_Id"]):
        # extract games the team won
        wins = season.loc[season["Wteam"] == team]

        # start with 0
        weighted_total = 0

        # iterate wins
        for index, game in wins.iterrows():
            # seed_matrix_df columns are team ids stored as strings
            l_team_seed = seed_matrix_df.loc[year, str(game["Lteam"])]

            # only seeded losers contribute; logical `not` is used because the
            # bitwise `~` is only correct for numpy booleans (~True == -2 in plain Python)
            if not np.isnan(l_team_seed):
                # ww = 1 / (lteam seed)
                weighted_total = weighted_total + 1./ l_team_seed

        # put into our array
        weighted_wins_np[i, j] = weighted_total

In [None]:
# label the array like seed_matrix_df and cache the result to disk
weighted_wins = pd.DataFrame(
    weighted_wins_np,
    index=seed_matrix_df.index,
    columns=seed_matrix_df.columns,
)
weighted_wins.to_csv("datasets/our_data/weighted_wins")

In [6]:
# reload the cached weighted-wins table (first CSV column is the row index)
weighted_wins = pd.read_csv("datasets/our_data/weighted_wins", index_col=0)

---

### Momentum (Markov * Wins in Last 30 Days)

In [None]:
# momentum = sum of the beaten opponents' markov_data values over a team's wins
# in the final 30 days of the regular season
momentum_np = np.zeros(seed_matrix_df.shape)

# iterate years
for i, year in enumerate(range(1985, 2017)):
    season = regular_data[i]

    # get the last 30 days of the season (identical for every team, so computed once per year)
    max_day = season["Daynum"].max()
    last_month = season[season["Daynum"] >= max_day - 30]

    # iterate teams
    for j, team in enumerate(teams["Team_Id"]):
        # wins
        wins = last_month[last_month["Wteam"] == team]

        # accumulate under a dedicated name: reusing `weighted_wins` here would
        # overwrite the weighted_wins DataFrame that the modelling cells pass in
        # as a predictor
        momentum_total = 0.

        for index, win in wins.iterrows():
            # markov_data columns are team ids stored as strings
            momentum_total = momentum_total + markov_data.loc[year, str(win["Lteam"])]

        momentum_np[i, j] = momentum_total

In [None]:
# label the array like seed_matrix_df and cache the result to disk
momentum = pd.DataFrame(
    momentum_np,
    index=seed_matrix_df.index,
    columns=seed_matrix_df.columns,
)
momentum.to_csv("datasets/our_data/momentum")

In [7]:
# reload the cached momentum table (first CSV column is the row index)
momentum = pd.read_csv("datasets/our_data/momentum", index_col=0)

---

### Close Win Percentage (OT Wins / Wins by <= 1 Basket, i.e. <= 3 Points)

In [None]:
# Per (season, team): number of close wins, number of close games, and the
# share of close games that were won. A game counts as close when the margin
# is at most 3 points or when it went to overtime (Numot > 0).
close_wins_np = np.zeros(seed_matrix_df.shape)
close_games_np = np.zeros(seed_matrix_df.shape)
close_game_win_percetage_np = np.zeros(seed_matrix_df.shape)

# iterate years
for i, year in enumerate(range(1985, 2017)):
    season = regular_data[i]

    # flag the close games once per season instead of re-testing every row per team
    is_close = ((season["Wscore"] - season["Lscore"]) <= 3) | (season["Numot"] > 0)

    # iterate teams
    for j, team in enumerate(teams["Team_Id"]):
        n_close_wins = float((is_close & (season["Wteam"] == team)).sum())
        n_close_losses = float((is_close & (season["Lteam"] == team)).sum())

        close_wins_np[i, j] = n_close_wins
        close_games_np[i, j] = n_close_wins + n_close_losses

        # stays 0 when the team has no close wins (which also avoids 0 / 0)
        if n_close_wins > 0:
            close_game_win_percetage_np[i, j] = n_close_wins / (n_close_wins + n_close_losses)

    # progress marker; the parenthesised form runs on both Python 2 and Python 3
    print(year)

In [None]:
# write the three close-game tables to CSV, labelled like seed_matrix_df
row_labels, col_labels = seed_matrix_df.index, seed_matrix_df.columns

close_games = pd.DataFrame(close_games_np, index=row_labels, columns=col_labels)
close_games.to_csv("datasets/our_data/close_games")

close_wins = pd.DataFrame(close_wins_np, index=row_labels, columns=col_labels)
close_wins.to_csv("datasets/our_data/close_wins")

close_wins_perc = pd.DataFrame(close_game_win_percetage_np, index=row_labels, columns=col_labels)
close_wins_perc.to_csv("datasets/our_data/close_wins_perc")

In [8]:
# reload the three cached close-game tables (first CSV column is the row index)
close_games, close_wins, close_wins_perc = [
    pd.read_csv(cache_path, index_col=0)
    for cache_path in (
        "datasets/our_data/close_games",
        "datasets/our_data/close_wins",
        "datasets/our_data/close_wins_perc",
    )
]

---

### Extract Predictors from Original Data Exploration

In [9]:
# reload the predictor tables saved during the original data exploration
# (first CSV column is the row index)
rpi, bad_losses, tough_wins = [
    pd.read_csv(cache_path, index_col=0)
    for cache_path in (
        "datasets/our_data/rpi",
        "datasets/our_data/bad_losses",
        "datasets/our_data/tough_wins",
    )
]

### Model Head to Head Wins

In [10]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# is provided by sklearn.model_selection from 0.18 onward, so try that first
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [54]:
# Head-to-head data set for the 1987-2000 tournaments, built with scoring_dif = False
# (presumably a win/loss response rather than a point margin - TODO confirm in mmc).
cat_columns = ["min_index_id", "max_index_id", "markov", "dominance", "rpi", "bad_losses", "tough_wins", "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum"]
# one frame per predictor name, in the same order as the names after the two id columns
cat_frames = [markov_data, dominance, rpi, bad_losses, tough_wins, close_wins, close_wins_perc, weighted_wins, past_resul, momentum]

pred_cat, resp_cat = mmc.generate_multiple_years_of_games(
    range(1987, 2001),
    seeds_arr,
    slots_arr,
    games_arr,
    cat_columns,
    cat_frames,
    scoring_dif = False,
)

In [19]:
# Repeated random-split validation of the head-to-head logistic model.
n_splits = 500
# positional columns used as predictors (presumably markov, rpi, bad_losses,
# tough_wins, close_wins and past_resul, if the frame keeps the column-name order
# it was built with - TODO confirm)
feature_cols = [2,4,5,6,7,10]

scores = 0
for i in range(n_splits):
    # use the head-to-head frames built for this section (pred_cat / resp_cat);
    # `pred` / `resp` hold a different data set elsewhere in the notebook
    train_index, cross_index = train_test_split(pred_cat.index, test_size = .25)

    train_x = pred_cat.loc[train_index]
    train_y = resp_cat.loc[train_index]
    cross_x = pred_cat.loc[cross_index]
    cross_y = resp_cat.loc[cross_index]

    # scaling statistics come from the training split only
    scaler = StandardScaler().fit(train_x.iloc[:, feature_cols])

    model = LogReg(C = 10)
    model.fit(scaler.transform(train_x.iloc[:, feature_cols]), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_x.iloc[:, feature_cols]), cross_y)

# mean hold-out accuracy over all splits; the coefficients are from the final split only.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(scores / float(n_splits))
print(model.coef_)

0.734461538462
[[ 0.79104486  0.51043614 -0.32852101  0.06662859  0.08042103  0.01031312]]


In [51]:
# re-import march_madness_classes to pick up edits made to the module on disk
# (reload is a builtin on Python 2 only)
reload(mmc)

<module 'march_madness_classes' from 'march_madness_classes.py'>

--- 

### Model Score Differential

In [66]:
from sklearn.linear_model import LinearRegression as ols
from sklearn.linear_model import Lasso as lasso
from sklearn.linear_model import Ridge as ridge

In [52]:
# Score-differential data set for the 1987-2000 tournaments (scoring_dif = True).
reg_columns = ["min_index_id", "max_index_id", "markov", "dominance", "rpi", "bad_losses", "tough_wins", "close_wins", "close_wins_perc", "weighted_wins", "past_resul", "momentum"]
# one frame per predictor name, in the same order as the names after the two id columns
reg_frames = [markov_data, dominance, rpi, bad_losses, tough_wins, close_wins, close_wins_perc, weighted_wins, past_resul, momentum]

pred, resp = mmc.generate_multiple_years_of_games(
    range(1987, 2001),
    seeds_arr,
    slots_arr,
    games_arr,
    reg_columns,
    reg_frames,
    scoring_dif = True,
)

In [90]:
# Repeated random-split validation of the score-differential model (ordinary least squares).
n_splits = 500

scores = 0
for i in range(n_splits):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x, cross_x = pred.loc[train_index], pred.loc[cross_index]
    train_y, cross_y = resp.loc[train_index], resp.loc[cross_index]

    # predictors are every column after the first two; scale with training statistics only
    train_features = train_x.iloc[:, 2:]
    cross_features = cross_x.iloc[:, 2:]
    scaler = StandardScaler().fit(train_features)

    model = ols()
    model.fit(scaler.transform(train_features), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_features), cross_y)

# mean hold-out score (R^2 for scikit-learn regressors) across splits; the
# coefficients are from the final split only.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(scores / float(n_splits))
print(model.coef_)

0.452780184965
[ 4.75784238  0.38380117  6.7311316   0.05897483  0.7046769   0.75246641
 -1.49653574 -1.0102595  -0.1725533  -0.82996535]


In [64]:
# Repeated random-split validation of the score-differential model (Lasso, default alpha).
n_splits = 500

scores = 0
for i in range(n_splits):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x, cross_x = pred.loc[train_index], pred.loc[cross_index]
    train_y, cross_y = resp.loc[train_index], resp.loc[cross_index]

    # predictors are every column after the first two; scale with training statistics only
    train_features = train_x.iloc[:, 2:]
    cross_features = cross_x.iloc[:, 2:]
    scaler = StandardScaler().fit(train_features)

    model = lasso()
    model.fit(scaler.transform(train_features), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_features), cross_y)

# mean hold-out score (R^2 for scikit-learn regressors) across splits; the
# coefficients are from the final split only.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(scores / float(n_splits))
print(model.coef_)

0.455783430671
[ 5.47783697  0.          3.84944507 -0.78811087  0.         -0.         -0.0559341
  0.          0.          0.        ]


In [67]:
# Repeated random-split validation of the score-differential model (Ridge, default alpha).
n_splits = 500

scores = 0
for i in range(n_splits):
    train_index, cross_index = train_test_split(pred.index, test_size = .25)

    train_x, cross_x = pred.loc[train_index], pred.loc[cross_index]
    train_y, cross_y = resp.loc[train_index], resp.loc[cross_index]

    # predictors are every column after the first two; scale with training statistics only
    train_features = train_x.iloc[:, 2:]
    cross_features = cross_x.iloc[:, 2:]
    scaler = StandardScaler().fit(train_features)

    model = ridge()
    model.fit(scaler.transform(train_features), train_y.values.T[0])
    scores = scores + model.score(scaler.transform(cross_features), cross_y)

# mean hold-out score (R^2 for scikit-learn regressors) across splits; the
# coefficients are from the final split only.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(scores / float(n_splits))
print(model.coef_)

0.456007369246
[ 6.38027814 -1.18758073  6.03088582 -0.67605498  0.13853031  0.36738729
 -1.22640209 -1.45729778 -0.22821256 -0.21286539]
