In [1]:
%matplotlib inline

import os
import pandas
import numpy
import sklearn
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor




In [2]:
# Input files for the poll-of-polls rankings and the composite streak data.
# Both are pipe-delimited.
polls_file_name, streak_file_name = 'full_bracket.poll_of_polls.csv', 'composite_streak_data.csv'

polls = pandas.read_csv(polls_file_name, sep='|')

# The streak file carries a leftover 'Unnamed: 0' column, which is dropped
# right after loading.
streak = pandas.read_csv(streak_file_name, sep='|')
streak = streak.drop('Unnamed: 0', axis=1)

In [3]:
# Brooklyn is missing from the data somehow - so add them in by hand.
#
# The rows are only added when they are not already present, so re-running
# this cell does not pile up duplicate Brooklyn rows.  pandas.concat is used
# instead of DataFrame.append (deprecated, and removed in pandas 2.0), and the
# streak rows are concatenated in a single call rather than one per iteration.

brooklyn_in_polls = ((polls['Season'] == 2018) & (polls['TeamID'] == 1134)).any()
if not brooklyn_in_polls:
    polls = pandas.concat([polls,
                           pandas.DataFrame({'Season': [2018],
                                             'TeamID': [1134],
                                             'Team': ['Brooklyn'],
                                             'rank': [267],
                                             'norm_rank': [5.0],
                                             'noise': [0.0]})])

# Streak lengths Brooklyn already has a 2018 row for (empty on a fresh load).
brooklyn_streaks = streak[(streak['Season'] == 2018) & (streak['TeamID'] == 1134)]
existing_streak_lens = set(brooklyn_streaks['StreakLen'])

missing_streak_rows = []
for streak_len in [2, 3, 4, 6, 8, 10]:
    if streak_len in existing_streak_lens:
        continue
    # Every hand-made streak row uses the same placeholder value of 0.5.
    missing_streak_rows.append(pandas.DataFrame({'Season': [2018],
                                                 'TeamID': [1134],
                                                 'TeamName': ['Brooklyn'],
                                                 'streak': [0.5],
                                                 'StreakLen': [streak_len]}))

if missing_streak_rows:
    streak = pandas.concat([streak] + missing_streak_rows)


In [4]:
# Reference tables: the team directory plus the 2018 bracket slots and seeds.
# These are read with read_csv's default separator, unlike the pipe-delimited
# poll and streak files.
teams, slots, seeds = (
    pandas.read_csv(csv_path)
    for csv_path in ('data/kaggle_2018_final/DataFiles/Teams.csv',
                     'slots_2018.csv',
                     'seeds_2018.csv')
)


In [9]:
def round_results(r):
    """Decide the winner of every fully-determined game in one round.

    Parameters
    ----------
    r : pandas.DataFrame
        One row per game.  ``StrongTeamID`` and ``WeakTeamID`` are always
        read; ``StrongTeamRank`` and ``WeakTeamRank`` are read for every game
        that is not one of the four hard-coded 2018 play-in games.  Rows in
        which either team ID is missing are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per decided game with the columns ``Season``, ``DayNum``,
        ``WTeamID``, ``WScore``, ``LTeamID``, ``LScore``, ``WLoc``, ``NumOT``,
        ``minID`` and ``maxID``.  ``Season`` is always 2018; ``DayNum``,
        ``WLoc`` and ``NumOT`` hold the placeholders -1, 'X' and -1.

    Notes
    -----
    The four play-in games get their hard-coded scores.  Every other game is
    won by the team with the numerically lower (or equal) rank and is scored
    1-0; on a tie the strong seed wins.  A team whose rank is missing loses to
    a ranked opponent, and when both ranks are missing the strong seed wins.
    (Previously any missing rank made the comparison false, so the weak seed
    always won - even when it was the weak seed's own rank that was missing.)
    """
    season = 2018
    daynum = -1
    wloc = 'X'
    numot = -1

    # Hard-coded play-in results keyed by (minID, maxID); each value is
    # (winner ID, loser ID, winner score, loser score).
    play_in_results = {
        (1382, 1417): (1382, 1417, 65, 58),  # St Bonaventure 65, UCLA 58
        (1134, 1347): (1347, 1134, 71, 61),  # Radford 71, Brooklyn 61
        (1300, 1411): (1411, 1300, 64, 46),  # TX Southern 64, NC Central 46
        (1113, 1393): (1393, 1113, 60, 56),  # Syracuse 60, Arizona St 56
    }

    results = {
        'Season': list(),
        'DayNum': list(),
        'WTeamID': list(),
        'WScore': list(),
        'LTeamID': list(),
        'LScore': list(),
        'WLoc': list(),
        'NumOT': list(),
        'minID': list(),
        'maxID': list()
    }

    for _, row in r.iterrows():
        sid = row['StrongTeamID']
        wid = row['WeakTeamID']

        # A game can only be decided once both participants are known.
        if pandas.isnull(sid) or pandas.isnull(wid):
            continue

        minid = min(sid, wid)
        maxid = max(sid, wid)

        if (minid, maxid) in play_in_results:
            win_id, loss_id, win_sc, loss_sc = play_in_results[(minid, maxid)]
        else:
            strong_rank = row['StrongTeamRank']
            weak_rank = row['WeakTeamRank']

            if pandas.isnull(weak_rank):
                # Weak seed has no rank (or neither team has): strong seed wins.
                strong_wins = True
            elif pandas.isnull(strong_rank):
                # Only the strong seed is unranked: the ranked weak seed wins.
                strong_wins = False
            else:
                # Lower rank number means higher ranked; ties go to the strong seed.
                strong_wins = strong_rank <= weak_rank

            if strong_wins:
                win_id, loss_id = sid, wid
            else:
                win_id, loss_id = wid, sid
            win_sc, loss_sc = 1, 0

        results['Season'].append(season)
        results['DayNum'].append(daynum)
        results['WTeamID'].append(win_id)
        results['WScore'].append(win_sc)
        results['LTeamID'].append(loss_id)
        results['LScore'].append(loss_sc)
        results['WLoc'].append(wloc)
        results['NumOT'].append(numot)
        results['minID'].append(minid)
        results['maxID'].append(maxid)

    return pandas.DataFrame(results)

In [None]:
# Bracket-generation scenarios.  Every entry carries 'alpha', 'beta', 'streak',
# 'model' and an 'output' path prefix; 'penalty' is optional and only present
# (as True) on some entries, so consumers should not assume the key exists.
scenarios_select = [
    # Deterministic scenarios (beta is 0.00 in all of them)
    dict(alpha=0.0, beta=0.00, streak=None, model='p',
         output='data/brackets/bracket_pp'),
    dict(alpha=-0.1, beta=0.00, streak=6, model='a',
         output='data/brackets/bracket_a01_b00_s06'),
    dict(alpha=-0.1, beta=0.00, streak=10, model='a', penalty=True,
         output='data/brackets/bracket_a01_b00_s10_pen'),
    dict(alpha=-0.2, beta=0.00, streak=6, model='a',
         output='data/brackets/bracket_a02_b00_s06'),
    dict(alpha=-0.2, beta=0.00, streak=10, model='a', penalty=True,
         output='data/brackets/bracket_a02_b00_s10_pen'),

    # Random scenarios (non-zero beta)
    dict(alpha=0.0, beta=0.05, streak=None, model='b',
         output='data/brackets/bracket_a00_b05_sNA'),
    dict(alpha=0.0, beta=0.15, streak=None, model='b',
         output='data/brackets/bracket_a00_b15_sNA'),
    dict(alpha=-0.1, beta=0.05, streak=6, model='ab', penalty=True,
         output='data/brackets/bracket_a01_b05_s06_pen'),
    dict(alpha=-0.1, beta=0.05, streak=10, model='ab', penalty=True,
         output='data/brackets/bracket_a01_b05_s10_pen'),
    dict(alpha=-0.1, beta=0.15, streak=6, model='ab', penalty=True,
         output='data/brackets/bracket_a01_b15_s06_pen'),
    dict(alpha=-0.1, beta=0.15, streak=10, model='ab', penalty=True,
         output='data/brackets/bracket_a01_b15_s10_pen'),
]

In [38]:
season = 2018
bracket_file_name = 'data/brackets/bracket_a01_b15_s10_pen_0003.csv'


def _lookup_rank(games, id_column, ranks, rnd):
    """Return the round-`rnd` rank of the team in `id_column`, one value per row of `games`."""
    rank_column = 'Rank_{}'.format(rnd)
    looked_up = games.merge(ranks[['TeamID', rank_column]],
                            left_on=id_column, right_on='TeamID', how='left')
    # Positional values: the result must not depend on the index labels of
    # `games`, which differ from the fresh index that merge() produces.
    return looked_up[rank_column].values


def _fill_from_previous_round(games, previous_round, side):
    """Fill missing `<side>TeamID` values with the winner of the slot named in `<side>Seed`."""
    id_column = side + 'TeamID'
    games = games.merge(previous_round[['Slot', 'WTeamID']],
                        left_on=side + 'Seed', right_on='Slot',
                        how='left', suffixes=['', '__'])
    games.loc[games[id_column].isnull(), id_column] = games['WTeamID']
    return games.drop(['Slot__', 'WTeamID'], axis=1)


def _attach_team_names(games, team_table):
    """Add Strong/Weak/W/L `...TeamName` columns looked up from the matching `...TeamID` columns."""
    for prefix in ['Strong', 'Weak', 'W', 'L']:
        games = games.merge(team_table[['TeamID', 'TeamName']],
                            left_on=[prefix + 'TeamID'], right_on=['TeamID'], how='left')
        # index=str also turns the row labels into strings; kept so the
        # resulting frames carry the same index labels as before.
        games = games.rename(index=str, columns={"TeamName": prefix + 'TeamName'}).drop('TeamID', axis=1)
    return games


bracket_data = pandas.read_csv(bracket_file_name, sep='|')

final_teams = list()

# winners[rnd] receives one list of winning team names per simulated round.
winners = dict([rnd, list()] for rnd in [1, 2, 3, 4, 5, 6])

# initialize: attach the seeded team (if any) to both sides of every slot
df = slots.merge(seeds, left_on=['Season', 'StrongSeed'], right_on=['Season', 'Seed'], how='left')
df = df.rename(index=str, columns={"TeamID": "StrongTeamID"}).drop('Seed', axis=1)
df = df.merge(seeds, left_on=['Season', 'WeakSeed'], right_on=['Season', 'Seed'], how='left')
df = df.rename(index=str, columns={"TeamID": "WeakTeamID"}).drop('Seed', axis=1)

# reduce to the 68 teams in the tourney
# (a real list, so it can be reused and also works where map() is lazy)
seeded_ids = pandas.concat([df['StrongTeamID'], df['WeakTeamID']]).dropna()
teams_68 = sorted(set(int(team_id) for team_id in seeded_ids))
bracket_68 = bracket_data[bracket_data['TeamID'].isin(teams_68)].copy()

rnd = 0
df['StrongTeamRank'] = _lookup_rank(df, 'StrongTeamID', bracket_68, rnd)
df['WeakTeamRank'] = _lookup_rank(df, 'WeakTeamID', bracket_68, rnd)

# play-in games: every slot of the season whose name does not contain 'R'
pi_games = ~(df['Slot'].str.contains('R')) & (df['Season'] == season)
# Explicit copy so the new columns are written to an independent frame
# instead of a slice of df (avoids SettingWithCopyWarning).
pi = df[pi_games].copy()

pi['minID'] = pi[['StrongTeamID', 'WeakTeamID']].min(axis=1)
pi['maxID'] = pi[['StrongTeamID', 'WeakTeamID']].max(axis=1)

results = round_results(pi)

pi = pi.merge(results, on=['Season', 'minID', 'maxID'], how='left')
pi = _attach_team_names(pi, teams)

# regular rounds; rounds[0] is the play-in round, rounds[n] is round n
rounds = [pi]

for rnd in range(1, 6 + 1):
    last_rnd = rounds[-1]

    r_games = (df['Slot'].str.contains('R{}..'.format(rnd))) & (df['Season'] == season)
    r = df[r_games]

    # Sides without a seeded team are taken by the winner of the feeding slot.
    r = _fill_from_previous_round(r, last_rnd, 'Strong')
    r = _fill_from_previous_round(r, last_rnd, 'Weak')

    r['minID'] = r[['StrongTeamID', 'WeakTeamID']].min(axis=1)
    r['maxID'] = r[['StrongTeamID', 'WeakTeamID']].max(axis=1)

    # make sure to use the ranking for the correct round #
    r['StrongTeamRank'] = _lookup_rank(r, 'StrongTeamID', bracket_68, rnd)
    r['WeakTeamRank'] = _lookup_rank(r, 'WeakTeamID', bracket_68, rnd)

    results = round_results(r)

    r = r.merge(results, on=['Season', 'minID', 'maxID'], how='left')
    r = _attach_team_names(r, teams)

    # Appended as one list per round, so winners[rnd] is a list of lists.
    winners[rnd].append(list(r['WTeamName']))

    rounds.append(r.copy())




In [39]:
# Print one line per simulated game: round number, winner, loser.
# rounds[0] is the play-in round and rounds[1..6] are the regular rounds.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
for rnd, games in enumerate(rounds):
    for idx, row in games[['WTeamName', 'LTeamName']].iterrows():
        print('{:3}\t{:20}\t{:20}'.format(rnd, row['WTeamName'], row['LTeamName']))


  0	TX Southern         	NC Central          
  0	St Bonaventure      	UCLA                
  0	Radford             	Brooklyn            
  0	Syracuse            	Arizona St          
  1	Xavier              	TX Southern         
  1	North Carolina      	Lipscomb            
  1	Michigan            	Montana             
  1	Gonzaga             	UNC Greensboro      
  1	Ohio St             	S Dakota St         
  1	Houston             	San Diego St        
  1	Texas A&M           	Providence          
  1	Florida St          	Missouri            
  1	Virginia            	UMBC                
  1	Cincinnati          	Georgia St          
  1	Tennessee           	Wright St           
  1	Arizona             	Buffalo             
  1	Kentucky            	Davidson            
  1	Miami FL            	Loyola-Chicago      
  1	Texas               	Nevada              
  1	Creighton           	Kansas St           
  1	Villanova           	Radford             
  1	Purdue              	CS Fuller

In [14]:
# Winners recorded for round 5.  The simulation cell appends each round's
# winner names as a single list, so the value displayed is a list of lists.
winners[5]

[['Cincinnati', 'Villanova']]