In [1]:
%matplotlib inline

import os
import pandas
import numpy
import sklearn
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor



In [2]:
# Reference tables used throughout the notebook, loaded in one pass:
#   teams - read for its TeamID / TeamName columns
#   slots - bracket slots, joined to seeds on Season + StrongSeed / WeakSeed
#   seeds - Season / Seed / TeamID assignments
teams, slots, seeds = (
    pandas.read_csv(csv_path)
    for csv_path in (
        'data/kaggle_2018_final/DataFiles/Teams.csv',
        'slots_2018.csv',
        'seeds_2018.csv',
    )
)

In [3]:
def predicted_round_results(r):
    """Predict the winner of every fully-determined game in one round.

    Parameters
    ----------
    r : pandas.DataFrame
        One row per game with the columns ``StrongTeamID``, ``WeakTeamID``,
        ``StrongTeamRank`` and ``WeakTeamRank``.  Rows in which either team
        ID is missing (NaN or None) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per predicted game with the columns ``Season``, ``DayNum``,
        ``WTeamID``, ``WScore``, ``LTeamID``, ``LScore``, ``WLoc``, ``NumOT``,
        ``minID`` and ``maxID``.  ``minID`` / ``maxID`` are the smaller and
        larger of the two team IDs, so callers can join the result back onto
        their own frame.  An empty input yields an empty frame with the same
        columns.

    Notes
    -----
    * The four hard-coded play-in match-ups use their recorded scores.
    * Every other game goes to the team with the lower (or equal) rank
      value; ties therefore favour the strong seed.  The score is a dummy
      1-0.  If either rank is NaN the ``<=`` comparison is False, so the
      weak seed is recorded as the winner.
    * ``Season``, ``DayNum``, ``WLoc`` and ``NumOT`` are constant filler
      values (2018, -1, 'X', -1).
    """
    season = 2018
    daynum = -1
    wloc = 'X'
    numot = -1

    columns = ['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore',
               'WLoc', 'NumOT', 'minID', 'maxID']

    # Play-in results keyed by (minID, maxID):
    #   (winner ID, loser ID, winner score, loser score)
    play_in_results = {
        # UCLA: 1417 (58), St Bonaventure: 1382 (65)
        (1382, 1417): (1382, 1417, 65, 58),
        # Radford: 1347 (71), Brooklyn: 1134 (61)
        (1134, 1347): (1347, 1134, 71, 61),
        # TX Southern: 1411 (64), NC Central: 1300 (46)
        (1300, 1411): (1411, 1300, 64, 46),
        # Arizona St: 1113 (56), Syracuse: 1393 (60)
        (1113, 1393): (1393, 1113, 60, 56),
    }

    records = []
    for _, row in r.iterrows():
        sid = row['StrongTeamID']
        wid = row['WeakTeamID']

        # pandas.isnull copes with NaN *and* None; the previous
        # ``~numpy.isnan(...)`` relied on bitwise inversion of a numpy bool
        # and raised TypeError for non-float missing values.
        if pandas.isnull(sid) or pandas.isnull(wid):
            continue

        minid = min(sid, wid)
        maxid = max(sid, wid)

        known = play_in_results.get((minid, maxid))
        if known is not None:
            win_id, loss_id, win_sc, loss_sc = known
        elif row['StrongTeamRank'] <= row['WeakTeamRank']:
            # Strong seed is higher ranked and wins
            win_id, loss_id, win_sc, loss_sc = sid, wid, 1, 0
        else:
            # Weak seed is higher ranked and wins
            win_id, loss_id, win_sc, loss_sc = wid, sid, 1, 0

        records.append((season, daynum, win_id, win_sc, loss_id, loss_sc,
                        wloc, numot, minid, maxid))

    return pandas.DataFrame(records, columns=columns)

In [4]:
# Simulate the whole bracket round by round.
#
# Outputs consumed by later cells:
#   rounds  - list of DataFrames; rounds[0] holds the play-in games and
#             rounds[n] the games of round n (1-6), each with predicted
#             WTeamID / LTeamID and the matching team names
#   winners - dict: round number -> list holding that round's winner names
season = 2018
bracket_file_name = 'data/brackets/bracket_a01_b15_s10_pen_0003.csv'

bracket_data = pandas.read_csv(bracket_file_name, sep='|')

final_teams = list()

winners = {round_number: list() for round_number in [1, 2, 3, 4, 5, 6]}


def _attach_lookup(frame, lookup, key_col, value_col, new_name):
    """Left-join ``lookup[value_col]`` onto ``frame`` as column ``new_name``.

    Rows are matched on ``frame[key_col] == lookup['TeamID']``; the helper
    column ``TeamID`` brought in by the join is dropped again.
    """
    merged = frame.merge(lookup[['TeamID', value_col]],
                         left_on=key_col, right_on='TeamID', how='left')
    return merged.rename(index=str, columns={value_col: new_name}).drop('TeamID', axis=1)


def _attach_team_names(frame):
    """Add StrongTeamName, WeakTeamName, WTeamName and LTeamName from ``teams``."""
    for role in ('Strong', 'Weak', 'W', 'L'):
        frame = _attach_lookup(frame, teams, role + 'TeamID', 'TeamName', role + 'TeamName')
    return frame


def _seat_previous_winners(frame, previous_round, seed_col, team_col):
    """Fill empty ``team_col`` entries with the winner of the feeding slot.

    ``frame[seed_col]`` names the slot of the previous round whose winner
    plays here; team IDs that are already known are left untouched.
    """
    merged = frame.merge(previous_round[['Slot', 'WTeamID']],
                         left_on=seed_col, right_on='Slot', how='left',
                         suffixes=('', '__'))
    merged.loc[merged[team_col].isnull(), team_col] = merged['WTeamID']
    return merged.drop(['Slot__', 'WTeamID'], axis=1)


# initialize: resolve the strong / weak seed of every slot to a team ID
df = slots.merge(seeds, left_on=['Season', 'StrongSeed'], right_on=['Season', 'Seed'], how='left')
df = df.rename(index=str, columns={"TeamID": "StrongTeamID"}).drop('Seed', axis=1)
df = df.merge(seeds, left_on=['Season', 'WeakSeed'], right_on=['Season', 'Seed'], how='left')
df = df.rename(index=str, columns={"TeamID": "WeakTeamID"}).drop('Seed', axis=1)

# reduce to the 68 teams in the tourney
# (a real list, so it can be iterated more than once on Python 2 and 3 alike)
seeded_ids = list(df['StrongTeamID']) + list(df['WeakTeamID'])
teams_68 = [int(team_id) for team_id in set(seeded_ids) if not numpy.isnan(team_id)]
bracket_68 = bracket_data[bracket_data['TeamID'].isin(teams_68)].copy()

rnd = 0
rank_col = 'Rank_{}'.format(rnd)
df = _attach_lookup(df, bracket_68, 'StrongTeamID', rank_col, 'StrongTeamRank')
df = _attach_lookup(df, bracket_68, 'WeakTeamID', rank_col, 'WeakTeamRank')

# play-in games: every slot of this season whose name does not contain 'R'
pi_games = ~(df['Slot'].str.contains('R')) & (df['Season'] == season)
pi = df[pi_games].copy()

# pi holds exactly the rows selected by pi_games, so assign the columns directly
pi['minID'] = pi[['StrongTeamID', 'WeakTeamID']].min(axis=1)
pi['maxID'] = pi[['StrongTeamID', 'WeakTeamID']].max(axis=1)

results = predicted_round_results(pi)

pi = pi.merge(results, on=['Season', 'minID', 'maxID'], how='left')
pi = _attach_team_names(pi)

# regular rounds
rounds = [pi]

for rnd in range(1, 6 + 1):
    last_rnd = rounds[-1]

    r_games = (df['Slot'].str.contains('R{}..'.format(rnd))) & (df['Season'] == season)
    r = df[r_games].copy()

    # teams that were not seeded straight into this slot come from the
    # winners of the previous round
    r = _seat_previous_winners(r, last_rnd, 'StrongSeed', 'StrongTeamID')
    r = _seat_previous_winners(r, last_rnd, 'WeakSeed', 'WeakTeamID')

    r['minID'] = r[['StrongTeamID', 'WeakTeamID']].min(axis=1)
    r['maxID'] = r[['StrongTeamID', 'WeakTeamID']].max(axis=1)

    # make sure to use the ranking for the correct round #
    rank_col = 'Rank_{}'.format(rnd)
    round_ranks = bracket_68[['TeamID', rank_col]]
    r['StrongTeamRank'] = r.merge(round_ranks, left_on='StrongTeamID', right_on='TeamID', how='left')[rank_col]
    r['WeakTeamRank'] = r.merge(round_ranks, left_on='WeakTeamID', right_on='TeamID', how='left')[rank_col]

    results = predicted_round_results(r)

    r = r.merge(results, on=['Season', 'minID', 'maxID'], how='left')
    r = _attach_team_names(r)

    winners[rnd].append(list(r['WTeamName']))

    rounds.append(r.copy())

In [5]:
# Flatten the per-round frames in `rounds` into one table of predicted games,
# then decorate every game with winner / loser names and seeds.
bracket_cols = ['Slot', 'StrongTeamID', 'WeakTeamID', 'WTeamID', 'LTeamID']

per_round = []
for rnd in [0, 1, 2, 3, 4, 5, 6]:
    round_games = rounds[rnd][bracket_cols].copy()
    round_games.insert(0, 'Round', rnd)
    per_round.append(round_games)

# ignore_index gives a fresh 0..n-1 index in round order
predicted_bracket = pandas.concat(per_round, ignore_index=True)

# team IDs come out of the merges as floats; store them as integers
for id_col in ['StrongTeamID', 'WeakTeamID', 'WTeamID', 'LTeamID']:
    predicted_bracket[id_col] = predicted_bracket[id_col].astype(int)

# (lookup table, key column in predicted_bracket, lookup column, new column name)
# These are inner joins, so a game whose team is missing from a lookup table is dropped.
for lookup, key_col, value_col, new_name in [
        (teams, 'WTeamID', 'TeamName', 'WTeamName'),
        (teams, 'LTeamID', 'TeamName', 'LTeamName'),
        (seeds, 'WTeamID', 'Seed', 'WSeed'),
        (seeds, 'LTeamID', 'Seed', 'LSeed')]:
    predicted_bracket = predicted_bracket.merge(lookup[['TeamID', value_col]], left_on=key_col, right_on='TeamID')
    predicted_bracket = predicted_bracket.rename(index=str, columns={value_col: new_name}).drop('TeamID', axis=1)

# a predicted upset is a game in which the weak seed is the predicted winner
upsets = (predicted_bracket['WTeamID'] == predicted_bracket['WeakTeamID'])
predicted_bracket['PredictedUpset'] = upsets

pcols = ['Round', 'Slot', 'WSeed', 'WTeamID', 'WTeamName', 'LSeed', 'LTeamID', 'LTeamName', 'PredictedUpset']
predicted_bracket[predicted_bracket['Round'] == 5][pcols].sort_values(['Round', 'Slot'])

Unnamed: 0,Round,Slot,WSeed,WTeamID,WTeamName,LSeed,LTeamID,LTeamName,PredictedUpset
26,5,R5WX,X02,1153,Cincinnati,W01,1462,Xavier,True
40,5,R5YZ,Y01,1437,Villanova,Z03,1277,Michigan St,False


In [6]:
# Actual (and projected) 2018 tournament games, one row per slot.
ncaa_2018 = pandas.read_excel('results_2018.xlsx', sheet_name='slots_2018')

# Derive the round from the slot label: slots containing 'R<n>' belong to
# round n, everything else stays at round 0.
ncaa_2018['Round'] = 0
for round_number in range(1, 6 + 1):
    in_this_round = ncaa_2018['Slot'].str.contains('R{}'.format(round_number))
    ncaa_2018.loc[in_this_round, 'Round'] = round_number

# An upset is a game won by the weak seed.
upsets = (ncaa_2018['WTeamID'] == ncaa_2018['WeakTeamID'])
ncaa_2018['Upset'] = False
ncaa_2018.loc[upsets, 'Upset'] = True

# Attach winner / loser names and seeds.
# Each entry: (lookup table, key column in ncaa_2018, lookup column, new column name)
decorations = [
    (teams, 'WTeamID', 'TeamName', 'WTeamName'),
    (teams, 'LTeamID', 'TeamName', 'LTeamName'),
    (seeds, 'WTeamID', 'Seed', 'WSeed'),
    (seeds, 'LTeamID', 'Seed', 'LSeed'),
]
for lookup, key_col, value_col, new_name in decorations:
    joined = ncaa_2018.merge(lookup[['TeamID', value_col]], left_on=key_col, right_on='TeamID')
    ncaa_2018 = joined.rename(index=str, columns={value_col: new_name}).drop('TeamID', axis=1)

acols = ['Round', 'Slot', 'WSeed', 'WTeamID', 'WTeamName', 'LSeed', 'LTeamID', 'LTeamName', 'Upset']
ncaa_2018[ncaa_2018['Round'] == 5][acols + ['Projected']].sort_values(['Round', 'Slot'])

Unnamed: 0,Round,Slot,WSeed,WTeamID,WTeamName,LSeed,LTeamID,LTeamName,Upset,Projected
9,5,R5WX,W01,1462,Xavier,X16,1420,UMBC,False,True
41,5,R5YZ,Y01,1437,Villanova,Z01,1242,Kansas,False,True


In [23]:
# Score the predicted bracket against the games played so far.
#
# Points for a correct pick, keyed by round (play-in games score nothing).
weights = {
    0: 0,
    1: 1,
    2: 2,
    3: 4,
    4: 8,
    5: 16,
    6: 32
}

# Line every predicted game up with the actual (non-projected) result of the
# same slot; columns present on both sides get the suffix '_actual' on the
# actual side.
in_progress_results = predicted_bracket.merge(ncaa_2018[ncaa_2018['Projected'] == False], on='Slot', suffixes=('', '_actual'), how='left')

# A game counts as played once the actual result carries a StrongTeamScore.
played = ~in_progress_results['StrongTeamScore'].isnull()
games = (in_progress_results['WTeamID'] == in_progress_results['WTeamID_actual']) & played
in_progress_results.loc[games, 'Correct'] = True
games = (in_progress_results['WTeamID'] != in_progress_results['WTeamID_actual']) & played
in_progress_results.loc[games, 'Correct'] = False

# Weight of each game's round, computed once and reused for all three columns.
round_weight = in_progress_results['Round'].map(weights)

in_progress_results['PointsEarned'] = round_weight
in_progress_results.loc[in_progress_results['Correct'] != True, 'PointsEarned'] = 0

# A pick stays "possible" for as long as the picked winner has not been
# knocked out: walk the rounds in order, carrying forward the picked winners
# that are not known to be wrong (unplayed games have Correct == NaN, which
# also compares != False).
in_progress_results.loc[in_progress_results['Round'].isin([0, 1]), 'WTeamPossible'] = True
not_wrong = in_progress_results['Correct'] != False
teams_remaining = in_progress_results.loc[not_wrong & (in_progress_results['Round'] == 1), 'WTeamID']
for rnd in range(2, 6+1):
    this_round = in_progress_results['Round'] == rnd
    in_progress_results.loc[this_round, 'WTeamPossible'] = in_progress_results['WTeamID'].isin(teams_remaining)

    still_possible = in_progress_results['WTeamPossible'] == True
    teams_remaining = in_progress_results.loc[not_wrong & this_round & still_possible, 'WTeamID']

in_progress_results['PointsPossible'] = round_weight
in_progress_results.loc[in_progress_results['WTeamPossible'] != True, 'PointsPossible'] = 0

in_progress_results['PointsPerfect'] = round_weight


# total points possible = total earned in games played + total still possible in games not played

total_earned = 0
total_points_possible = 0
# print(...) with a single pre-formatted string prints the same text on
# Python 2 and Python 3.
print('\t'.join(['Round', 'Pts_Earned', 'Pts_Possible', 'Tot_Earned', 'Tot_Possible']))
for rnd in range(1, 6+1):
    this_round = in_progress_results['Round'] == rnd
    earned   = in_progress_results.loc[played  & this_round, 'PointsEarned'].sum()
    possible = in_progress_results.loc[~played & this_round, 'PointsPossible'].sum()

    total_earned += earned
    total_points_possible += earned + possible

    print(('{}\t'*5).format(rnd, earned, possible, total_earned, total_points_possible))

print('')

# Upset summary for round 1 only.
in_round = (in_progress_results['Round'] == 1)
is_upset = (in_progress_results['Upset'] == True)
is_predicted_upset = (in_progress_results['PredictedUpset'] == True)
predicted_eq_actual = (in_progress_results['PredictedUpset'] == in_progress_results['Upset'])

print('predicted upsets: {}'.format((in_round & is_predicted_upset).sum()))
print('actual upsets: {}'.format((in_round & is_upset).sum()))
print('correctly predicted upsets: {}'.format((in_round & is_upset & predicted_eq_actual).sum()))

Round	Pts_Earned	Pts_Possible	Tot_Earned	Tot_Possible
1	23	0	23	23	
2	0	30	23	53	
3	0	28	23	81	
4	0	32	23	113	
5	0	32	23	145	
6	0	32	23	177	

predicted upsets: 2
actual upsets: 9
correctly predicted upsets: 1


predicted upsets: 2
actual upsets: 9
correctly predicted upsets: 1
