In [1]:
%matplotlib inline



In [2]:
import os
import pandas
import numpy
import sklearn
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor


In [3]:
# Load the Kaggle team table; later cells merge on TeamName to attach the
# Kaggle TeamID to each ranked team.
input_file_name = 'data/kaggle_2018/DataFiles/Teams.csv'
teams = pandas.read_csv(input_file_name)

# Preview the first five rows.
teams.head(5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2018
1,1102,Air Force,1985,2018
2,1103,Akron,1985,2018
3,1104,Alabama,1985,2018
4,1105,Alabama A&M,2000,2018


In [4]:
# Locate the prepared training table and the file listing its ranker columns.
data_path = os.path.realpath('data/prepared_data')
data_file_name = 'combined_rankers.csv'
data_full_path = os.path.join(data_path, data_file_name)

historical_ranker_list_file_name = 'all_valid_rankers.txt'

# Materialise the ranker names as a real list.  They are consumed here to
# build the dtype schema and again by the per-season cell, so a one-shot
# iterator (what `map` returns on Python 3) would be empty on its second use.
# Blank lines are skipped so they cannot turn into bogus column names.
with open(os.path.join(data_path, historical_ranker_list_file_name), 'r') as input_file:
    historical_ranker_list = [line.strip() for line in input_file if line.strip()]

# Column dtypes: three fixed columns plus one float column per ranker.
schema = {'name': str,
          'label': float,
          'date': int
          }
schema.update({rnk: float for rnk in historical_ranker_list})

# 'Unnamed: 0' (presumably an index column saved with the CSV -- confirm) is
# dropped immediately after loading.
data = pandas.read_csv(data_full_path,
                       header=0,
                       dtype=schema) \
             .drop('Unnamed: 0', axis=1)
    

In [5]:

# For every season: fit an AdaBoost regressor on the rankers that season's
# ranking file shares with the training table, predict a model rank per team,
# attach Kaggle TeamIDs, normalise, and finally stack all seasons into
# `combined_df`.

# Ranking-file team names whose spelling differs from the Kaggle `TeamName`.
# Loop-invariant, so it is built once rather than once per season.
team_name_map = {'UC Santa Barbara': 'Santa Barbara',
                 'Cal Poly': 'Cal Poly SLO',
                 'SIUE': 'Edwardsville',
                 'W Kentucky': 'WKU',
                 'Middle Tenn St': 'MTSU',
                 'N Arizona': 'Northern Arizona',
                 'IUPU Ft Wayne': 'IPFW',
                 'Utah Val St': 'Utah Valley',
                 'TX Pan American': 'UTRGV',
                 'MD Baltimore Co': 'UMBC'
                 }

# Fixed seed so the boosted trees, and everything derived from them, are
# reproducible from one run of the notebook to the next.
MODEL_SEED = 0

# The regression target is the same for every season; only the usable
# feature columns change.
y = data['label']

# Per-season frames are collected here and concatenated once after the loop
# (repeated DataFrame.append copies the accumulated frame every iteration and
# no longer exists in pandas 2).
season_frames = []

for season in range(2010, 2018+1):
    print(season)

    if season == 2018:
        # The 2018 file is comma-separated and lives in a different directory.
        #latest_rankings_file_name = 'data/massey_2018/rankings_through_20180304.clean.csv'
        latest_rankings_file_name = 'data/massey_2018/rankings_through_20180311.clean.csv'
        latest_rankings = pandas.read_csv(latest_rankings_file_name).drop('Unnamed: 0', axis=1)
    else:
        # Earlier seasons use pipe-separated composite ranking files.
        latest_rankings_file_name = 'data/rankings/{}_composite_rankings.clean.csv'.format(season)
        latest_rankings = pandas.read_csv(latest_rankings_file_name, sep='|').drop('Unnamed: 0', axis=1)

    # Only rankers present in both the training table and this season's file
    # can be used as features.  Sorted so the column order does not depend on
    # set iteration order.
    ranker_list = sorted(set(historical_ranker_list).intersection(set(latest_rankings.columns)))

    regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                             n_estimators=500,
                             random_state=MODEL_SEED)

    X = data[ranker_list]

    regr.fit(X, y)

    latest_model_rank = regr.predict(latest_rankings[ranker_list])
    model_rank = latest_rankings[['Team']].join(pandas.DataFrame({'rank': latest_model_rank}))

    # Rewrite ranking-file spellings to the Kaggle spellings before merging.
    for name, replacement in team_name_map.items():
        model_rank.loc[model_rank['Team'] == name, 'Team'] = replacement

    # Left merge keeps every ranked team; unmatched names get a null TeamID.
    df = model_rank.merge(teams[['TeamID', 'TeamName']], left_on='Team', right_on='TeamName', how='left') \
                   .drop('TeamName', axis=1) \
                   .sort_values('rank')

    # double check that we captured all of the teams
    missing_team_ids = df[df['TeamID'].isnull()]
    if missing_team_ids.shape[0] > 0:
        print('Teams missing TeamID...')
        print(missing_team_ids)

    # Normalise against the span of the 64 best (lowest) predicted ranks:
    # the best team maps to 0 and the 64th to 1.  `df` is already sorted by
    # rank, so a positional slice selects those rows.
    top_ranks = df['rank'].iloc[:64]
    z_min = top_ranks.min()
    z_max = top_ranks.max()
    z = z_max - z_min

    df['norm_rank'] = ((df['rank'] - z_min)/z)

    df['Season'] = season

    season_frames.append(df)

combined_df = pandas.concat(season_frames)

2010
2011
2012
2013
2014
2015
2016
2017
2018


In [10]:
# Preview the first eight rows of the 2018 ranking.
# NOTE: the saved output includes a `noise` column, which is only added by the
# export cell further down -- this preview was last executed after that cell.
combined_df.loc[combined_df['Season'] == 2018].head(8)

Unnamed: 0,Team,rank,TeamID,norm_rank,Season,noise
0,Villanova,10.388889,1437,0.0,2018,-0.020899
1,Virginia,10.388889,1438,0.0,2018,0.128211
3,Cincinnati,10.666667,1153,0.00511,2018,-0.008366
5,Michigan St,11.0,1277,0.011243,2018,-0.467991
6,Kansas,11.25,1242,0.015842,2018,-0.105488
2,Duke,12.0,1181,0.02964,2018,-0.46272
7,North Carolina,12.5,1314,0.038838,2018,-0.373332
4,Purdue,12.75,1345,0.043437,2018,-0.126075


In [7]:
# Helper for tracking down teams whose spelling differs between sources:
# print, in alphabetical order, every Kaggle team name that contains the
# fragment below (case-insensitive).
fragment = 'IUP'.lower()
for team_name in teams['TeamName'].sort_values():
    if fragment in team_name.lower():
        print(team_name)


IUPUI


In [8]:
# Output locations.  `bracket_file_name` is only referenced by the disabled
# compact export below.
bracket_file_name = 'bracket.poll_of_polls.csv'
full_bracket_file_name = 'full_bracket.poll_of_polls.csv'

# Per-row tie-break value, uniform on [-0.5, 0.5) (no further scaling is
# applied here).  Drawn from a seeded generator so that re-running the
# notebook writes an identical file.
NOISE_SEED = 0
noise_rng = numpy.random.RandomState(NOISE_SEED)
combined_df['noise'] = noise_rng.random_sample(combined_df.shape[0]) - 0.5

# df[['TeamID', 'rank']].to_csv(bracket_file_name, header=False, sep=' ', index=False)
# Write every season's ranking as a pipe-separated file with a header row.
combined_df.to_csv(full_bracket_file_name, header=True, sep='|', index=False)