In [1]:
import pandas

In [2]:
input_file = 'ms_to_kg_team_map.csv'
ms_kg_teams = pandas.read_csv(input_file, sep='|').drop(['Unnamed: 0', 'index'], axis=1)

ms_kg_teams[:5]

Unnamed: 0,name,team,Season,TeamName,TeamID
0,Abilene_Chr,1,2014,Abilene Chr,1101
1,Abilene_Chr,1,2015,Abilene Chr,1101
2,Abilene_Chr,1,2016,Abilene Chr,1101
3,Abilene_Chr,1,2017,Abilene Chr,1101
4,Abilene_Chr,1,2018,Abilene Chr,1101


In [3]:
seasons_file = 'data/kaggle_2018/DataFiles/Seasons.csv'
seasons = pandas.read_csv(seasons_file)

NCAA_games_file = 'data/kaggle_2018/DataFiles/NCAATourneyCompactResults.csv'
NCAA_games = pandas.read_csv(NCAA_games_file)

seasons[['Season', 'DayZero']][:5]

Unnamed: 0,Season,DayZero
0,1985,10/29/1984
1,1986,10/28/1985
2,1987,10/27/1986
3,1988,11/2/1987
4,1989,10/31/1988


In [4]:
from datetime import datetime, timedelta

season_dates = NCAA_games[['Season', 'DayNum']].groupby(['Season']).min().reset_index()
season_dates = season_dates.merge(seasons[['Season', 'DayZero']], on='Season')
season_dates['DayZero'] = pandas.to_datetime(season_dates['DayZero'])
season_dates['DayNum'] = pandas.to_timedelta(season_dates['DayNum'], unit='D')
season_dates['NCAA_start'] = season_dates['DayZero'] + season_dates['DayNum']  
season_dates = season_dates.drop('DayNum', axis=1)

season_dates = season_dates.append(pandas.DataFrame({'Season': [2018], 
                                                     'DayZero': datetime(2017, 11, 1), 
                                                     'NCAA_start': datetime(2018, 3, 13)
                                                     }))

season_dates[-5:]

Unnamed: 0,DayZero,NCAA_start,Season
29,2013-11-04,2014-03-18,2014
30,2014-11-03,2015-03-17,2015
31,2015-11-02,2016-03-15,2016
32,2016-10-31,2017-03-14,2017
0,2017-11-01,2018-03-13,2018


In [5]:
team_streaks = None

for streak_len in [2, 3, 4, 6, 8, 10]:
    for season in range(2010, 2018+1):
        print streak_len, season

        games = pandas.read_csv('data/game_scores/games_{}.csv'.format(season)).drop('Unnamed: 0', axis=1)
        teams = pandas.read_csv('data/game_scores/teams_{}.csv'.format(season)).drop('Unnamed: 0', axis=1)

        games = games.merge(teams, left_on='team1', right_on='team') \
                     .drop('team', axis=1) \
                     .rename(index=str, columns={'name': 'team1_name'}) \
                     .merge(teams, left_on='team2', right_on='team') \
                     .drop('team', axis=1) \
                     .rename(index=str, columns={'name': 'team2_name'})

        games = games.sort_values('date')

        min_date = int('{}0101'.format(season))

        dt_fmt = '%Y-%m-%d'
        dt = datetime.strptime(str(season_dates[season_dates['Season'] == season]['NCAA_start'].values[0]).split('T')[0], dt_fmt)
        dt_fmt = '%Y%m%d'
        max_dt = min(dt, datetime.now()) - timedelta(days=1)
        max_date = int(max_dt.strftime(dt_fmt))

        for idx, row in ms_kg_teams[ms_kg_teams['Season'] == season].iterrows():
            tm = row['name']

            valid_games = ((games['team1_name'] == tm) | (games['team2_name'] == tm)) & \
                          ((games['date'] >= min_date) & (games['date'] <= max_date))

            team_record = games[valid_games].sort_values('date')

            all_rows = (team_record['date'] > 0)
            team_record.loc[all_rows, 'outcome'] = '-'

            tm_is_team1 = (team_record['team1_name'] == tm)
            tm_is_team2 = (team_record['team2_name'] == tm)
            team1_is_winner = (team_record['team1_score'] > team_record['team2_score'])
            team2_is_winner = (team_record['team1_score'] < team_record['team2_score'])

            outcome = tm_is_team1 & team1_is_winner
            team_record.loc[outcome, 'outcome'] = 1
            outcome = tm_is_team2 & team2_is_winner
            team_record.loc[outcome, 'outcome'] = 1
            outcome = tm_is_team1 & team2_is_winner
            team_record.loc[outcome, 'outcome'] = 0
            outcome = tm_is_team2 & team1_is_winner
            team_record.loc[outcome, 'outcome'] = 0

            n_games = streak_len

            try:
                team_record.loc[all_rows, 'streak'] = team_record['outcome'].rolling(window=n_games).sum() / float(n_games)
            except:
                print tm
                print team_record
                raise

            # print tm, map(list, team_record[team_record['date'] == team_record['date'].max()][['date', 'streak']].values)[0]

            streak_row = team_record[team_record['date'] == team_record['date'].max()][['streak']]
            streak_row['TeamName'] = row['TeamName']
            streak_row['TeamID'] = row['TeamID']
            streak_row['Season'] = row['Season']
            streak_row['StreakLen'] = streak_len

            if team_streaks is None:
                team_streaks = streak_row.copy()
            else:
                team_streaks = team_streaks.append(streak_row.copy())

team_streaks

2 2010
2 2011
2 2012
2 2013
2 2014
2 2015
2 2016
2 2017
2 2018
3 2010
3 2011
3 2012
3 2013
3 2014
3 2015
3 2016
3 2017
3 2018
4 2010
4 2011
4 2012
4 2013
4 2014
4 2015
4 2016
4 2017
4 2018
6 2010
6 2011
6 2012
6 2013
6 2014
6 2015
6 2016
6 2017
6 2018
8 2010
8 2011
8 2012
8 2013
8 2014
8 2015
8 2016
8 2017
8 2018
10 2010
10 2011
10 2012
10 2013
10 2014
10 2015
10 2016
10 2017
10 2018


Unnamed: 0,streak,TeamName,TeamID,Season,StreakLen
4915,0.5,Air Force,1102,2010,2
1184,0.5,Akron,1103,2010,2
2374,0.5,Alabama,1104,2010,2
2517,0.5,Alabama A&M,1105,2010,2
2883,0.5,Alabama St,1106,2010,2
480,0.0,Albany NY,1107,2010,2
5366,0.5,Alcorn St,1108,2010,2
3993,0.5,American Univ,1110,2010,2
4951,0.5,Appalachian St,1111,2010,2
1143,0.5,Arizona,1112,2010,2


In [6]:
output_file = 'composite_streak_data.csv'
team_streaks.to_csv(output_file, sep='|')

In [7]:
team_streaks

Unnamed: 0,streak,TeamName,TeamID,Season,StreakLen
4915,0.5,Air Force,1102,2010,2
1184,0.5,Akron,1103,2010,2
2374,0.5,Alabama,1104,2010,2
2517,0.5,Alabama A&M,1105,2010,2
2883,0.5,Alabama St,1106,2010,2
480,0.0,Albany NY,1107,2010,2
5366,0.5,Alcorn St,1108,2010,2
3993,0.5,American Univ,1110,2010,2
4951,0.5,Appalachian St,1111,2010,2
1143,0.5,Arizona,1112,2010,2
