In [1]:
import pandas as pd
import pickle

### Import data

In [2]:
# Single place to repoint the notebook at the project's data directory;
# every pickle below is read from here.
DATA_DIR = '/Users/sec/galvanize/bracket_buster/data'

# Filename pattern of the rolling-window gamelog pickles; the placeholder is
# the number of games in the rolling window.
ROLLING_GAMELOG_PATH = DATA_DIR + '/gamelog_data_{}_game_rolling.pkl'

# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only load files
# that were produced by this project (never untrusted pickles).
gl2groll = pd.read_pickle(ROLLING_GAMELOG_PATH.format(2))
gl3groll = pd.read_pickle(ROLLING_GAMELOG_PATH.format(3))
gl4groll = pd.read_pickle(ROLLING_GAMELOG_PATH.format(4))
gl5groll = pd.read_pickle(ROLLING_GAMELOG_PATH.format(5))
gl6groll = pd.read_pickle(ROLLING_GAMELOG_PATH.format(6))
gl7groll = pd.read_pickle(ROLLING_GAMELOG_PATH.format(7))
team_exp = pd.read_pickle(DATA_DIR + '/team_experience.pkl')
team_clust = pd.read_pickle(DATA_DIR + '/team_clusters.pkl')

### Functions

In [3]:
def matchup_unique_id(row):
    '''
    Tag a gamelog row with order-independent matchup keys.

    INPUT: row with 'Tm', 'Opp' and 'Date' entries
    OUTPUT: the same row with two entries added:
        'matchup' - the two team names sorted and joined with a comma
        'ID'      - 'matchup', a comma, then the row's 'Date'
    '''
    # Sorting the pair makes both teams' rows of one game share the same key.
    teams = [row['Tm'], row['Opp']]
    teams.sort()
    matchup = ','.join(teams)

    row['matchup'] = matchup
    row['ID'] = matchup + ',' + format(row['Date'])
    return row

In [4]:
def matchup_merge_original(df):
    '''
    Collapse a gamelog with one row per team per game into one row per game.

    INPUT: DataFrame of gamelog rows containing at least the columns
           'Tm', 'Opp', 'Date', 'W', 'GameType' and 'Ws'
    OUTPUT: DataFrame with matching IDs merged to same row (1 game per row!).
            The first row seen for a game keeps its column names; the second
            row's remaining columns are prefixed with 'OP'. Rows with any
            missing value (e.g. no second row for the game) are dropped.
    '''
    # Tag every row with its 'matchup' and per-game 'ID'.
    tagged = df.apply(matchup_unique_id, axis=1)

    # Number the rows that share an ID: 1 for the first occurrence, 2 for the
    # second, and so on. Occurrences beyond the second are never selected.
    tagged['count'] = tagged.groupby('ID').cumcount() + 1
    first_rows = tagged[tagged['count'] == 1]
    second_rows = tagged[tagged['count'] == 2]

    # Keep only the second row's stats (plus the ID used as the merge key) and
    # prefix them with 'OP' so they do not collide with the first row's columns.
    second_rows = second_rows.drop(
        ['Date', 'Opp', 'W', 'GameType', 'Ws', 'matchup', 'count'], axis=1)
    second_rows.columns = [
        col if col == 'ID' else 'OP{}'.format(col)
        for col in second_rows.columns.tolist()
    ]

    # Left merge: first rows without a matching second row get NaN in the
    # 'OP' columns and are removed by the dropna below.
    merged = pd.merge(first_rows, second_rows, how='left', on='ID')

    # Drop the bookkeeping / identifier columns that are no longer needed.
    helper_cols = ['Date', 'Ws', 'Opp', 'count', 'ID', 'matchup', 'Tm', 'OPTm']
    return merged.drop(helper_cols, axis=1).dropna()

In [5]:
def gamelog_ID(row):
    '''
    Add 'Season' and 'ID' entries to a gamelog row.

    INPUT: row with 'GameType' and 'Tm' entries
    OUTPUT: the same row with
        'Season' - the last four characters of 'GameType'
        'ID'     - 'Tm' and 'Season' joined with a comma
    '''
    season = row['GameType'][-4:]
    row['Season'] = season
    row['ID'] = ','.join((row['Tm'], str(season)))
    return row

In [6]:
def ID(row):
    '''
    Add an 'ID' entry to a team-level row.

    INPUT: row with 'Team' and 'Season' entries
    OUTPUT: the same row with 'ID' set to 'Team' and the string form of
            'Season' joined with a comma
    '''
    id_parts = (row['Team'], str(row['Season']))
    row['ID'] = ','.join(id_parts)
    return row

In [19]:
def experience_merge(gamelog_df, experience_df):
    '''
    Attach team experience to every gamelog row, then collapse the gamelog
    to one row per game.

    INPUT: gamelog_df    - gamelog DataFrame (needs 'Tm' and 'GameType' plus
                           the columns matchup_merge_original relies on)
           experience_df - experience DataFrame with 'Team' and 'Season'
                           columns plus the experience feature column(s)
    OUTPUT: DataFrame with matching IDs merged to same row (1 game per row!),
            including the experience feature(s) for the team and, prefixed
            with 'OP', for the opponent
    '''
    # Key both frames by "<team>,<season>". The helper columns that were only
    # needed to build (or are now encoded in) the key are dropped right away.
    gamelog_keyed = gamelog_df.apply(gamelog_ID, axis=1).drop(['Season'], axis=1)
    experience_keyed = experience_df.apply(ID, axis=1).drop(['Season', 'Team'], axis=1)

    # Left merge so gamelog rows without experience data are kept here; they
    # carry NaN and are removed by the dropna in matchup_merge_original.
    with_experience = gamelog_keyed.merge(experience_keyed, on='ID', how='left')

    # The pairing of the two rows of each game is identical to the plain
    # gamelog case, so reuse it instead of repeating the same steps. It
    # overwrites the team/season 'ID' with the per-game matchup ID.
    return matchup_merge_original(with_experience)

In [21]:
def cluster_experience_merge(gamelog_df, experience_df, cluster_df):
    '''
    Attach team experience and team cluster features to every gamelog row,
    then collapse the gamelog to one row per game.

    INPUT: gamelog_df    - gamelog DataFrame (needs 'Tm' and 'GameType' plus
                           the columns matchup_merge_original relies on)
           experience_df - experience DataFrame with 'Team' and 'Season'
                           columns plus the experience feature column(s)
           cluster_df    - cluster DataFrame with 'Team' and 'Season'
                           columns plus the cluster feature columns
    OUTPUT: DataFrame with matching IDs merged to same row (1 game per row!),
            including the experience and cluster features for the team and,
            prefixed with 'OP', for the opponent
    '''
    # Key all three frames by "<team>,<season>". The helper columns that were
    # only needed to build (or are now encoded in) the key are dropped.
    gamelog_keyed = gamelog_df.apply(gamelog_ID, axis=1).drop(['Season'], axis=1)
    experience_keyed = experience_df.apply(ID, axis=1).drop(['Season', 'Team'], axis=1)
    cluster_keyed = cluster_df.apply(ID, axis=1).drop(['Season', 'Team'], axis=1)

    # Left merges so gamelog rows without experience/cluster data are kept
    # here; they carry NaN and are removed by the dropna in
    # matchup_merge_original.
    with_features = (
        gamelog_keyed
        .merge(experience_keyed, on='ID', how='left')
        .merge(cluster_keyed, on='ID', how='left')
    )

    # The pairing of the two rows of each game is identical to the plain
    # gamelog case, so reuse it instead of repeating the same steps. It
    # overwrites the team/season 'ID' with the per-game matchup ID.
    return matchup_merge_original(with_features)

### Plan:
1. gamelog_data with all rolling averages **without** clusters and experience feature _**(check)**_
2. gamelog_data with all rolling averages **with** experience feature but **without** clusters
3. gamelog_data with all rolling averages **with** clusters and experience features

#### 1. gamelog_data with all rolling averages **without** clusters and experience feature _**(done)**_

In [9]:
# Preview the first rows of the 2-game rolling gamelog before pairing matchups.
gl2groll.head()

Unnamed: 0,Date,Opp,W,sos,GameType,Ws,Wp,ppg,pApg,FGp,3Pp,FTp,ORBpg,RBpg,ASTpg,STLpg,BLKpg,TOpg,PFpg,Tm
0,2014-01-09,new-orleans,0,-4.12,season2014,2.0,0.2,57.0,75.5,0.358,0.2835,0.6835,6.0,20.0,10.0,6.5,3.5,8.0,23.0,abilene-christian
1,2014-01-11,southeastern-louisiana,0,-4.12,season2014,6.0,0.4,90.0,68.0,0.5065,0.415,0.82,9.5,37.5,22.0,7.5,1.5,11.5,23.5,abilene-christian
2,2014-01-16,oral-roberts,0,-4.12,season2014,6.0,0.375,79.0,86.0,0.3885,0.336,0.703,9.0,38.5,14.0,7.0,0.5,9.0,27.0,abilene-christian
3,2014-01-18,central-arkansas,1,-4.12,season2014,6.0,0.352941,68.0,83.5,0.3685,0.434,0.7875,8.5,32.5,6.5,5.0,1.5,10.5,19.5,abilene-christian
4,2014-01-23,lamar,0,-4.12,season2014,7.0,0.388889,66.0,77.0,0.438,0.4285,0.8195,7.0,23.5,8.0,7.5,2.0,14.5,17.5,abilene-christian


In [10]:
# Collapse each rolling-window gamelog (2- through 7-game windows) to one row
# per game; the results are unpacked in the same order as the inputs.
(gl2groll_matchups,
 gl3groll_matchups,
 gl4groll_matchups,
 gl5groll_matchups,
 gl6groll_matchups,
 gl7groll_matchups) = map(
    matchup_merge_original,
    (gl2groll, gl3groll, gl4groll, gl5groll, gl6groll, gl7groll),
)

In [11]:
# Preview the one-row-per-game result for the 2-game rolling window.
gl2groll_matchups.head()

Unnamed: 0,W,sos,GameType,Wp,ppg,pApg,FGp,3Pp,FTp,ORBpg,...,OPFGp,OP3Pp,OPFTp,OPORBpg,OPRBpg,OPASTpg,OPSTLpg,OPBLKpg,OPTOpg,OPPFpg
0,0,-4.12,season2014,0.2,57.0,75.5,0.358,0.2835,0.6835,6.0,...,0.4065,0.1385,0.666,14.0,42.0,4.5,8.0,1.5,21.5,29.0
1,0,-4.12,season2014,0.4,90.0,68.0,0.5065,0.415,0.82,9.5,...,0.4455,0.308,0.699,5.5,30.5,8.5,4.0,3.5,12.0,20.5
2,0,-4.12,season2014,0.375,79.0,86.0,0.3885,0.336,0.703,9.0,...,0.459,0.3705,0.689,13.0,38.0,11.0,3.0,5.5,11.0,21.0
3,1,-4.12,season2014,0.352941,68.0,83.5,0.3685,0.434,0.7875,8.5,...,0.4055,0.5,0.6825,9.5,29.0,10.0,5.5,3.5,12.5,24.5
4,0,-4.12,season2014,0.388889,66.0,77.0,0.438,0.4285,0.8195,7.0,...,0.4095,0.375,0.694,9.5,30.5,8.5,5.5,2.0,15.5,19.5


In [12]:
# Element-wise comparison of the column labels of the 2-game and 3-game
# matchup frames (True where the labels at the same position match).
gl2groll_matchups.columns == gl3groll_matchups.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [13]:
# Count the matchup rows per GameType value.
gl2groll_matchups.GameType.value_counts()

season2017     2618
season2018     2616
season2016     2614
season2014     2592
season2015     2587
tourney2015      59
tourney2018      52
tourney2016      51
tourney2014      46
tourney2017      42
Name: GameType, dtype: int64

Write to pkl files

In [14]:
# Saving the matchup frames is meant to be done in a script rather than in
# this notebook; the calls below are kept, disabled, as a reference for the
# intended output file names.
# gl2groll_matchups.to_pickle('model_data/matchups_2gameroll.pkl')
# gl3groll_matchups.to_pickle('model_data/matchups_3gameroll.pkl')
# gl4groll_matchups.to_pickle('model_data/matchups_4gameroll.pkl')
# gl5groll_matchups.to_pickle('model_data/matchups_5gameroll.pkl')
# gl6groll_matchups.to_pickle('model_data/matchups_6gameroll.pkl')
# gl7groll_matchups.to_pickle('model_data/matchups_7gameroll.pkl')

#### 2. gamelog_data with all rolling averages **with** experience feature but **without** clusters

In [15]:
# Apply gamelog_ID row by row to add the 'Season' and 'ID' columns to the
# 2-game rolling gamelog, then preview the result.
gldf = gl2groll.apply(gamelog_ID, axis=1)
gldf.head()

Unnamed: 0,Date,Opp,W,sos,GameType,Ws,Wp,ppg,pApg,FGp,...,ORBpg,RBpg,ASTpg,STLpg,BLKpg,TOpg,PFpg,Tm,Season,ID
0,2014-01-09,new-orleans,0,-4.12,season2014,2.0,0.2,57.0,75.5,0.358,...,6.0,20.0,10.0,6.5,3.5,8.0,23.0,abilene-christian,2014,"abilene-christian,2014"
1,2014-01-11,southeastern-louisiana,0,-4.12,season2014,6.0,0.4,90.0,68.0,0.5065,...,9.5,37.5,22.0,7.5,1.5,11.5,23.5,abilene-christian,2014,"abilene-christian,2014"
2,2014-01-16,oral-roberts,0,-4.12,season2014,6.0,0.375,79.0,86.0,0.3885,...,9.0,38.5,14.0,7.0,0.5,9.0,27.0,abilene-christian,2014,"abilene-christian,2014"
3,2014-01-18,central-arkansas,1,-4.12,season2014,6.0,0.352941,68.0,83.5,0.3685,...,8.5,32.5,6.5,5.0,1.5,10.5,19.5,abilene-christian,2014,"abilene-christian,2014"
4,2014-01-23,lamar,0,-4.12,season2014,7.0,0.388889,66.0,77.0,0.438,...,7.0,23.5,8.0,7.5,2.0,14.5,17.5,abilene-christian,2014,"abilene-christian,2014"


In [16]:
# Apply ID row by row to add the 'ID' column to the team experience frame,
# then preview the result.
team_exp_test = team_exp.apply(ID, axis=1)
team_exp_test.head()

Unnamed: 0,Team,Season,exp_factor,ID
0,abilene-christian,2014,2.180717,"abilene-christian,2014"
1,abilene-christian,2015,2.768387,"abilene-christian,2015"
2,abilene-christian,2016,2.6672,"abilene-christian,2016"
3,abilene-christian,2017,2.155593,"abilene-christian,2017"
4,abilene-christian,2018,2.977984,"abilene-christian,2018"


In [20]:
# Run the experience merge on the 2-game rolling gamelog and list the
# columns of the resulting one-row-per-game frame.
emt = experience_merge(gl2groll, team_exp)
emt.columns

Index(['W', 'sos', 'GameType', 'Wp', 'ppg', 'pApg', 'FGp', '3Pp', 'FTp',
       'ORBpg', 'RBpg', 'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg',
       'exp_factor', 'OPsos', 'OPWp', 'OPppg', 'OPpApg', 'OPFGp', 'OP3Pp',
       'OPFTp', 'OPORBpg', 'OPRBpg', 'OPASTpg', 'OPSTLpg', 'OPBLKpg', 'OPTOpg',
       'OPPFpg', 'OPexp_factor'],
      dtype='object')

save to pickle files

#### 3. gamelog_data with all rolling averages **with** clusters and experience features

In [22]:
# Run the cluster + experience merge on the 2-game rolling gamelog and list
# the columns of the resulting one-row-per-game frame.
clm = cluster_experience_merge(gl2groll, team_exp, team_clust)
clm.columns

Index(['W', 'sos', 'GameType', 'Wp', 'ppg', 'pApg', 'FGp', '3Pp', 'FTp',
       'ORBpg', 'RBpg', 'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg',
       'exp_factor', 'C0', 'C1', 'C2', 'F0', 'F1', 'F2', 'G0', 'G1', 'G2',
       'G3', 'OPsos', 'OPWp', 'OPppg', 'OPpApg', 'OPFGp', 'OP3Pp', 'OPFTp',
       'OPORBpg', 'OPRBpg', 'OPASTpg', 'OPSTLpg', 'OPBLKpg', 'OPTOpg',
       'OPPFpg', 'OPexp_factor', 'OPC0', 'OPC1', 'OPC2', 'OPF0', 'OPF1',
       'OPF2', 'OPG0', 'OPG1', 'OPG2', 'OPG3'],
      dtype='object')