In [None]:
def get_def_off(df):
    """Add offensive and defensive efficiency ratings to a box-score frame.

    Possessions are estimated from Team1's box score as
    ``0.96 * (FGA + TO + 0.44 * FTA - OR)``. ``OffRtg`` is Team1's points per
    100 of those possessions; ``DefRtg`` is Team2's points per 100 of the same
    estimate (opponent possessions are not computed separately).

    Args:
        df: DataFrame with ``Team1_FGA``, ``Team1_TO``, ``Team1_FTA``,
            ``Team1_OR``, ``Team1_score`` and ``Team2_score`` columns.

    Returns:
        The same ``df`` object, modified in place, with ``OffRtg`` and
        ``DefRtg`` columns added. The intermediate ``Pos`` column is removed
        before returning.
    """
    # Column arithmetic instead of a row-wise ``apply``: same formula, and it
    # also works on an empty frame.
    df['Pos'] = 0.96 * (df['Team1_FGA'] + df['Team1_TO']
                        + 0.44 * df['Team1_FTA'] - df['Team1_OR'])
    # Offensive efficiency (OffRtg) = 100 x (Points / Possessions)
    df['OffRtg'] = 100 * (df['Team1_score'] / df['Pos'])
    # Defensive efficiency (DefRtg) = 100 x (Opponent points / Possessions)
    df['DefRtg'] = 100 * (df['Team2_score'] / df['Pos'])
    # ``drop`` returns a new frame unless ``inplace=True``; the previous call
    # discarded that result, so ``Pos`` was never actually removed.
    df.drop('Pos', axis=1, inplace=True)
    return df


In [1]:
def get_cols_for_both_teams(df, cols):
    """Expose the given per-game columns for both sides of each game.

    Every column in ``cols`` is renamed to ``Team1_<col>``. The same frame,
    with its ``Team1``/``Team2`` labels swapped, is then left-joined back on
    ``Season``, ``DayNum`` and ``Team2`` so the row recorded from the
    opponent's point of view supplies the ``Team2_<col>`` values.

    Args:
        df: Frame with ``Season``, ``DayNum``, ``Team1``, ``Team2`` and every
            column named in ``cols``. It is not modified.
        cols: List of column names to duplicate for both teams (must be a
            list, since it is concatenated with the join keys).

    Returns:
        A new DataFrame with ``Team1_<col>`` and ``Team2_<col>`` columns in
        place of each original ``<col>``.
    """
    join_keys = ['Season', 'DayNum', 'Team2']

    # Team1's own numbers, tagged with the Team1_ prefix.
    own_side = df.rename(columns={name: 'Team1_' + name for name in cols})

    # Swapping the team labels makes each row describe the game as seen by
    # the other team, so its ``Team2`` holds the team the stats belong to.
    swapped = df.rename(columns={'Team2': 'Team1', 'Team1': 'Team2'})
    opponent_side = swapped[cols + join_keys]

    both = own_side.merge(opponent_side, left_on=join_keys,
                          right_on=join_keys, how='left')

    # Whatever came in un-prefixed from the swapped frame is Team2's.
    return both.rename(columns={name: 'Team2_' + name for name in cols})


In [None]:
#Generate predicted oe/de features -- Only for final prediction 
def get_oe_de_features(tourney_games, season_games):
    """Attach end-of-season efficiency metrics and model predictions.

    Each team's rows from its last ``DayNum`` of a season are taken from
    ``season_games`` and joined onto the matchups once for ``Team1``
    (``t1_*`` columns) and once for ``Team2`` (``t2_*`` columns). The
    module-level ``oe_model`` and ``de_model`` are then asked for predictions
    for each side.

    Args:
        tourney_games: Matchup frame with ``Team1``, ``Team2`` and ``Season``.
            NOTE: the ``H``, ``A`` and ``N`` columns are written onto this
            object before any merge, so the caller's frame is modified.
        season_games: Frame with ``Team1``, ``Season``, ``DayNum`` and the
            eight ``avg_*`` metric columns listed below.

    Returns:
        A new DataFrame with the ``t1_*``/``t2_*`` metrics and the
        ``t1_pred_oe``, ``t1_pred_de``, ``t2_pred_oe``, ``t2_pred_de``
        columns. Both joins use pandas' default join type, so matchups
        without end-of-season rows for either team are dropped. The Team2
        join also leaves its right-hand key behind (``Team1_y``, given that
        only ``Team1_x`` is renamed back).
    """
    # Every matchup is flagged as a neutral-site game.
    tourney_games['H'] = 0
    tourney_games['A'] = 0
    tourney_games['N'] = 1

    # Keep only the rows from each team's final DayNum of the season.
    last_day = season_games.groupby(['Team1', 'Season'])['DayNum'].max().reset_index()
    eoy_features = season_games.merge(last_day, on=['Team1', 'Season', 'DayNum'])

    metrics = ['avg_oe', 'avg_de',
               'avg_opp_avg_oe', 'avg_opp_avg_de',
               'avg_opp_avg_opp_avg_oe', 'avg_opp_avg_opp_avg_de',
               'avg_opp_avg_opp_avg_opp_avg_oe', 'avg_opp_avg_opp_avg_opp_avg_de']
    eoy_subset = eoy_features[['Team1', 'Season'] + metrics]

    # Team1 metrics
    tourney_games = tourney_games.merge(eoy_subset, on=['Team1', 'Season'])
    tourney_games = tourney_games.rename(
        columns={metric: 't1_' + metric for metric in metrics})

    # Team2 metrics: the lookup table is keyed by ``Team1``, so the matchup's
    # own ``Team1`` comes back suffixed and has to be restored.
    tourney_games = tourney_games.merge(eoy_subset,
                                        left_on=['Team2', 'Season'],
                                        right_on=['Team1', 'Season'])
    t2_names = {'Team1_x': 'Team1'}
    t2_names.update({metric: 't2_' + metric for metric in metrics})
    tourney_games = tourney_games.rename(columns=t2_names)

    # Regression inputs: site flags, all eight metrics of the team being
    # predicted, then only the first six metrics of the other team.
    site_flags = ['H', 'A', 'N']
    t1_all = ['t1_' + metric for metric in metrics]
    t2_all = ['t2_' + metric for metric in metrics]
    reg_feature_list_T1 = site_flags + t1_all + t2_all[:6]
    reg_feature_list_T2 = site_flags + t2_all + t1_all[:6]

    # Get predicted OE and DE for each side.
    for prefix, feature_names in (('t1', reg_feature_list_T1),
                                  ('t2', reg_feature_list_T2)):
        tourney_games[prefix + '_pred_oe'] = oe_model.predict(
            tourney_games[feature_names].values)
        tourney_games[prefix + '_pred_de'] = de_model.predict(
            tourney_games[feature_names].values)

    return tourney_games
    

In [None]:
def get_coachexp(df, team_coaches):
    """Attach each team's coach ``prevSeasons`` value to every matchup.

    Args:
        df: Frame with ``Season``, ``Team1`` and ``Team2`` columns.
        team_coaches: Frame with ``Season``, ``TeamID`` and ``prevSeasons``.

    Returns:
        A new DataFrame with ``Team1CoachExp`` and ``Team2CoachExp`` added
        via left joins (rows without a match get NaN). The right-hand
        ``TeamID`` key of each join is kept in the result; pandas suffixes
        the two copies when their names collide.
    """
    coach_lookup = team_coaches[['Season', 'TeamID', 'prevSeasons']]

    # Same lookup for both sides; only the left key and output name change.
    for side in ('Team1', 'Team2'):
        df = df.merge(coach_lookup, how='left',
                      left_on=['Season', side],
                      right_on=['Season', 'TeamID'])
        df = df.rename(columns={'prevSeasons': side + 'CoachExp'})

    return df

In [None]:
def get_seed_diff(tourney_games, seeds_df):
    """Attach numeric seeds for both teams and their difference.

    Seed strings have every non-digit character removed and are converted to
    numbers (e.g. ``'W01'`` -> ``1``, ``'X16a'`` -> ``16``).

    Args:
        tourney_games: Frame with ``Season``, ``Team1`` and ``Team2``.
        seeds_df: Frame with ``Season``, ``TeamID`` and a string ``Seed``
            column. All of its columns are merged in.

    Returns:
        A new DataFrame with ``Team1Seed``, ``Team2Seed`` and ``SeedDiff``
        (``Team1Seed - Team2Seed``). The columns are integer when every team
        has a seed; when a team has no seed row the left join yields a
        missing value, which is kept as NaN (float column) instead of
        raising. The right-hand ``TeamID`` key of each join is kept in the
        result; pandas suffixes the two copies when their names collide.
    """
    def _seed_to_number(seed_strings):
        # Vectorized equivalent of re.sub('[^0-9]', '', x) followed by int().
        digits = seed_strings.str.replace(r'[^0-9]', '', regex=True)
        if digits.isna().any():
            # Integers cannot hold NaN, so fall back to float for unseeded teams.
            return digits.astype(float)
        return digits.astype(int)

    tourney_games = tourney_games.merge(seeds_df, left_on=['Season', 'Team1'],
                                        right_on=['Season', 'TeamID'], how='left')
    tourney_games.rename(columns={'Seed': 'Team1Seed'}, inplace=True)
    tourney_games = tourney_games.merge(seeds_df, left_on=['Season', 'Team2'],
                                        right_on=['Season', 'TeamID'], how='left')
    tourney_games.rename(columns={'Seed': 'Team2Seed'}, inplace=True)

    for seed_col in ('Team1Seed', 'Team2Seed'):
        tourney_games[seed_col] = _seed_to_number(tourney_games[seed_col])
    tourney_games['SeedDiff'] = tourney_games['Team1Seed'] - tourney_games['Team2Seed']

    return tourney_games
    
    

In [None]:
def get_power_conf(tourney_games, power_conf):
    """Attach power-conference flags and a combined seed/conference code.

    Args:
        tourney_games: Frame with ``Season``, ``Team1``, ``Team2`` and
            ``SeedDiff`` columns.
        power_conf: Frame with ``Season`` and ``TeamID``, plus either a
            ``ConfAbbrev`` column (the ``PowerConf`` 0/1 flag is derived from
            it) or an existing ``PowerConf`` column. All of its columns are
            merged in. The argument itself is not modified.

    Returns:
        A new DataFrame with ``Team1PC``, ``Team2PC`` and ``SeedConf``.
        ``SeedConf`` is 1 when ``SeedDiff >= 0`` and ``Team1PC == 0``, 2 when
        ``SeedDiff >= 0`` and ``Team1PC == 1``, 3 when ``SeedDiff < 0`` and
        ``Team1PC == 0``, and 4 for every other row (including rows where
        either value is missing). The right-hand ``TeamID`` key of each join
        is kept in the result; pandas suffixes the two copies when their
        names collide.
    """
    power_abbrevs = ['acc', 'sec', 'big_twelve', 'pac_twelve',
                     'big_east', 'pac_ten', 'big_ten']

    # Derive the flag from the frame that was passed in. Previously this read
    # (and mutated) a global ``team_conferences``, ignoring the parameter.
    if 'ConfAbbrev' in power_conf.columns:
        power_conf = power_conf.assign(
            PowerConf=np.where(power_conf['ConfAbbrev'].isin(power_abbrevs), 1, 0))

    tourney_games = tourney_games.merge(power_conf, left_on=['Season', 'Team1'],
                                        right_on=['Season', 'TeamID'], how='left')
    tourney_games.rename(columns={'PowerConf': 'Team1PC'}, inplace=True)
    tourney_games = tourney_games.merge(power_conf, left_on=['Season', 'Team2'],
                                        right_on=['Season', 'TeamID'], how='left')
    tourney_games.rename(columns={'PowerConf': 'Team2PC'}, inplace=True)

    seed_non_negative = tourney_games['SeedDiff'] >= 0
    seed_negative = tourney_games['SeedDiff'] < 0
    team1_pc = tourney_games['Team1PC']
    # First matching condition wins; anything unmatched falls through to 4.
    tourney_games['SeedConf'] = np.select(
        [seed_non_negative & (team1_pc == 0),
         seed_non_negative & (team1_pc == 1),
         seed_negative & (team1_pc == 0)],
        [1, 2, 3],
        default=4)

    return tourney_games


In [None]:
def get_ordinal_movement(tourney_games, ranking):
    """Attach ranking movement and converted ratings for both teams.

    Args:
        tourney_games: Frame with ``Season``, ``Team1`` and ``Team2``.
        ranking: Frame with ``Season``, ``TeamID``, ``change10`` and
            ``conv_rtg`` columns.

    Returns:
        A new DataFrame with ``Team1Change10``, ``Team1conv_rtg``,
        ``Team2Change10``, ``Team2conv_rtg``, ``rating_diff``
        (``Team1conv_rtg - Team2conv_rtg``) and ``conv_rtg_pred``, computed
        as ``1 / (1 + 10 ** (-rating_diff / 12))``. Joins are left joins, so
        teams missing from ``ranking`` produce NaN. The right-hand ``TeamID``
        key of each join is kept in the result; pandas suffixes the two
        copies when their names collide.
    """
    rating_lookup = ranking[['Season', 'TeamID', 'change10', 'conv_rtg']]

    for side in ('Team1', 'Team2'):
        tourney_games = tourney_games.merge(rating_lookup, how='left',
                                            left_on=['Season', side],
                                            right_on=['Season', 'TeamID'])
        tourney_games = tourney_games.rename(
            columns={'change10': side + 'Change10',
                     'conv_rtg': side + 'conv_rtg'})

    tourney_games['rating_diff'] = (tourney_games['Team1conv_rtg']
                                    - tourney_games['Team2conv_rtg'])
    # Logistic curve on a base-10 scale with a divisor of 12.
    scaled_gap = -tourney_games['rating_diff'] / 12
    tourney_games['conv_rtg_pred'] = 1 / (1 + 10 ** scaled_gap)

    return tourney_games
    

In [None]:
def get_margin_features(tourney_games, ranking, all_games, goodRankThresh = 25, badRankThresh = 50, goodMargin = -7,
                       badMargin = 10):
    """Count each team's "good win" and "bad loss" games and attach them.

    Only rows of ``all_games`` with ``TourneyGame == 0`` are used. For each
    such game, from ``Team1``'s side:

    * ``GoodWin`` is 1 when the margin (``Team1_score - Team2_score``) is
      greater than ``goodMargin`` and the opponent's rank is at most
      ``goodRankThresh``. With the default ``goodMargin`` of -7, losses by
      fewer than 7 points also count.
    * ``BadLoss`` is 1 when the margin is less than ``badMargin`` and the
      opponent's rank is at least ``badRankThresh``. With the default
      ``badMargin`` of 10, wins by fewer than 10 points also count.

    Args:
        tourney_games: Frame with ``Season``, ``Team1`` and ``Team2``.
        ranking: Frame with ``Season``, ``TeamID`` and ``OrdinalRank``.
        all_games: Frame with ``Season``, ``Team1``, ``Team2``,
            ``Team1_score``, ``Team2_score`` and ``TourneyGame``.
        goodRankThresh: Highest opponent rank that qualifies for ``GoodWin``.
        badRankThresh: Lowest opponent rank that qualifies for ``BadLoss``.
        goodMargin: Margin that must be exceeded for ``GoodWin``.
        badMargin: Margin that must not be reached for ``BadLoss``.

    Returns:
        A new DataFrame with ``Team1GoodWin<goodRankThresh>``,
        ``Team1BadLoss<badRankThresh>`` and the matching ``Team2...`` columns
        added via left joins; teams with no qualifying rows get NaN.
    """
    # Add ranking to the season
    cols = ['Season', 'TeamID', 'OrdinalRank']
    season_games = all_games[all_games['TourneyGame'] == 0]
    df = season_games.merge(ranking[cols], left_on=['Season', 'Team1'],
                            right_on=['Season', 'TeamID'], how='left')
    df.rename(columns={'OrdinalRank': 'Team1Rank'}, inplace=True)
    df = df.merge(ranking[cols], left_on=['Season', 'Team2'],
                  right_on=['Season', 'TeamID'], how='left')
    df.rename(columns={'OrdinalRank': 'Team2Rank'}, inplace=True)

    # Get margin features
    df['Margin'] = df['Team1_score'] - df['Team2_score']
    df['GoodWin'] = np.where((df['Margin'] > goodMargin) & (df['Team2Rank'] <= goodRankThresh), 1, 0)
    df['BadLoss'] = np.where((df['Margin'] < badMargin) & (df['Team2Rank'] >= badRankThresh), 1, 0)
    # The string alias is used because passing NumPy callables such as np.sum
    # to groupby.agg is deprecated in recent pandas; the totals are the same.
    margin_set = (df.groupby(['Team1', 'Season'])
                    .agg({'GoodWin': 'sum', 'BadLoss': 'sum'})
                    .reset_index())

    good_suffix = 'GoodWin' + str(goodRankThresh)
    bad_suffix = 'BadLoss' + str(badRankThresh)

    # Join back to tourney games, once per side.
    tourney_games = tourney_games.merge(margin_set, on=['Season', 'Team1'], how='left')
    tourney_games.rename(columns={'GoodWin': 'Team1' + good_suffix,
                                  'BadLoss': 'Team1' + bad_suffix}, inplace=True)

    # Re-key the per-team totals so they can be matched against Team2.
    team2_margin_set = margin_set.rename(columns={'Team1': 'Team2'})
    tourney_games = tourney_games.merge(team2_margin_set, on=['Season', 'Team2'], how='left')
    tourney_games.rename(columns={'GoodWin': 'Team2' + good_suffix,
                                  'BadLoss': 'Team2' + bad_suffix}, inplace=True)

    return tourney_games


In [None]:
def get_team_names(tourney_games, teams):
    """Attach the name of each team in a matchup.

    Args:
        tourney_games: Frame with ``Team1`` and ``Team2`` columns.
        teams: Frame with ``TeamID`` and ``TeamName``. The whole frame is
            merged, so any other columns it has are carried along.

    Returns:
        A new DataFrame with ``Team1Name`` and ``Team2Name`` added via left
        joins. The right-hand ``TeamID`` key of each join is kept in the
        result; pandas suffixes the two copies when their names collide.
    """
    for side in ('Team1', 'Team2'):
        tourney_games = tourney_games.merge(teams, how='left',
                                            left_on=[side],
                                            right_on=['TeamID'])
        tourney_games = tourney_games.rename(columns={'TeamName': side + 'Name'})

    return tourney_games
    

In [None]:
def add_pagerank(tourney_games, page_rank_df):
    """Attach the ``Rank`` value of each team in a matchup.

    Args:
        tourney_games: Frame with ``Team1``, ``Team2`` and ``Season``.
        page_rank_df: Frame with ``TeamID``, ``Season`` and ``Rank``. The
            whole frame is merged, so any other columns are carried along.

    Returns:
        A new DataFrame with ``Team1PRank`` and ``Team2PRank`` added via left
        joins. The right-hand ``TeamID`` key of each join is kept in the
        result; pandas suffixes the two copies when their names collide.
    """
    for side in ('Team1', 'Team2'):
        tourney_games = tourney_games.merge(page_rank_df, how='left',
                                            left_on=[side, 'Season'],
                                            right_on=['TeamID', 'Season'])
        tourney_games = tourney_games.rename(columns={'Rank': side + 'PRank'})

    return tourney_games
    

In [None]:
def get_conf_tourney(tourney_games, conf_games):
    """Attach the ``ct_wins`` value of each team in a matchup.

    Args:
        tourney_games: Frame with ``Team1``, ``Team2`` and ``Season``.
        conf_games: Frame with ``TeamID``, ``Season`` and ``ct_wins``. The
            whole frame is merged, so any other columns are carried along.

    Returns:
        A new DataFrame with ``Team1ctw`` and ``Team2ctw`` added via left
        joins. The right-hand ``TeamID`` key of each join is kept in the
        result; pandas suffixes the two copies when their names collide.
    """
    for side in ('Team1', 'Team2'):
        tourney_games = tourney_games.merge(conf_games, how='left',
                                            left_on=[side, 'Season'],
                                            right_on=['TeamID', 'Season'])
        tourney_games = tourney_games.rename(columns={'ct_wins': side + 'ctw'})

    return tourney_games


In [None]:
def add_pom_raw(tourney_games, pom_data):
    """Attach the ``adj_em`` value of each team in a matchup.

    Args:
        tourney_games: Frame with ``Team1``, ``Team2`` and ``Season``.
        pom_data: Frame with ``TeamID``, ``Season`` and ``adj_em``. The whole
            frame is merged, so any other columns are carried along.

    Returns:
        A new DataFrame with ``Team1adjem`` and ``Team2adjem`` added via left
        joins. The right-hand ``TeamID`` key of each join is kept in the
        result; pandas suffixes the two copies when their names collide.
    """
    for side in ('Team1', 'Team2'):
        tourney_games = tourney_games.merge(pom_data, how='left',
                                            left_on=[side, 'Season'],
                                            right_on=['TeamID', 'Season'])
        tourney_games = tourney_games.rename(columns={'adj_em': side + 'adjem'})

    return tourney_games

In [None]:
def add_features(df1, df2, left_on, right_on, feature_col, suffix, how = 'left'):
    """Join a single feature column from ``df2`` onto ``df1``.

    Args:
        df1: Frame to enrich; it is not modified.
        df2: Frame that holds the ``right_on`` key columns and ``feature_col``.
        left_on: List of key column names in ``df1``.
        right_on: List of key column names in ``df2`` (must be a list, since
            it is concatenated with ``feature_col``).
        feature_col: Name of the column to bring over from ``df2``.
        suffix: Text appended to ``feature_col`` in the result.
        how: Join type passed to ``DataFrame.merge``; defaults to ``'left'``.

    Returns:
        A new DataFrame with ``feature_col + suffix`` added. Key columns from
        ``right_on`` that are not also in ``left_on`` are dropped again.
    """
    select_cols = right_on + [feature_col]
    # ``how`` used to be accepted but never forwarded, so every call silently
    # ran an inner join and dropped unmatched rows.
    merged = df1.merge(df2[select_cols], left_on=left_on, right_on=right_on, how=how)
    merged = merged.rename(columns={feature_col: feature_col + suffix})

    # Right-hand keys with a different name than the left keys are redundant.
    redundant_keys = [col for col in right_on if col not in left_on]
    return merged.drop(columns=redundant_keys)

    