In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tqdm.notebook import tqdm

import unittest

# TODO: when predicting spreads, can we add a standard deviation for the output, or represent the probability distribution somehow?
# TODO: could create a model that predicts probability from spread, or just slightly modify the last layers to predict probabilities

In [None]:
def AB_rename(df):
    '''
    Returns the kaggle data with A/B stat prefixes in place of W/L

    A leading 'W' in a column name becomes 'A_' and a leading 'L' becomes 'B_'
    (e.g. WScore -> A_Score, LTeamID -> B_TeamID). All other columns keep their names.
    The input dataframe is not modified.
    '''
    prefix_for = {'W': 'A_', 'L': 'B_'}

    renamed = {
        name: prefix_for[name[0]] + name[1:]
        for name in df.columns
        if name[0] in prefix_for
    }

    return df.rename(columns=renamed)
    
def get_mirror(df):
    '''
    Returns a mirrored version of the dataframe with A/B teams flipped

    Columns starting with 'A' (except 'A_Loc') and columns starting with 'B' are paired
    by their order of appearance and their values are swapped. 'A_Loc' is flipped
    ('H' <-> 'A'); any other location value becomes 'N'. All remaining columns are copied as-is.

    Works for any row index (it does not have to be 0..n-1) and leaves the input untouched.

    Parameters:
        df: dataframe that already uses the A/B prefixes and contains an 'A_Loc' column

    Returns:
        A new dataframe with the same columns, column order and index as df

    Raises:
        ValueError: if the number of A columns and B columns differ, so they cannot be paired
        KeyError: if 'A_Loc' is missing
    '''

    A_cols = [col for col in df.columns if col.startswith('A') and col != 'A_Loc']
    B_cols = [col for col in df.columns if col.startswith('B')]

    if len(A_cols) != len(B_cols):
        raise ValueError(
            'Cannot mirror: found %d A columns but %d B columns' % (len(A_cols), len(B_cols))
        )

    mirrored = df.copy()

    # Always read from the untouched input so an earlier swap cannot clobber a later one
    for a_col, b_col in zip(A_cols, B_cols):
        mirrored[a_col] = df[b_col]
        mirrored[b_col] = df[a_col]

    # Home for one team is away for the other; anything else is treated as neutral
    flipped_loc = {'H': 'A', 'A': 'H'}
    mirrored['A_Loc'] = [flipped_loc.get(loc, 'N') for loc in df['A_Loc']]

    return mirrored

def get_doubled_data(df):
    '''
    Builds the doubled dataset: every game as given plus its A/B mirrored copy

    The W/L columns are renamed to A/B first. The mirrored rows are then stacked
    underneath the renamed rows and the result gets a fresh 0..n-1 index.
    '''

    renamed_games = AB_rename(df)
    mirrored_games = get_mirror(renamed_games)

    return pd.concat([renamed_games, mirrored_games], ignore_index=True)

In [None]:
# Load the detailed regular season results and double them, so that each game
# appears twice: once as played and once with the A and B teams swapped.
df = get_doubled_data(
    pd.read_csv('/kaggle/input/ncaam-march-mania-2021/MDataFiles_Stage2/MRegularSeasonDetailedResults.csv')
)

# 1 when team A outscored team B, 0 otherwise
df['A_Win'] = df['A_Score'].gt(df['B_Score']).astype(int)

# Every column starting with A or B, except team ids and the location, is a stat to average
STAT_COLS = [
    col for col in df.columns
    if col[0] in 'AB' and not ('TeamID' in col or 'Loc' in col)
]

In [None]:
# TODO: compute the FG/3P/FT percentages directly, probably easier for models to work with
# TODO: consider including number of games played in final dataframe
# TODO: consider adding std of stats to give model idea of consistency
# TODO: compute possessions based on formulas provided elsewhere. Consider using both/more of the formulas, seeing which one performs best

def get_final_avgs(df, stat_cols=STAT_COLS):
    '''
    Averages every stat column over all games of each (season, team A) pair

    When the provided dataframe holds only regular season games, the result can be
    used to predict tournament games.

    Returns a dataframe with Season and A_TeamID columns followed by the mean of each stat.
    '''

    season_team_groups = tqdm(df.groupby(['Season', 'A_TeamID']))
    means_by_key = {key: games[stat_cols].mean() for key, games in season_team_groups}

    # The (season, team) keys end up in the index: name its levels, then turn them into columns
    avg_df = pd.DataFrame.from_dict(means_by_key, orient='index')
    avg_df.index = avg_df.index.set_names(['Season', 'A_TeamID'])

    return avg_df.reset_index()

def get_single_per_game_avgs(season, team_id, group, stat_cols):
    '''
    Helper function to compute the mean stats prior to each game for a given season and team

    Parameters:
        season, team_id: only used to build the keys of the returned dictionary
        group: the games of that team in that season; needs a 'DayNum' column and the stat columns
        stat_cols: list of numeric columns to average

    Returns:
        A dictionary mapping (season, team_id, day) to a float array with one entry per
        stat column: the mean over all games that come before that game when sorted by DayNum.
        The first game has no earlier games, so its entry is all NaN.
    '''

    ordered = group.sort_values('DayNum')
    days = ordered['DayNum'].to_numpy()
    stats = ordered[stat_cols].to_numpy(dtype=float)

    # Exclusive running totals: row i holds the sum of the games played before game i
    prior_sums = np.zeros_like(stats)
    prior_sums[1:] = np.cumsum(stats[:-1], axis=0)

    # Game i has exactly i earlier games
    prior_counts = np.arange(len(ordered)).reshape(-1, 1)

    # 0 / 0 for the first game is the documented NaN, so keep numpy quiet about it
    # here instead of relying on a global np.seterr call made elsewhere
    with np.errstate(divide='ignore', invalid='ignore'):
        prior_avgs = prior_sums / prior_counts

    return {(season, team_id, day): avg for day, avg in zip(days, prior_avgs)}

def get_per_game_avgs(df, stat_cols=STAT_COLS):
    '''
    Computes, for each season and team, the mean stats over the games played before each game

    These can be used to predict the outcome of any game in the provided dataframe.
    The two opponent ids and the day num can be used as a key to join to targets.

    Note: Mean stats for the first game is always NaN. Consider removing this for training,
        or performing some sort of imputation (e.g. mean stats)

    Returns a dataframe with Season, A_TeamID and DayNum columns followed by the stat columns.
    '''

    running_avgs = {}

    for (season, team_id), games in tqdm(df.groupby(['Season', 'A_TeamID'])):
        team_avgs = get_single_per_game_avgs(season, team_id, games, stat_cols)
        running_avgs.update(team_avgs)

    # Keys are (season, team, day) tuples: make them a named index, then regular columns
    avg_df = pd.DataFrame.from_dict(running_avgs, orient='index', columns=stat_cols)
    avg_df.index = pd.MultiIndex.from_tuples(avg_df.index, names=['Season', 'A_TeamID', 'DayNum'])

    return avg_df.reset_index()

In [None]:
def get_season_dfs(df):
    '''
    Splits the dataframe by its Season column

    Returns a dictionary mapping each season to the rows of that season. Building it
    once up front speeds up repeated queries within the same season.
    '''

    return {season: season_rows for season, season_rows in df.groupby('Season')}
        
def dict_to_df(input_dict, stat_cols, index_names):
    '''
    Turns a dictionary of tuple: data pairs into a flat dataframe

    Each tuple key holds the index value(s) of a row and is spread over the columns
    named by index_names; each data entry supplies the values of the stat columns.
    '''

    frame = pd.DataFrame.from_dict(input_dict, orient='index', columns=stat_cols)
    frame.index = pd.MultiIndex.from_tuples(frame.index, names=index_names)

    return frame.reset_index()

def compute_opp_avg(team_id, team_data, season_data, stat_cols):
    '''
    Helper function to compute opponent averages

    For every opponent in team_data, averages that opponent's own games from season_data,
    leaving out its games against team_id. The per-opponent means are then combined into
    one weighted mean, weighted by how many times team_id played that opponent.
    Opponents without any such games are left out of the weighting.

    Returns an array with one value per stat column (NaN when no opponent has usable games).
    '''

    faced = team_data['B_TeamID']

    # Games played by one of the opponents, against anybody but the team in question
    played_by_opponent = season_data['A_TeamID'].isin(faced)
    not_against_team = season_data['B_TeamID'] != team_id
    other_games = season_data.loc[played_by_opponent & not_against_team, ['A_TeamID'] + stat_cols]

    per_opp_means = other_games.groupby('A_TeamID').mean()

    # Number of meetings with each opponent, restricted to opponents that have a mean
    meetings = team_data.groupby('B_TeamID')[stat_cols].count()
    meetings = meetings[meetings.index.isin(per_opp_means.index)]

    weighted_total = per_opp_means.mul(meetings).to_numpy().sum(axis=0)
    total_meetings = meetings.to_numpy().sum(axis=0)

    return weighted_total / total_meetings

def get_final_opp_avgs(df, stat_cols=STAT_COLS):
    '''
    Computes, for each team and season, the mean stats of its opponents over the whole season

    These can be used to predict tournament games.

    Returns a dataframe with Season and A_TeamID columns followed by the stat columns;
    rows follow the (team, season) grouping order.
    '''

    # Split the games by season once, so each team only looks at its own season
    games_by_season = get_season_dfs(df)

    opp_avg_dict = {
        (season, team_id): compute_opp_avg(team_id, team_games, games_by_season[season], stat_cols)
        for (team_id, season), team_games in tqdm(df.groupby(['A_TeamID', 'Season']))
    }

    return dict_to_df(opp_avg_dict, stat_cols, ['Season', 'A_TeamID'])

def get_single_per_game_opp_avgs(season, team_id, group, season_stats, stat_cols):
    '''
    Helper function to compute mean opp averages for a single team and season

    For each game day of the team (in ascending order), only games strictly before that
    day, both the team's own and the rest of the season's, feed the opponent average.

    Returns a dictionary mapping (season, team_id, day) to the opponent averages.
    '''

    team_days = group['DayNum']
    season_days = season_stats['DayNum']

    opp_avg_dict = {}

    for day in team_days.sort_values():
        earlier_team_games = group[team_days < day]
        earlier_season_games = season_stats[season_days < day]

        opp_avg_dict[(season, team_id, day)] = compute_opp_avg(
            team_id, earlier_team_games, earlier_season_games, stat_cols
        )

    return opp_avg_dict

def get_per_game_opp_avgs(df, stat_cols=STAT_COLS):
    '''
    Computes, for each season and team, the mean opponent stats prior to each game
    Note: excludes stats from games against the team in question

    These can be used to predict the outcome of any game in the provided dataframe.
    The two opponent ids and the day num can be used as a key to join to targets.

    Note: Some early games for each team may be NaN, depending on if opponents have played other games yet.

    Returns a dataframe with Season, A_TeamID and DayNum columns followed by the stat columns.
    '''

    games_by_season = get_season_dfs(df)

    opp_avg_dict = {}

    for (season, team_id), team_games in tqdm(df.groupby(['Season', 'A_TeamID'])):
        team_opp_avgs = get_single_per_game_opp_avgs(
            season, team_id, team_games, games_by_season[season], stat_cols
        )
        opp_avg_dict.update(team_opp_avgs)

    return dict_to_df(opp_avg_dict, stat_cols, ['Season', 'A_TeamID', 'DayNum'])

def get_per_game_opp_avgs_from_full(df, full_per_game_opp_avgs, stat_cols=STAT_COLS):
    '''
    Pulls out the subset of opp avgs that correspond to before a game played by each team

    Parameters:
        df: games dataframe; only its Season, A_TeamID and DayNum columns are used
        full_per_game_opp_avgs: opponent averages keyed by Season, A_TeamID and DayNum
        stat_cols: not used by this function; kept so existing callers keep working

    Returns:
        The rows of full_per_game_opp_avgs whose (Season, A_TeamID, DayNum) occurs in df,
        each at most once per row of full_per_game_opp_avgs.
    '''

    key_cols = ['Season', 'A_TeamID', 'DayNum']

    # A key that occurs several times in df would otherwise multiply the matching rows,
    # leaving duplicate keys that break index-aligned joins further down the line
    game_keys = df[key_cols].drop_duplicates()

    return full_per_game_opp_avgs.merge(game_keys, on=key_cols, how='inner')

def get_full_per_game_opp_avgs(df, stat_cols=STAT_COLS):
    '''
    Computes the mean opponent stats for each day of the season, for each team and season.
    This can be used to compute opponents' opponents' averages

    For every day on which games were played, every team that has played on or before
    that day gets an entry, built only from games strictly before that day.

    Note: This take 90+ minutes to compute, at least with this naive computation
    Note: a subset of these can be pulled to get the per_game opponent averages
    '''

    full_opp_avg = {}

    for season, season_stats in tqdm(df.groupby('Season')):
        day_nums = season_stats['DayNum']

        for day in tqdm(day_nums.unique()):
            prior_games = season_stats[day_nums < day]
            # Teams with at least one game up to and including this day
            active_teams = season_stats.loc[day_nums <= day, 'A_TeamID'].unique()

            for team_id in active_teams:
                team_prior_games = prior_games[prior_games['A_TeamID'] == team_id]
                full_opp_avg[(season, team_id, day)] = compute_opp_avg(
                    team_id, team_prior_games, prior_games, stat_cols
                )

    return dict_to_df(full_opp_avg, stat_cols, ['Season', 'A_TeamID', 'DayNum'])

In [None]:
def compute_opp_opp_avgs(team_id, team_data, season_opp_avg, stat_cols):
    '''
    Helper function to compute opponents' opponents' averages

    Combines the rows of season_opp_avg that belong to the opponents in team_data into one
    weighted mean, weighted by the number of games played against each opponent.

    Opponents whose averages are missing (no row, or NaN values) are left out of both the
    weighted sum and the weights, the same way compute_opp_avg skips opponents without games.
    Otherwise a single missing opponent would turn the whole result into NaN while still
    being counted in the denominator.

    Parameters:
        team_id: not used by this function; kept so existing callers keep working
        team_data: the team's games; needs 'B_TeamID' and the stat columns
        season_opp_avg: one row of opponent averages per team, with an 'A_TeamID' column
        stat_cols: list of stat columns to combine

    Returns:
        An array with one value per stat column; NaN where no opponent has a usable average
    '''

    opps = team_data['B_TeamID']

    opp_means = season_opp_avg[season_opp_avg['A_TeamID'].isin(opps)].set_index('A_TeamID')[stat_cols]
    opp_counts = team_data.groupby('B_TeamID')[stat_cols].count()

    # Line the means up with the counts; opponents without a row become all-NaN rows
    opp_means = opp_means.reindex(opp_counts.index)
    available = opp_means.notna()

    # Missing averages contribute nothing to the sum and nothing to the weights
    weighted_sums = np.nansum(opp_means.mul(opp_counts).to_numpy(dtype=float), axis=0)
    weights = opp_counts.where(available, 0).to_numpy().sum(axis=0)

    # 0 / 0 (no usable opponent at all) is the intended NaN result
    with np.errstate(divide='ignore', invalid='ignore'):
        return weighted_sums / weights
    
def get_final_opp_opp_avgs(df, opp_avgs, stat_cols=STAT_COLS):
    '''
    Computes, for each team and season, the mean of its opponents' opponent averages

    opp_avgs holds the opponent averages per (Season, A_TeamID) and is split by season,
    so each team only looks at the averages of its own season.

    Returns a dataframe with Season and A_TeamID columns followed by the stat columns.
    '''

    opp_avgs_by_season = get_season_dfs(opp_avgs)

    opp_opp_avg_dict = {
        (season, team_id): compute_opp_opp_avgs(team_id, team_games, opp_avgs_by_season[season], stat_cols)
        for (team_id, season), team_games in tqdm(df.groupby(['A_TeamID', 'Season']))
    }

    return dict_to_df(opp_opp_avg_dict, stat_cols, ['Season', 'A_TeamID'])


def get_per_game_opp_opp_avgs(df, full_opp_avgs, stat_cols=STAT_COLS):
    '''
    Computes the opponents' opponents' stats before each game, based on the full opponents' averages

    For a game on a given day, the team's earlier opponents are looked up in the
    opponent averages stored for that same day.

    Returns a dataframe with Season, A_TeamID and DayNum columns followed by the stat columns.
    '''

    opp_opp_avg_dict = {}

    for season, season_stats in df.groupby('Season'):
        season_opp_avgs = full_opp_avgs[full_opp_avgs['Season'] == season]

        # Slice the opponent averages per day once, instead of once per team and game
        day_opp_avgs = {
            day: season_opp_avgs[season_opp_avgs['DayNum'] == day]
            for day in season_stats['DayNum'].unique()
        }

        for team_id, team_data in tqdm(season_stats.groupby('A_TeamID')):
            team_days = team_data['DayNum']

            for day in team_days:
                earlier_games = team_data[team_days < day]

                opp_opp_avg_dict[(season, team_id, day)] = compute_opp_opp_avgs(
                    team_id, earlier_games, day_opp_avgs[day], stat_cols
                )

    return dict_to_df(opp_opp_avg_dict, stat_cols, ['Season', 'A_TeamID', 'DayNum'])

In [None]:
def get_full_final_avgs(df, stat_cols=STAT_COLS):
    '''
    Returns a full dataframe of average statistics, opponents' statistics, and opponents' opponents'
    statistics at the end of the season

    Parameters:
        df: games dataframe (A/B format)
        stat_cols: stat columns to average; passed on to every helper so that the
            computed columns and the Opp_/Opp_Opp_ renames always refer to the same set

    Returns:
        A dataframe indexed by (Season, A_TeamID) holding the team's own averages, the
        opponents' averages prefixed with 'Opp_' and the opponents' opponents' averages
        prefixed with 'Opp_Opp_'.
    '''

    key_cols = ['Season', 'A_TeamID']

    avgs = get_final_avgs(df, stat_cols)
    opp_avgs = get_final_opp_avgs(df, stat_cols)
    # Needs the opponent averages with Season/A_TeamID still as regular columns
    opp_opp_avgs = get_final_opp_opp_avgs(df, opp_avgs, stat_cols)

    full = pd.concat((
        avgs.set_index(key_cols),
        opp_avgs.set_index(key_cols).rename(columns={col: 'Opp_' + col for col in stat_cols}),
        opp_opp_avgs.set_index(key_cols).rename(columns={col: 'Opp_Opp_' + col for col in stat_cols})), axis=1)

    return full
    
def get_full_per_game_avgs(df, stat_cols=STAT_COLS):
    '''
    Return the full dataframe of averages prior to each game

    Parameters:
        df: games dataframe (A/B format)
        stat_cols: stat columns to average; passed on to every helper so that the
            computed columns and the Opp_/Opp_Opp_ renames always refer to the same set

    Returns:
        A tuple (full_opp_avgs, full):
        full_opp_avgs: the per-day opponent averages with Season, A_TeamID and DayNum columns
        full: a dataframe indexed by (Season, A_TeamID, DayNum) holding the team's own
            averages, the opponents' averages prefixed with 'Opp_' and the opponents'
            opponents' averages prefixed with 'Opp_Opp_'
    '''

    key_cols = ['Season', 'A_TeamID', 'DayNum']

    avgs = get_per_game_avgs(df, stat_cols)

    # Computed once and reused for both the opponent and the opponents' opponents' averages
    full_opp_avgs = get_full_per_game_opp_avgs(df, stat_cols)

    opp_avgs = get_per_game_opp_avgs_from_full(df, full_opp_avgs, stat_cols)
    opp_opp_avgs = get_per_game_opp_opp_avgs(df, full_opp_avgs, stat_cols)

    full = pd.concat((
        avgs.set_index(key_cols),
        opp_avgs.set_index(key_cols).rename(columns={col: 'Opp_' + col for col in stat_cols}),
        opp_opp_avgs.set_index(key_cols).rename(columns={col: 'Opp_Opp_' + col for col in stat_cols})), axis=1)

    return full_opp_avgs, full

In [None]:
class TestGetFinalAvgs(unittest.TestCase):
    '''Checks get_final_avgs on two seasons with two teams and three games each'''

    def test_simple(self):
        '''Every (season, team) row must hold the plain mean of that team's three games'''
        # Columns: Season, A_TeamID, DayNum, A_Score, B_Score
        test_matrix = np.array([
            [2000, 1, 1, 10, 20],
            [2000, 1, 2, 30, 40],
            [2000, 1, 3, 50, 60],
            [2000, 2, 1, 5, 15],
            [2000, 2, 2, 25, 35],
            [2000, 2, 3, 45, 55],
            [2001, 1, 1, 30, 40],
            [2001, 1, 2, 50, 60],
            [2001, 1, 3, 70, 80],
            [2001, 2, 1, 35, 45],
            [2001, 2, 2, 55, 65],
            [2001, 2, 3, 75, 85],
        ])

        test_df = pd.DataFrame(test_matrix, columns=['Season', 'A_TeamID', 'DayNum', 'A_Score', 'B_Score'])

        # Columns: Season, A_TeamID, mean A_Score, mean B_Score
        expected_avgs = np.array([
            [2000, 1, 30, 40],
            [2000, 2, 25, 35],
            [2001, 1, 50, 60],
            [2001, 2, 55, 65]
        ])

        actual_avgs = get_final_avgs(test_df, stat_cols=['A_Score', 'B_Score'])

        # Reports the mismatching values on failure, unlike assertTrue on a single boolean
        np.testing.assert_array_equal(actual_avgs.to_numpy(), expected_avgs)
        
class TestGetPerGameAvgs(unittest.TestCase):
    '''Checks get_per_game_avgs on two seasons with two teams and three games each'''

    def test_simple(self):
        '''Every game must see the mean of the earlier games only; the first game sees NaN'''
        # Columns: Season, A_TeamID, DayNum, A_Score, B_Score
        test_matrix = np.array([
            [2000, 1, 1, 10, 20],
            [2000, 1, 2, 30, 40],
            [2000, 1, 3, 50, 60],
            [2000, 2, 1, 5, 15],
            [2000, 2, 2, 25, 35],
            [2000, 2, 3, 45, 55],
            [2001, 1, 1, 30, 40],
            [2001, 1, 2, 50, 60],
            [2001, 1, 3, 70, 80],
            [2001, 2, 1, 35, 45],
            [2001, 2, 2, 55, 65],
            [2001, 2, 3, 75, 85],
        ])

        test_df = pd.DataFrame(test_matrix, columns=['Season', 'A_TeamID', 'DayNum', 'A_Score', 'B_Score'])

        # Columns: Season, A_TeamID, DayNum, prior mean A_Score, prior mean B_Score
        expected_avgs = np.array([
            [2000, 1, 1, np.nan, np.nan],
            [2000, 1, 2, 10, 20],
            [2000, 1, 3, 20, 30],
            [2000, 2, 1, np.nan, np.nan],
            [2000, 2, 2, 5, 15],
            [2000, 2, 3, 15, 25],
            [2001, 1, 1, np.nan, np.nan],
            [2001, 1, 2, 30, 40],
            [2001, 1, 3, 40, 50],
            [2001, 2, 1, np.nan, np.nan],
            [2001, 2, 2, 35, 45],
            [2001, 2, 3, 45, 55],
        ])

        actual_avgs = get_per_game_avgs(test_df, stat_cols=['A_Score', 'B_Score'])

        # Same tolerances as np.allclose defaults, but reports the mismatching values on failure
        np.testing.assert_allclose(actual_avgs.to_numpy(), expected_avgs, rtol=1e-5, atol=1e-8, equal_nan=True)
        
class TestGetFinalOppAvgs(unittest.TestCase):
    '''Checks get_final_opp_avgs on a small doubled schedule (every game listed from both sides)'''

    def test_simple(self):
        '''Opponent averages must be weighted by meetings and skip games against the team itself'''
        # Columns: Season, A_TeamID, B_TeamID, DayNum, A_Score, B_Score
        test_matrix = np.array([
            [2000, 1, 2, 1, 71, 74],
            [2000, 1, 3, 2, 61, 74],
            [2000, 1, 3, 3, 71, 70],
            [2000, 1, 4, 4, 62, 64],
            [2000, 1, 5, 5, 68, 68],
            [2000, 1, 5, 6, 61, 65],
            [2000, 3, 4, 7, 66, 67],
            [2000, 3, 6, 8, 61, 71],
            [2000, 4, 6, 9, 62, 61],
            [2000, 2, 1, 1, 74, 71],
            [2000, 3, 1, 2, 74, 61],
            [2000, 3, 1, 3, 70, 71],
            [2000, 4, 1, 4, 64, 62],
            [2000, 5, 1, 5, 68, 68],
            [2000, 5, 1, 6, 65, 61],
            [2000, 4, 3, 7, 67, 66],
            [2000, 6, 3, 8, 71, 61],
            [2000, 6, 4, 9, 61, 62],
            [2001, 1, 2, 1, 65, 70],
            [2001, 2, 1, 1, 70, 65]
        ])

        test_df = pd.DataFrame(test_matrix, columns=['Season', 'A_TeamID', 'B_TeamID', 'DayNum', 'A_Score', 'B_Score'])

        # Columns: Season, A_TeamID, opponents' mean A_Score, opponents' mean B_Score
        # The 2001 teams only ever played each other, so no opponent games remain (NaN)
        expected_opp_avgs = np.array([
            [2000, 1, 63.8333, 67.1667],
            [2001, 1, np.nan, np.nan],
            [2000, 2, 64.6, 68.2],
            [2001, 2, np.nan, np.nan],
            [2000, 3, 63.75, 64.75],
            [2000, 4, 68.5778, 66.2889],
            [2000, 5, 66.25, 70.5],
            [2000, 6, 67.75, 65.1667],
        ])

        actual_opp_avgs = get_final_opp_avgs(test_df, stat_cols=['A_Score', 'B_Score'])

        # Same tolerances as the previous np.allclose check, but reports the mismatching values on failure
        np.testing.assert_allclose(actual_opp_avgs.to_numpy(), expected_opp_avgs, rtol=1e-5, atol=1e-5, equal_nan=True)
        
class TestGetFinalOppOppAvgs(unittest.TestCase):
    '''Checks get_final_opp_opp_avgs on a small doubled schedule (every game listed from both sides)'''

    def test_simple(self):
        '''Opponents' opponents' averages must be the meeting-weighted mean of the opponent averages'''
        # Columns: Season, A_TeamID, B_TeamID, DayNum, A_Score, B_Score
        test_matrix = np.array([
            [2000, 1, 2, 1, 71, 74],
            [2000, 1, 3, 2, 61, 74],
            [2000, 1, 3, 3, 71, 70],
            [2000, 1, 4, 4, 62, 64],
            [2000, 1, 5, 5, 68, 68],
            [2000, 1, 5, 6, 61, 65],
            [2000, 3, 4, 7, 66, 67],
            [2000, 3, 6, 8, 61, 71],
            [2000, 4, 6, 9, 62, 61],
            [2000, 2, 1, 1, 74, 71],
            [2000, 3, 1, 2, 74, 61],
            [2000, 3, 1, 3, 70, 71],
            [2000, 4, 1, 4, 64, 62],
            [2000, 5, 1, 5, 68, 68],
            [2000, 5, 1, 6, 65, 61],
            [2000, 4, 3, 7, 67, 66],
            [2000, 6, 3, 8, 71, 61],
            [2000, 6, 4, 9, 61, 62],
            [2001, 1, 2, 1, 65, 70],
            [2001, 3, 4, 2, 61, 68],
            [2001, 4, 5, 3, 74, 71],
            [2001, 2, 1, 1, 70, 65],
            [2001, 4, 3, 2, 68, 61],
            [2001, 5, 4, 3, 71, 74]
        ])

        test_df = pd.DataFrame(test_matrix, columns=['Season', 'A_TeamID', 'B_TeamID', 'DayNum', 'A_Score', 'B_Score'])

        # The opponent averages are the input of the function under test
        test_opp_avgs = get_final_opp_avgs(test_df, stat_cols=['A_Score', 'B_Score'])

        # Columns: Season, A_TeamID, opponents' opponents' mean A_Score, mean B_Score
        expected_opp_opp_avgs = np.array([
            [2000, 1, 65.5296, 67.49815],
            [2001, 1, np.nan, np.nan],
            [2000, 2, 63.8333, 67.1667],
            [2001, 2, np.nan, np.nan],
            [2000, 3, 65.9986, 66.44725],
            [2001, 3, np.nan, np.nan],
            [2000, 4, 65.111, 65.6945],
            [2001, 4, 71, 66],
            [2000, 5, 63.8333, 67.1667],
            [2001, 5, np.nan, np.nan],
            [2000, 6, 66.1639, 65.5194],
        ])

        actual_opp_opp_avgs = get_final_opp_opp_avgs(test_df, test_opp_avgs, stat_cols=['A_Score', 'B_Score'])

        # Same tolerances as the previous np.allclose check, but reports the mismatching values on failure
        np.testing.assert_allclose(actual_opp_opp_avgs.to_numpy(), expected_opp_opp_avgs, rtol=1e-5, atol=1e-4, equal_nan=True)

In [None]:
# Silence numpy's divide-by-zero / invalid-value warnings for the rest of the session
np.seterr(divide='ignore', invalid='ignore')

# Run each sanity check directly; a failing check raises and stops the cell
for test_case in (TestGetFinalAvgs, TestGetPerGameAvgs, TestGetFinalOppAvgs, TestGetFinalOppOppAvgs):
    test_case().test_simple()

In [None]:
# End-of-season averages (own, opponents', opponents' opponents') written to CSV
full = get_full_final_avgs(df)
full.to_csv("final_avgs.csv")

In [None]:
# Pre-game averages for every game, plus the intermediate per-day opponent averages.
# NOTE: get_full_per_game_opp_avgs documents a 90+ minute runtime for the underlying computation.
# NOTE: this rebinds `full`, replacing the end-of-season frame from the previous cell.
full_opp_avgs, full = get_full_per_game_avgs(df)
# Only Season and A_TeamID form the index of this file; DayNum is written as a regular column
full_opp_avgs.set_index(['Season', 'A_TeamID']).to_csv("full_per_game_opp_avgs.csv")
full.to_csv("per_game_avgs.csv")

In [None]:
# Parked per-possession experiment (TODO). Everything below sits inside a string literal, so none of it runs.
# NOTE(review): the "Type 3" formula comment uses defensive rebounds (Opp_DR / DR) in its denominators,
# but the poss3 expression uses B_OR / A_OR there — confirm which one is intended before enabling this.
'''
# Per-possesion stat tests (TODO)
# Type 1: FGA-OR+TO+.475*FTA
poss1 = (0.5 * (df['A_FGA'] - df['A_OR'] + df['A_TO'] + 0.475 * df['A_FTA']) + 0.5 * (df['B_FGA'] - df['B_OR'] + df['B_TO'] + 0.475 * df['B_FTA']))

# Type 2: 0.96*[(FGA)+(TO)+0.44*(FTA)-(OR)] (slightly lower estimate)
poss2 = (0.5 * 0.96 * (df['A_FGA'] - df['A_OR'] + df['A_TO'] + 0.44 * df['A_FTA']) + 0.5 * 0.96 * (df['B_FGA'] - df['B_OR'] + df['B_TO'] + 0.44 * df['B_FTA']))

# Type 3: 0.5*((FGA + 0.4*FTA – 1.07*(OR/(OR + Opp_DR))*(FGA – FGM) + TO) + (Opp_FGA + 0.4*(Opp_FTA) – 1.07*(Opp_OR)/(Opp_OR + DR))*(Opp_FGA – Opp_FGM) + Opp_TO))
# Even lower estimate?
poss3 = (0.5 * ((df['A_FGA'] + 0.4*df['A_FTA'] - 1.07*(df['A_OR'] / (df['A_OR'] + df['B_OR']))*(df['A_FGA'] - df['A_FGM']) + df['A_TO']) + (df['B_FGA'] + 0.4*df['B_FTA'] - 1.07*(df['B_OR'] / (df['B_OR'] + df['A_OR']))*(df['B_FGA'] - df['B_FGM']) + df['B_TO'])))

stat_cols = [col for col in df.columns if col[0] in 'AB' and 'TeamID' not in col and 'Loc' not in col]
poss_stats = [col for col in stat_cols if 'Win' not in col]

pace = 40 * (poss1 / (50 + 5 * df['NumOT']))

df[poss_stats].div(poss1, axis=0).describe()
'''