In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
DATA_PATH = 'drive/MyDrive/march_madness/2024/march-machine-learning-mania-2024/'

year = 2024

In [3]:
df_seeds = pd.read_csv(DATA_PATH + '2024_tourney_seeds.csv')

def treat_seed(seed):
    return int(re.sub("[^0-9]", "", seed))

df_seeds['SeedNumeric'] = df_seeds['Seed'].apply(treat_seed)
df_seeds = df_seeds.drop(columns=['Tournament'])
df_seeds['Season'] = [2024]*df_seeds.shape[0]

df_seeds.head()

Unnamed: 0,Seed,TeamID,SeedNumeric,Season
0,W01,1163,1,2024
1,W02,1235,2,2024
2,W03,1228,3,2024
3,W04,1120,4,2024
4,W05,1361,5,2024


In [4]:
tmp1 = pd.read_csv(DATA_PATH + 'MRegularSeasonCompactResults.csv')
tmp2 = pd.read_csv(DATA_PATH + 'WRegularSeasonCompactResults.csv')
df_season_results = pd.concat((tmp1, tmp2))
df_season_results = df_season_results[df_season_results.Season == year]

df_season_results.drop(['NumOT', 'WLoc'], axis=1, inplace=True)

print(df_season_results.shape)
df_season_results.head()

(11021, 6)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
181682,2024,0,1101,64,1329,59
181683,2024,0,1103,81,1355,75
181684,2024,0,1104,105,1287,73
181685,2024,0,1112,122,1288,59
181686,2024,0,1114,71,1402,66


In [5]:
# number of wins
num_win = df_season_results.groupby(['Season', 'WTeamID']).count()
num_win = num_win.reset_index()[['Season', 'WTeamID', 'DayNum']].rename(columns={"DayNum": "NumWins", "WTeamID": "TeamID"})

# number of losses
num_loss = df_season_results.groupby(['Season', 'LTeamID']).count()
num_loss = num_loss.reset_index()[['Season', 'LTeamID', 'DayNum']].rename(columns={"DayNum": "NumLosses", "LTeamID": "TeamID"})

print(num_loss.shape)
num_loss.head()

(721, 3)


Unnamed: 0,Season,TeamID,NumLosses
0,2024,1101,17
1,2024,1102,22
2,2024,1103,10
3,2024,1104,11
4,2024,1105,22


In [6]:
# number of wins over last 14 days
num_win_last14 = df_season_results[df_season_results.DayNum >= 132-14].groupby(['Season', 'WTeamID']).count()
num_win_last14 = num_win_last14.reset_index()[['Season', 'WTeamID', 'DayNum']].rename(columns={"DayNum": "NumWins14", "WTeamID": "TeamID"})

# number of losses over last 14 days
num_loss_last14 = df_season_results[df_season_results.DayNum >= 132-14].groupby(['Season', 'LTeamID']).count()
num_loss_last14 = num_loss_last14.reset_index()[['Season', 'LTeamID', 'DayNum']].rename(columns={"DayNum": "NumLosses14", "LTeamID": "TeamID"})

# create one dataframe with the ratio over last 14 days
win_ratio_last14 = num_win_last14.merge(num_loss_last14,
                                       left_on=['Season','TeamID'],
                                       right_on=['Season','TeamID'],
                                       how='outer').fillna(0)

win_ratio_last14['WinRatio14'] = win_ratio_last14['NumWins14'] / (win_ratio_last14['NumWins14'] + win_ratio_last14['NumLosses14'])
win_ratio_last14.drop(columns=['NumWins14', 'NumLosses14'], inplace=True)
win_ratio_last14

Unnamed: 0,Season,TeamID,WinRatio14
0,2024,1101,0.333333
1,2024,1103,0.600000
2,2024,1104,0.333333
3,2024,1105,0.500000
4,2024,1106,0.333333
...,...,...,...
701,2024,3466,0.000000
702,2024,3468,0.000000
703,2024,3471,0.000000
704,2024,3472,0.000000


In [7]:
tmp1 = pd.read_csv(DATA_PATH + 'MRegularSeasonDetailedResults.csv')
tmp2 = pd.read_csv(DATA_PATH + 'WRegularSeasonDetailedResults.csv')
df_season_stats = pd.concat((tmp1, tmp2))
df_season_stats = df_season_stats[df_season_stats.Season == year]
df_season_stats.drop(['NumOT', 'WLoc'], axis=1, inplace=True)

# advanced metrics for winning teams
df_season_stats['WPoss'] = 0.96 * (df_season_stats['WFGA'] - df_season_stats['WOR']
                                   + df_season_stats['WTO'] + (0.475 * df_season_stats['WFTA']))
df_season_stats['WOffEff'] = df_season_stats['WScore'] / df_season_stats['WPoss'] * 100
df_season_stats['WDefEff'] = df_season_stats['LScore'] / df_season_stats['WPoss'] * 100
df_season_stats['WEFT'] = (df_season_stats['WFGM'] + 0.5 * df_season_stats['WFGM3']) / df_season_stats['WFGA']

# advanced metrics for losing teams
df_season_stats['LPoss'] = 0.96 * (df_season_stats['LFGA'] - df_season_stats['LOR']
                                   + df_season_stats['LTO'] + (0.475 * df_season_stats['LFTA']))
df_season_stats['LOffEff'] = df_season_stats['LScore'] / df_season_stats['LPoss'] * 100
df_season_stats['LDefEff'] = df_season_stats['WScore'] / df_season_stats['LPoss'] * 100
df_season_stats['LEFT'] = (df_season_stats['LFGM'] + 0.5 * df_season_stats['LFGM3']) / df_season_stats['LFGA']

print(df_season_stats.shape)
print(df_season_stats.columns)
df_season_stats.head()

(11021, 40)
Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WFGM',
       'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO',
       'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'WPoss', 'WOffEff',
       'WDefEff', 'WEFT', 'LPoss', 'LOffEff', 'LDefEff', 'LEFT'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_stats.drop(['NumOT', 'WLoc'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_stats['WPoss'] = 0.96 * (df_season_stats['WFGA'] - df_season_stats['WOR']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_season_stats['WOffEff'] = df_season_stats['WScore'] / df_season_stats['WPoss'] * 100
A value is trying to be set 

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WFGM,WFGA,WFGM3,WFGA3,...,LBlk,LPF,WPoss,WOffEff,WDefEff,WEFT,LPoss,LOffEff,LDefEff,LEFT
107634,2024,0,1101,64,1329,59,26,57,5,18,...,2,16,67.416,94.932954,87.516317,0.5,68.64,85.955711,93.240093,0.455357
107635,2024,0,1103,81,1355,75,26,57,11,27,...,2,17,67.632,119.765791,110.894251,0.552632,66.408,112.9382,121.973256,0.571429
107636,2024,0,1104,105,1287,73,32,57,10,23,...,3,25,73.008,143.819855,99.989042,0.649123,71.064,102.724305,147.754137,0.491935
107637,2024,0,1112,122,1288,59,42,76,12,27,...,6,25,75.072,162.510656,78.591219,0.631579,78.84,74.835109,154.743785,0.455357
107638,2024,0,1114,71,1402,66,22,59,5,18,...,4,23,72.744,97.602551,90.729132,0.415254,74.352,88.766946,95.491715,0.427419


In [8]:
# average advanced metrics for winning teams
win_metrics = df_season_stats.groupby(['Season', 'WTeamID']).mean().reset_index()
win_metrics = win_metrics[['Season', 'WTeamID', 'WOffEff', 'WDefEff', 'WEFT']]

# average advanced metrics for losing teams
loss_metrics = df_season_stats.groupby(['Season', 'LTeamID']).mean().reset_index()
loss_metrics = loss_metrics[['Season', 'LTeamID', 'LOffEff', 'LDefEff', 'LEFT']]

adv_metrics = win_metrics.merge(loss_metrics,
                                left_on=['Season','WTeamID'],
                                right_on=['Season','LTeamID'],
                                how='outer').drop('LTeamID', axis=1).rename(columns={'WTeamID':'TeamID'})
adv_metrics.fillna(0, inplace=True)

print(adv_metrics.shape)
adv_metrics.head()

(722, 8)


Unnamed: 0,Season,TeamID,WOffEff,WDefEff,WEFT,LOffEff,LDefEff,LEFT
0,2024,1101.0,108.957627,97.749715,0.518581,94.784868,111.007206,0.431939
1,2024,1102.0,116.043947,97.177251,0.567907,105.739578,129.24332,0.52502
2,2024,1103.0,114.152676,97.475345,0.533731,103.61208,113.720205,0.490937
3,2024,1104.0,131.663957,101.508178,0.602501,110.01669,126.785556,0.496075
4,2024,1105.0,105.303204,92.317963,0.476187,94.56794,115.294569,0.448537


In [9]:
# Season, single value data
df_features_season_w = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()[['Season', 'WTeamID']].rename(columns={"WTeamID": "TeamID"})
df_features_season_l = df_season_results.groupby(['Season', 'LTeamID']).count().reset_index()[['Season', 'LTeamID']].rename(columns={"LTeamID": "TeamID"})

df_features_season = pd.concat([df_features_season_w, df_features_season_l], axis=0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop=True)

df_features_season = pd.merge(
    df_features_season,
    df_seeds,
    how='left',
    left_on=['Season', 'TeamID'],
    right_on=['Season', 'TeamID']
).drop(['Seed'], axis=1).rename(columns={'SeedNumeric':'Seed'})

df_features_season.fillna(17, inplace=True) # add max+1 for no tournament seeds

df_features_season = df_features_season.merge(num_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(num_loss, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(win_ratio_last14, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(adv_metrics, on=['Season', 'TeamID'], how='left')

df_features_season.fillna(0, inplace=True) # add zeros where teams won or lost 0 games

df_features_season['OffEff'] = ((df_features_season['WOffEff'] * df_features_season['NumWins'] +
                                 df_features_season['LOffEff'] * df_features_season['NumLosses'])
                                / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season['DefEff'] = ((df_features_season['WDefEff'] * df_features_season['NumWins'] +
                                 df_features_season['LDefEff'] * df_features_season['NumLosses'])
                                / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season['EFT'] = ((df_features_season['WEFT'] * df_features_season['NumWins'] +
                                 df_features_season['LEFT'] * df_features_season['NumLosses'])
                                / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season['WinRatio'] = df_features_season['NumWins'] / (df_features_season['NumWins'] + df_features_season['NumLosses'])

df_features_season.drop(['NumWins', 'NumLosses', 'WOffEff', 'LOffEff', 'WDefEff', 'LDefEff', 'WEFT', 'LEFT'], axis=1, inplace=True)


print(df_features_season.shape)
df_features_season.head()

(722, 8)


Unnamed: 0,Season,TeamID,Seed,WinRatio14,OffEff,DefEff,EFT,WinRatio
0,2024,1101,17.0,0.333333,101.185469,105.019952,0.471068,0.451613
1,2024,1102,17.0,0.0,108.731169,119.933816,0.537471,0.290323
2,2024,1103,14.0,0.6,110.85874,102.551863,0.520358,0.6875
3,2024,1104,4.0,0.333333,124.222709,110.197276,0.565917,0.65625
4,2024,1105,17.0,0.5,98.146361,107.635701,0.457754,0.333333


In [10]:
df_features_season.to_csv('drive/MyDrive/march_madness/2024/features_2024.csv', index=False)