In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Directory prefix prepended to every CSV path read or written in this notebook.
DATA_PATH = 'drive/MyDrive/march_madness_2023/march-machine-learning-mania-2023__3-13-23/'

# Season used to filter each table loaded below.
year = 2023

In [3]:
# Tournament seeds restricted to the season of interest.
# .copy() detaches the filtered rows from the full table so that adding the
# SeedNumeric column afterwards does not trigger SettingWithCopyWarning.
df_seeds = pd.read_csv(DATA_PATH + 'MNCAATourneySeeds.csv')
df_seeds = df_seeds[df_seeds.Season == year].copy()

def treat_seed(seed):
    """Return the numeric part of a seed label as an int.

    Every character other than 0-9 is discarded and the remaining digits are
    parsed together, e.g. ``'W01'`` -> ``1``.  A label without any digit makes
    ``int`` raise ``ValueError``.
    """
    digit_runs = re.split("[^0-9]", seed)
    return int("".join(digit_runs))

# Integer seed parsed from each seed label by treat_seed.
df_seeds['SeedNumeric'] = df_seeds['Seed'].map(treat_seed)

df_seeds.head()

Unnamed: 0,Season,Seed,TeamID,SeedNumeric
2422,2023,W01,1345,1
2423,2023,W02,1266,2
2424,2023,W03,1243,3
2425,2023,W04,1397,4
2426,2023,W05,1181,5


In [4]:
# Regular-season compact results for the selected season.
# Built as a single non-mutating chain: filtering, dropping and adding the
# ScoreGap column each return a new frame, so nothing is assigned onto a
# filtered slice (no SettingWithCopyWarning, no inplace mutation).
df_season_results = (
    pd.read_csv(DATA_PATH + 'MRegularSeasonCompactResults.csv')
    .loc[lambda games: games.Season == year]
    .drop(columns=['NumOT', 'WLoc'])
    # winning margin of each game
    .assign(ScoreGap=lambda games: games['WScore'] - games['LScore'])
)

print(df_season_results.shape)
df_season_results.head()

(5602, 7)


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,ScoreGap
176080,2023,7,1101,65,1238,56,9
176081,2023,7,1103,81,1355,80,1
176082,2023,7,1104,75,1255,54,21
176083,2023,7,1112,117,1311,75,42
176084,2023,7,1113,62,1470,59,3


In [5]:
# Games won per team: non-null DayNum entries where the team is the winner.
num_win = (
    df_season_results
    .groupby(['Season', 'WTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={"DayNum": "NumWins", "WTeamID": "TeamID"})
)

# Games lost per team: same count, keyed on the losing side.
num_loss = (
    df_season_results
    .groupby(['Season', 'LTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={"DayNum": "NumLosses", "LTeamID": "TeamID"})
)

# Mean score gap over the games each team won.
gap_win = (
    df_season_results
    .groupby(['Season', 'WTeamID'])['ScoreGap']
    .mean()
    .reset_index()
    .rename(columns={"ScoreGap": "GapWins", "WTeamID": "TeamID"})
)

# Mean score gap over the games each team lost.
gap_loss = (
    df_season_results
    .groupby(['Season', 'LTeamID'])['ScoreGap']
    .mean()
    .reset_index()
    .rename(columns={"ScoreGap": "GapLosses", "LTeamID": "TeamID"})
)

print(gap_loss.shape)
gap_loss.head()

(363, 3)


Unnamed: 0,Season,TeamID,GapLosses
0,2023,1101,11.647059
1,2023,1102,10.055556
2,2023,1103,11.454545
3,2023,1104,12.8
4,2023,1105,11.388889


In [6]:
# Regular-season detailed box scores for the selected season.
df_season_stats = pd.read_csv(DATA_PATH + 'MRegularSeasonDetailedResults.csv')
# Filter, drop and copy without inplace mutation so the column assignments
# below act on an independent frame (avoids SettingWithCopyWarning).
df_season_stats = (
    df_season_stats[df_season_stats.Season == year]
    .drop(columns=['NumOT', 'WLoc'])
    .copy()
)

# Advanced metrics, computed with the same formulas for the winning ('W') and
# the losing ('L') side of every game; `opp` is the opposing side's prefix.
for side, opp in (('W', 'L'), ('L', 'W')):
    # possessions: 0.96 * (FGA - OR + TO + 0.475 * FTA)
    possessions = 0.96 * (df_season_stats[side + 'FGA'] - df_season_stats[side + 'OR']
                          + df_season_stats[side + 'TO'] + (0.475 * df_season_stats[side + 'FTA']))
    df_season_stats[side + 'Poss'] = possessions
    # own points per 100 of this side's possessions
    df_season_stats[side + 'OffEff'] = df_season_stats[side + 'Score'] / possessions * 100
    # opponent points per 100 of this side's possessions
    df_season_stats[side + 'DefEff'] = df_season_stats[opp + 'Score'] / possessions * 100
    # (FGM + 0.5 * FGM3) / FGA
    df_season_stats[side + 'EFT'] = (
        (df_season_stats[side + 'FGM'] + 0.5 * df_season_stats[side + 'FGM3'])
        / df_season_stats[side + 'FGA']
    )

print(df_season_stats.shape)
print(df_season_stats.columns)
df_season_stats.head()

(5602, 40)
Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WFGM',
       'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO',
       'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'WPoss', 'WOffEff',
       'WDefEff', 'WEFT', 'LPoss', 'LOffEff', 'LDefEff', 'LEFT'],
      dtype='object')


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WFGM,WFGA,WFGM3,WFGA3,...,LBlk,LPF,WPoss,WOffEff,WDefEff,WEFT,LPoss,LOffEff,LDefEff,LEFT
102032,2023,7,1101,65,1238,56,23,57,8,28,...,1,21,74.4,87.365591,75.268817,0.473684,72.624,77.109495,89.502093,0.418182
102033,2023,7,1103,81,1355,80,30,69,11,31,...,4,15,72.672,111.459709,110.083664,0.514493,71.112,112.498594,113.904826,0.559322
102034,2023,7,1104,75,1255,54,27,69,3,28,...,3,22,78.096,96.035648,69.145667,0.413043,73.944,73.028238,101.428108,0.302817
102035,2023,7,1112,117,1311,75,38,53,11,18,...,1,29,85.44,136.938202,87.780899,0.820755,86.832,86.373687,134.742952,0.451389
102036,2023,7,1113,62,1470,59,21,62,6,24,...,4,27,79.464,78.022752,74.247458,0.387097,78.912,74.766829,78.568532,0.363636


In [7]:
# average advanced metrics over the games each team won
win_metrics = (
    df_season_stats
    .groupby(['Season', 'WTeamID'])[['WOffEff', 'WDefEff', 'WEFT']]
    .mean()
    .reset_index()
)

# average advanced metrics over the games each team lost
loss_metrics = (
    df_season_stats
    .groupby(['Season', 'LTeamID'])[['LOffEff', 'LDefEff', 'LEFT']]
    .mean()
    .reset_index()
)

# Both sides are renamed to a common TeamID key *before* the outer merge.
# Merging on left_on='WTeamID' / right_on='LTeamID' keeps two key columns, so
# a team that only appears on the losing side would get WTeamID = NaN and,
# once LTeamID is dropped and NaNs are zero-filled, would end up with
# TeamID == 0 and lose its identity.
adv_metrics = (
    win_metrics.rename(columns={'WTeamID': 'TeamID'})
    .merge(loss_metrics.rename(columns={'LTeamID': 'TeamID'}),
           on=['Season', 'TeamID'],
           how='outer')
    # teams with no wins (or no losses) have no metrics for that side
    .fillna(0)
)

print(adv_metrics.shape)
adv_metrics.head()

(363, 8)


Unnamed: 0,Season,TeamID,WOffEff,WDefEff,WEFT,LOffEff,LDefEff,LEFT
0,2023,1101,118.156586,100.892684,0.565365,96.588495,113.56227,0.459465
1,2023,1102,123.268519,102.372846,0.598361,97.950638,114.994466,0.501168
2,2023,1103,119.923949,96.765685,0.558467,97.141591,114.51155,0.450838
3,2023,1104,117.045742,91.166752,0.539657,95.564555,113.360512,0.46556
4,2023,1105,107.541249,93.299494,0.541139,94.803071,111.081212,0.455577


In [8]:
df_ranks = pd.read_csv(DATA_PATH + 'MMasseyOrdinals_2023_133_only_61systems.csv.xls')
df_ranks = df_ranks[df_ranks.Season == year]

# keep only the rankings with RankingDayNum == 133
df_ranks = df_ranks[df_ranks.RankingDayNum == 133].drop(['RankingDayNum'], axis=1)

# Create a DataFrame for the 5 ranking systems with the most data
top5ranks = ['MOR','SAG','WLK','POM','DOL']

df_ranks_clean = (
    df_ranks[df_ranks.SystemName == 'MOR']
    .drop(['SystemName'], axis=1)
    .rename(columns={'OrdinalRank': 'MOR'})
)
print(df_ranks_clean.shape)

# One column per ranking system, outer-joined so a team missing from one
# system is still kept.
for system_name in top5ranks[1:]:
    system_ranks = (
        df_ranks[df_ranks.SystemName == system_name]
        .drop(['SystemName'], axis=1)
        .rename(columns={'OrdinalRank': system_name})
    )
    df_ranks_clean = pd.merge(df_ranks_clean, system_ranks, on=['Season', 'TeamID'], how='outer')

# Fill each system's gaps with that system's own worst (max) rank.
# A scalar fillna inside the loop would also overwrite NaNs in the columns of
# previously merged systems with the current system's max.
df_ranks_clean = df_ranks_clean.fillna(df_ranks_clean[top5ranks].max())

# Only use average rank since you do not gain much from including multiple rankings
df_avg_rank = df_ranks_clean.drop(top5ranks, axis=1)
df_avg_rank['AvgRank'] = df_ranks_clean.drop(['Season', 'TeamID'], axis=1).mean(axis=1)

print(df_avg_rank.shape)
df_avg_rank.head()

(363, 3)
(363, 3)


Unnamed: 0,Season,TeamID,AvgRank
0,2023,1101,209.4
1,2023,1102,159.0
2,2023,1103,106.6
3,2023,1104,1.6
4,2023,1105,319.4


In [9]:
# Season, single value data
df_features_season_w = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()[['Season', 'WTeamID']].rename(columns={"WTeamID": "TeamID"})
df_features_season_l = df_season_results.groupby(['Season', 'LTeamID']).count().reset_index()[['Season', 'LTeamID']].rename(columns={"LTeamID": "TeamID"})

df_features_season = pd.concat([df_features_season_w, df_features_season_l], axis=0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop=True)

df_features_season = pd.merge(
    df_features_season, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'TeamID'], 
    right_on=['Season', 'TeamID']
).drop(['Seed'], axis=1).rename(columns={'SeedNumeric':'Seed'})

df_features_season.fillna(17, inplace=True) # add max+1 for no tournament seeds

df_features_season = df_features_season.merge(num_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(num_loss, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(gap_win, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(gap_loss, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(adv_metrics, on=['Season', 'TeamID'], how='left')

df_features_season.fillna(0, inplace=True) # add zeros where teams won or lost 0 games

df_features_season['OffEff'] = ((df_features_season['WOffEff'] * df_features_season['NumWins'] + 
                                 df_features_season['LOffEff'] * df_features_season['NumLosses']) 
                                / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season['DefEff'] = ((df_features_season['WDefEff'] * df_features_season['NumWins'] + 
                                 df_features_season['LDefEff'] * df_features_season['NumLosses']) 
                                / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season['EFT'] = ((df_features_season['WEFT'] * df_features_season['NumWins'] + 
                                 df_features_season['LEFT'] * df_features_season['NumLosses']) 
                                / (df_features_season['NumWins'] + df_features_season['NumLosses']))

df_features_season['WinRatio'] = df_features_season['NumWins'] / (df_features_season['NumWins'] + df_features_season['NumLosses'])
df_features_season['GapAvg'] = (
    (df_features_season['NumWins'] * df_features_season['GapWins'] - 
    df_features_season['NumLosses'] * df_features_season['GapLosses'])
    / (df_features_season['NumWins'] + df_features_season['NumLosses'])
)

df_features_season.drop(['NumWins', 'NumLosses', 'GapWins', 'GapLosses',
                         'WOffEff', 'LOffEff', 'WDefEff', 'LDefEff', 'WEFT', 'LEFT'], axis=1, inplace=True)
 
df_features_season = df_features_season.merge(df_avg_rank, on=['Season', 'TeamID'], how='left')

print(df_features_season.shape)
df_features_season.head()

(363, 9)


Unnamed: 0,Season,TeamID,Seed,OffEff,DefEff,EFT,WinRatio,GapAvg,AvgRank
0,2023,1101,17.0,104.054373,109.176644,0.496123,0.346154,-3.692308,209.4
1,2023,1102,17.0,109.027211,109.472507,0.54369,0.4375,-0.125,159.0
2,2023,1103,17.0,111.839886,103.062605,0.520276,0.645161,5.83871,106.6
3,2023,1104,1.0,113.886744,94.430541,0.52876,0.852941,13.676471,1.6
4,2023,1105,17.0,99.898342,103.968525,0.489802,0.4,-3.066667,319.4


In [10]:
# Write the per-team feature table under DATA_PATH, without the row index.
df_features_season.to_csv(DATA_PATH + 'Mtesting_features_2023.csv', index=False)