# Men's Tournament Results

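This notebook gathers each team's finishes in previous NCAA men's tournaments. For every season and team it produces the previous year's tournament result and the average result over the previous four years.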

In [1]:
import pandas as pd

pd.set_option('display.max_columns', 100)

# Load the detailed per-game tournament results.
# Forward slashes resolve on Windows, macOS and Linux alike; a backslash-separated
# relative path is only understood on Windows.
df = pd.read_csv('../data/unprocessed/kaggle/MNCAATourneyDetailedResults.csv')

# Only seasons 2012 and later are used downstream, but the four seasons before
# 2012 are kept as well so that "previous 4 years" features can be computed.
df = df.loc[df['Season'] >= 2008, :].reset_index(drop=True)

df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2008,134,1291,69,1164,60,N,0,19,55,5,20,26,32,13,28,10,11,7,3,16,25,57,4,17,6,13,8,26,9,11,2,6,22
1,2008,136,1181,71,1125,70,N,0,25,58,6,21,15,21,13,27,9,15,11,2,17,26,59,8,23,10,15,9,24,12,14,7,2,21
2,2008,136,1242,85,1340,61,N,0,33,61,12,25,7,15,15,23,21,11,10,3,15,21,55,9,25,10,14,13,18,11,16,8,3,18
3,2008,136,1243,80,1425,67,N,0,29,60,7,16,15,26,21,23,15,13,7,1,21,21,50,6,12,19,27,9,18,12,11,6,4,24
4,2008,136,1266,74,1246,66,N,0,23,52,5,13,23,29,15,19,10,7,5,3,18,23,48,8,20,12,17,9,17,13,12,4,5,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,2023,146,1274,88,1400,81,N,0,29,49,2,8,28,32,4,19,11,12,9,1,14,30,60,10,25,11,15,9,14,20,13,6,1,23
991,2023,146,1361,57,1166,56,N,0,25,66,3,13,4,6,12,23,6,7,1,3,11,22,55,2,17,10,11,8,24,9,9,3,2,11
992,2023,152,1163,72,1274,59,N,0,28,57,9,26,7,13,13,27,19,14,5,5,11,20,62,7,20,12,12,12,17,10,9,8,1,12
993,2023,152,1361,72,1194,71,N,0,25,57,9,18,13,22,12,23,8,8,3,2,17,23,52,9,22,16,21,7,24,6,9,6,2,17


In [2]:
# Load the tournament seeds (forward-slash path so the cell also runs outside Windows).
df_seeds = pd.read_csv('../data/unprocessed/kaggle/MNCAATourneySeeds.csv')

df_seeds = df_seeds.loc[df_seeds['Season'] >= 2008, :].reset_index(drop=True)

# Split the raw 'Seed' string into its parts: the first character is taken as the
# region, the run of digits as the numeric seed, and a trailing 'a'/'b' marks a
# team that had to play a play-in game.
df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])
# Raw string avoids the invalid '\d' escape warning; expand=False returns a Series
# (one capture group) instead of a one-column DataFrame.
df_seeds['Seed'] = df_seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)

# Region letter + zero-padded seed number, e.g. 'W01'.
df_seeds.insert(1, 'Region Seed', df_seeds['Region'] + df_seeds['Seed'].astype(str).str.zfill(2))

df_seeds

Unnamed: 0,Season,Region Seed,Seed,Region,Play In,TeamID
0,2008,W01,1,W,False,1314
1,2008,W02,2,W,False,1397
2,2008,W03,3,W,False,1257
3,2008,W04,4,W,False,1450
4,2008,W05,5,W,False,1323
...,...,...,...,...,...,...
1006,2023,Z12,12,Z,False,1433
1007,2023,Z13,13,Z,False,1233
1008,2023,Z14,14,Z,False,1213
1009,2023,Z15,15,Z,False,1421


In [3]:
# Number of tournament games each team won in each season (one row per
# Season/WTeamID pair that appears as a winner at least once).
df_games_won = (
    df
    .groupby(['Season', 'WTeamID'])
    .size()
    .reset_index(name='Games Won')
)

df_games_won

Unnamed: 0,Season,WTeamID,Games Won
0,2008,1116,1
1,2008,1139,1
2,2008,1172,3
3,2008,1181,1
4,2008,1207,1
...,...,...,...
514,2023,1395,1
515,2023,1397,2
516,2023,1400,3
517,2023,1417,2


Subtract one win from teams that played a play-in game, so that `Games Won` only counts wins after the play-in round.

In [4]:
# Attach each winning team's play-in flag and discount the play-in win.
# The guard makes the cell safe to re-run: without it a second run would merge a
# duplicate 'Play In' column (suffixed _x/_y) and then fail on the lookup below.
if 'Play In' not in df_games_won.columns:
    df_games_won = pd.merge(
        df_games_won,
        df_seeds[['Season', 'TeamID', 'Play In']].rename(columns={'TeamID': 'WTeamID'}),
        how='left',
        on=['Season', 'WTeamID'],
    )

    # A winner with no matching seed row comes out of the left merge as NaN;
    # treat it as "not a play-in team" so the column stays strictly boolean.
    df_games_won['Play In'] = df_games_won['Play In'].eq(True)

    # Explicit int cast instead of relying on bool arithmetic: subtracts 1 for
    # play-in teams and 0 for everyone else.
    df_games_won['Games Won'] -= df_games_won['Play In'].astype(int)

df_games_won

Unnamed: 0,Season,WTeamID,Games Won,Play In
0,2008,1116,1,False
1,2008,1139,1,False
2,2008,1172,3,False
3,2008,1181,1,False
4,2008,1207,1,False
...,...,...,...,...
514,2023,1395,1,False
515,2023,1397,2,False
516,2023,1400,3,False
517,2023,1417,2,False


In [5]:
# Number of tournament games each team lost in each season.
# Play-in games are kept here, so a team eliminated in a play-in still shows up.
df_games_lost = (
    df  # keep play-in games
    .groupby(['Season', 'LTeamID'])
    .size()
    .to_frame('Games Lost')
    .reset_index()
)

df_games_lost

Unnamed: 0,Season,LTeamID,Games Lost
0,2008,1110,1
1,2008,1112,1
2,2008,1116,1
3,2008,1122,1
4,2008,1124,1
...,...,...,...
990,2023,1433,1
991,2023,1436,1
992,2023,1438,1
993,2023,1452,1


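Get the result for every team in every season: the number of games won (not counting a play-in win), `0` if the team played in the tournament but won no such game, and `-1` if the team did not make the tournament.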

In [6]:
# Load the team lookup table (TeamID -> TeamName plus first/last D1 season).
# Forward slashes keep the relative path valid on every OS, not only Windows.
df_teams = pd.read_csv('../data/unprocessed/kaggle/MTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2024
1,1102,Air Force,1985,2024
2,1103,Akron,1985,2024
3,1104,Alabama,1985,2024
4,1105,Alabama A&M,2000,2024
...,...,...,...,...
373,1474,Queens NC,2023,2024
374,1475,Southern Indiana,2023,2024
375,1476,Stonehill,2023,2024
376,1477,TX A&M Commerce,2023,2024


In [7]:
# Every (season, team) combination for seasons 2008 through 2023, season-major:
# all teams for 2008 first, then all teams for 2009, and so on.
season_teams = []
for s in range(2008, 2024):
    season_teams.extend((s, t) for t in df_teams['TeamID'].unique())

len(season_teams)

6048

In [8]:
# tqdm.auto picks the same notebook-aware progress bar as tqdm.autonotebook
# but without emitting TqdmExperimentalWarning on import.
from tqdm.auto import tqdm

# Build the lookups once instead of re-filtering the DataFrames with boolean
# masks for each of the (season, team) pairs.
#   games_won_by_key: (season, team) -> adjusted number of games won
#   lost_keys:        (season, team) pairs that lost at least one tournament game
games_won_by_key = dict(zip(
    zip(df_games_won['Season'], df_games_won['WTeamID']),
    df_games_won['Games Won'],
))
lost_keys = set(zip(df_games_lost['Season'], df_games_lost['LTeamID']))

season_team_results = []

for season, team in tqdm(season_teams):
    # Plain ints so the key hashes/compares the same regardless of numpy scalar types.
    key = (int(season), int(team))

    if key in games_won_by_key:
        # team won at least one tournament game
        result = games_won_by_key[key]
    elif key in lost_keys:
        # team did not win non-play-in game
        result = 0
    else:
        # team did not make tournament
        result = -1

    season_team_results.append((season, team, result))

len(season_team_results)

  from tqdm.autonotebook import tqdm


  0%|          | 0/6048 [00:00<?, ?it/s]

6048

In [9]:
# One row per (season, team) with that season's tournament result.
df_tournament_results = pd.DataFrame(data=season_team_results, columns=['Season', 'TeamID', 'Result'])

# TeamID -> TeamName lookup, used to add a readable name next to the id.
id_to_name = df_teams.set_index('TeamID')['TeamName'].to_dict()

team_names = df_tournament_results['TeamID'].map(id_to_name)
team_column_position = list(df_tournament_results.columns).index('TeamID') + 1  # directly after 'TeamID'
df_tournament_results.insert(team_column_position, 'Team', team_names)

df_tournament_results

Unnamed: 0,Season,TeamID,Team,Result
0,2008,1101,Abilene Chr,-1
1,2008,1102,Air Force,-1
2,2008,1103,Akron,-1
3,2008,1104,Alabama,-1
4,2008,1105,Alabama A&M,-1
...,...,...,...,...
6043,2023,1474,Queens NC,-1
6044,2023,1475,Southern Indiana,-1
6045,2023,1476,Stonehill,-1
6046,2023,1477,TX A&M Commerce,-1


Set the 2020 results to NaN because that year's tournament was cancelled.

In [10]:
import numpy as np

# 'Result' is built from integers, so cast it to float explicitly before storing
# NaN. Writing NaN straight into an integer column relies on pandas silently
# upcasting the column, which is deprecated in recent pandas versions.
df_tournament_results['Result'] = df_tournament_results['Result'].astype(float)

# No tournament result exists for 2020; mark every team's result as missing.
df_tournament_results.loc[df_tournament_results['Season'] == 2020, 'Result'] = np.nan

In [11]:
# Per-team rolling mean of 'Result' over the current row and up to three
# preceding rows; NaN rows are skipped and a single valid value is enough.
rolling_mean_by_team = (
    df_tournament_results
    .groupby('TeamID')['Result']
    .rolling(window=4, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)  # drop the TeamID level, keeping the original row labels
)

df_tournament_results = (
    df_tournament_results
    .assign(**{'Past 4 Years Tournament Results': rolling_mean_by_team})
    .rename(columns={'Result': 'Past Year Tournament Result'})
    # shift by a year so results are from past instead of the current tourney
    .assign(Season=lambda frame: frame['Season'] + 1)
    .loc[lambda frame: frame['Season'] >= 2012, :]
    .reset_index(drop=True)
)

df_tournament_results

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results
0,2012,1101,Abilene Chr,-1.0,-1.0
1,2012,1102,Air Force,-1.0,-1.0
2,2012,1103,Akron,0.0,-0.5
3,2012,1104,Alabama,-1.0,-1.0
4,2012,1105,Alabama A&M,-1.0,-1.0
...,...,...,...,...,...
4909,2024,1474,Queens NC,-1.0,-1.0
4910,2024,1475,Southern Indiana,-1.0,-1.0
4911,2024,1476,Stonehill,-1.0,-1.0
4912,2024,1477,TX A&M Commerce,-1.0,-1.0


In [13]:
# Write the per-season, per-team features to disk without the row index.
output_path = '../data/preprocessed/kaggle/tournament_results.csv'
df_tournament_results.to_csv(output_path, index=False)

'Done'

'Done'