# Women's Tournament Results

This notebook gathers each team's tournament finishes from previous years: the prior year's result and a rolling average over the prior four years.

In [1]:
import pandas as pd

# Show up to 100 columns so the wide box-score frame is not truncated on display.
pd.set_option('display.max_columns', 100)

# Per-game tournament box scores (one row per game, winner/loser columns).
# Forward slashes resolve on Windows, macOS and Linux alike, and match the
# path style used when the results are written out at the end of the notebook.
df = pd.read_csv('../data/unprocessed/kaggle/WNCAATourneyDetailedResults.csv')

# Features are only produced for 2012 onward, but each one looks back up to
# four seasons, so keep everything from 2008.
# NOTE(review): the displayed output starts at 2010, so this file appears to
# contain no 2008-2009 games — confirm before relying on those seasons.
df = df.loc[df['Season'] >= 2008, :].reset_index(drop=True)

df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2010,138,3124,69,3201,55,N,0,28,57,1,5,12,19,13,24,22,12,6,2,12,21,61,10,34,3,5,17,19,12,18,4,1,18
1,2010,138,3173,67,3395,66,N,0,23,59,9,26,12,19,13,34,13,16,3,10,14,22,73,8,27,14,15,18,26,8,8,8,6,22
2,2010,138,3181,72,3214,37,H,0,26,57,4,13,16,22,13,34,15,11,10,7,11,15,56,4,15,3,8,10,21,4,16,6,4,20
3,2010,138,3199,75,3256,61,H,0,25,63,3,15,22,26,20,27,13,17,8,3,21,21,62,2,20,17,22,16,21,13,16,5,4,24
4,2010,138,3207,62,3265,42,N,0,24,68,8,25,6,8,20,29,16,8,5,5,18,13,60,5,26,11,17,16,22,9,10,3,4,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,2023,147,3376,86,3268,75,H,0,32,70,6,15,16,26,22,23,19,12,4,6,12,29,58,7,14,10,15,6,17,15,12,7,9,26
823,2023,147,3439,84,3326,74,N,0,26,54,8,26,24,28,4,24,10,12,3,4,17,26,57,7,20,15,17,4,17,11,10,6,1,25
824,2023,151,3234,77,3376,73,N,0,28,57,7,23,14,14,3,19,14,10,8,1,18,30,77,4,20,9,13,24,22,9,15,6,5,20
825,2023,151,3261,79,3439,72,N,0,33,70,3,13,10,16,15,19,8,7,11,2,15,23,57,9,31,17,18,13,23,8,15,7,8,15


In [2]:
# Tournament seeds: one row per (Season, TeamID) with a raw seed code whose
# first character is the region, followed by the numeric seed and, for
# play-in teams, a trailing 'a' or 'b'.
df_seeds = pd.read_csv('../data/unprocessed/kaggle/WNCAATourneySeeds.csv')

df_seeds = df_seeds.loc[df_seeds['Season'] >= 2008, :].reset_index(drop=True)

# Derive the flags from the raw seed code BEFORE it is overwritten below.
df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])

# Raw string keeps `\d` a regex token instead of an invalid string escape;
# expand=False returns a Series (not a one-column DataFrame) for the assignment.
df_seeds['Seed'] = df_seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)

# Region letter plus zero-padded numeric seed, e.g. 'W01'.
df_seeds.insert(1, 'Region Seed', df_seeds['Region'] + df_seeds['Seed'].astype(str).str.zfill(2))

df_seeds

Unnamed: 0,Season,Region Seed,Seed,Region,Play In,TeamID
0,2008,W01,1,W,False,3163
1,2008,W02,2,W,False,3353
2,2008,W03,3,W,False,3143
3,2008,W04,4,W,False,3438
4,2008,W05,5,W,False,3330
...,...,...,...,...,...,...
963,2023,Z12,12,Z,False,3405
964,2023,Z13,13,Z,False,3387
965,2023,Z14,14,Z,False,3241
966,2023,Z15,15,Z,False,3436


In [3]:
# Count tournament wins per team per season: every row of `df` is one game,
# so the number of rows sharing a (Season, WTeamID) pair is that team's wins.
wins_per_season_team = df.groupby(['Season', 'WTeamID'])['WTeamID'].count()

# Flatten the grouped counts back into a regular frame with a named count column.
df_games_won = wins_per_season_team.reset_index(name='Games Won')

df_games_won

Unnamed: 0,Season,WTeamID,Games Won
0,2010,3114,1
1,2010,3124,4
2,2010,3163,6
3,2010,3173,1
4,2010,3181,3
...,...,...,...
418,2023,3405,1
419,2023,3417,2
420,2023,3428,2
421,2023,3437,2


Remove 1 win from teams that played a play-in game, so that the play-in win does not count toward their tournament result.

In [4]:
# Look up each winning team's play-in flag from the seed table; the seed
# table keys teams by 'TeamID', so rename it to line up with the wins table.
play_in_flags = (
    df_seeds[['Season', 'TeamID', 'Play In']]
    .rename(columns={'TeamID': 'WTeamID'})
)

df_games_won = df_games_won.merge(
    play_in_flags,
    how='left',
    on=['Season', 'WTeamID'],
)

# A boolean subtracts as 0/1, so flagged play-in teams lose exactly one win.
df_games_won['Games Won'] = df_games_won['Games Won'] - df_games_won['Play In']

df_games_won

Unnamed: 0,Season,WTeamID,Games Won,Play In
0,2010,3114,1,False
1,2010,3124,4,False
2,2010,3163,6,False
3,2010,3173,1,False
4,2010,3181,3,False
...,...,...,...,...
418,2023,3405,1,False
419,2023,3417,2,False
420,2023,3428,2,False
421,2023,3437,2,False


In [5]:
# Count tournament losses per team per season. Play-in games are kept here on
# purpose: losing one still means the team appeared in the tournament.
losses_per_season_team = df.groupby(['Season', 'LTeamID'])['LTeamID'].count()

# Flatten the grouped counts back into a regular frame with a named count column.
df_games_lost = losses_per_season_team.reset_index(name='Games Lost')

df_games_lost

Unnamed: 0,Season,LTeamID,Games Lost
0,2010,3114,1
1,2010,3122,1
2,2010,3124,1
3,2010,3132,1
4,2010,3151,1
...,...,...,...
822,2023,3436,1
823,2023,3437,1
824,2023,3439,1
825,2023,3450,1


Get results per year for every team

In [6]:
# Team lookup table (TeamID -> TeamName). Forward slashes keep the path
# portable across operating systems and consistent with the output path used
# when the results are saved.
df_teams = pd.read_csv('../data/unprocessed/kaggle/WTeams.csv')

df_teams

Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M
...,...,...
371,3474,Queens NC
372,3475,Southern Indiana
373,3476,Stonehill
374,3477,TX A&M Commerce


In [7]:
# Full grid of (season, team) pairs: every known team in every season from
# 2008 through 2023, ordered season-major so each team's rows stay in
# chronological order.
seasons = range(2008, 2024)
team_ids = df_teams['TeamID'].unique()

season_teams = [
    (season_value, team_id)
    for season_value in seasons
    for team_id in team_ids
]

len(season_teams)

6016

In [8]:
# tqdm.auto picks the same notebook/console progress bar as tqdm.autonotebook
# but without emitting the experimental-mode warning.
from tqdm.auto import tqdm

# Build the lookups once instead of re-filtering both frames for every
# (season, team) pair. Keeping only the first row per key mirrors the
# previous `.iloc[0]` behavior should a key ever appear more than once.
first_win_rows = df_games_won.drop_duplicates(subset=['Season', 'WTeamID'])
wins_by_season_team = dict(zip(
    zip(first_win_rows['Season'], first_win_rows['WTeamID']),
    first_win_rows['Games Won'],
))
losing_season_teams = set(zip(df_games_lost['Season'], df_games_lost['LTeamID']))

# One (season, team, result) tuple per pair, where result is:
#   >= 0  games won after the play-in adjustment,
#      0  appeared in the tournament but has no entry in the wins table,
#     -1  no tournament game on record for that season.
season_team_results = []

for season, team in tqdm(season_teams):
    key = (season, team)
    if key in wins_by_season_team:
        # team won at least one tournament game
        season_team_results.append((season, team, wins_by_season_team[key]))
    elif key in losing_season_teams:
        # team did not win non-play-in game
        season_team_results.append((season, team, 0))
    else:
        # team did not make tournament
        season_team_results.append((season, team, -1))

len(season_team_results)

  from tqdm.autonotebook import tqdm


  0%|          | 0/6016 [00:00<?, ?it/s]

6016

In [9]:
# Turn the collected (season, team, result) tuples into a frame.
result_columns = ['Season', 'TeamID', 'Result']
df_tournament_results = pd.DataFrame(season_team_results, columns=result_columns)

# TeamID -> TeamName lookup used to label each row.
id_to_name = {
    team_id: team_name
    for team_id, team_name in zip(df_teams['TeamID'], df_teams['TeamName'])
}

# Place the readable team name directly to the right of its id.
team_names = df_tournament_results['TeamID'].map(id_to_name)
name_position = df_tournament_results.columns.get_loc('TeamID') + 1
df_tournament_results.insert(name_position, 'Team', team_names)

df_tournament_results

Unnamed: 0,Season,TeamID,Team,Result
0,2008,3101,Abilene Chr,-1
1,2008,3102,Air Force,-1
2,2008,3103,Akron,-1
3,2008,3104,Alabama,-1
4,2008,3105,Alabama A&M,-1
...,...,...,...,...
6011,2023,3474,Queens NC,-1
6012,2023,3475,Southern Indiana,-1
6013,2023,3476,Stonehill,-1
6014,2023,3477,TX A&M Commerce,-1


Set 2020 to NaN due to tournament cancellation

In [10]:
import numpy as np

# 'Result' is built from integers; cast it to float explicitly so it can hold
# NaN without relying on pandas' deprecated silent upcasting on assignment.
df_tournament_results['Result'] = df_tournament_results['Result'].astype(float)

# Blank out seasons that carry no real information:
#   * 2020, when the tournament was cancelled;
#   * any season with no games at all in the detailed-results frame `df`.
#     Those would otherwise mark every team as -1 ("did not make tournament")
#     and drag down the rolling 4-year average computed afterwards.
seasons_with_games = df['Season'].unique()
no_result_mask = (
    (df_tournament_results['Season'] == 2020)
    | ~df_tournament_results['Season'].isin(seasons_with_games)
)

df_tournament_results.loc[no_result_mask, 'Result'] = np.nan

In [11]:
# Per-team rolling mean of 'Result' over the current row and up to three
# preceding rows (rows are in season order within each team). min_periods=1
# lets the mean be computed from whatever non-missing values are available.
rolling_mean_by_team = (
    df_tournament_results
    .groupby(['TeamID'])['Result']
    .rolling(window=4, min_periods=1)
    .mean()
    # Drop the TeamID level so the values align with the frame's own row index.
    .reset_index(level=0, drop=True)
)
df_tournament_results['Past 4 Years Tournament Results'] = rolling_mean_by_team

df_tournament_results = df_tournament_results.rename(
    columns={'Result': 'Past Year Tournament Result'}
)

# Shift every row forward one season so that a row labelled season N carries
# results from season N-1 and earlier, never from season N's own tournament.
df_tournament_results['Season'] = df_tournament_results['Season'] + 1

# Only seasons from 2012 onward are used downstream.
is_kept_season = df_tournament_results['Season'] >= 2012
df_tournament_results = df_tournament_results[is_kept_season].reset_index(drop=True)

df_tournament_results

Unnamed: 0,Season,TeamID,Team,Past Year Tournament Result,Past 4 Years Tournament Results
0,2012,3101,Abilene Chr,-1.0,-1.0
1,2012,3102,Air Force,-1.0,-1.0
2,2012,3103,Akron,-1.0,-1.0
3,2012,3104,Alabama,-1.0,-1.0
4,2012,3105,Alabama A&M,-1.0,-1.0
...,...,...,...,...,...
4883,2024,3474,Queens NC,-1.0,-1.0
4884,2024,3475,Southern Indiana,-1.0,-1.0
4885,2024,3476,Stonehill,-1.0,-1.0
4886,2024,3477,TX A&M Commerce,-1.0,-1.0


In [13]:
# Persist the per-season, per-team features; the row index is positional only,
# so it is left out of the file.
output_path = '../data/preprocessed/kaggle/womens_tournament_results.csv'
df_tournament_results.to_csv(output_path, index=False)

# Displayed as the cell's output to signal that the export finished.
'Done'

'Done'