# Get Yahoo Public Picks

Used to compare my picks with general public concensus. Assumes yahoo data has been exported to Excel. 

In [1]:
season = 2024

season

2024

In [3]:
import pandas as pd

pd.set_option('display.max_columns', 100)

df = pd.concat(
    [
        pd.read_excel(fr'..\data\unprocessed\womens_yahoo\{season}_womens_yahoo_picks.xlsx', sheet_name=f'round_{round_}')
        .assign(Round=round_)
        for round_ in range(1, 7)
    ],
    ignore_index=True,
)

df['Seed'] = df['Team (Seed)'].str.extract(r'\((\d+)\)').astype(int)
df['Team (Seed)'] = df['Team (Seed)'].str.replace(r'\(\d+\)', '', regex=True)

df.rename(columns={'Team (Seed)': 'Team'}, inplace=True)

df = df.pivot(index=['Team', 'Seed'], columns=['Round'], values='% Picked').reset_index()
df.columns = ['Team', 'Seed', 'Round 1', 'Round 2', 'Round 3', 'Round 4', 'Round 5', 'Round 6']

df

Unnamed: 0,Team,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,AUB/ARIZ,11,0.1524,0.0182,0.0066,0.0027,0.0011,0.0005
1,Alabama,8,0.5112,0.0351,0.0151,0.0058,0.0023,0.0010
2,Baylor,5,0.9412,0.5772,0.1278,0.0305,0.0067,0.0022
3,California Baptist,15,0.0320,0.0136,0.0049,0.0019,0.0008,0.0004
4,Chattanooga,14,0.0729,0.0226,0.0071,0.0027,0.0011,0.0006
...,...,...,...,...,...,...,...,...
59,USC,1,0.9674,0.8767,0.7212,0.3971,0.0898,0.0303
60,Utah,5,0.8490,0.3177,0.0555,0.0145,0.0039,0.0015
61,Vanderbilt,12,0.0829,0.0248,0.0076,0.0033,0.0014,0.0008
62,Virginia Tech,4,0.8706,0.3668,0.0691,0.0228,0.0065,0.0034


Fix play-ins

In [4]:
df.loc[df['Team'].str.contains('/', regex=False), :]

Unnamed: 0,Team,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,AUB/ARIZ,11,0.1524,0.0182,0.0066,0.0027,0.0011,0.0005
17,HC/UTM,16,0.0167,0.0078,0.0039,0.002,0.0009,0.0003


In [5]:
df.loc[df['Team'] == 'AUB/ARIZ', 'Team'] = 'Arizona'
df.loc[df['Team'] == 'HC/UTM', 'Team'] = 'UT Martin'

df.loc[df['Team'].str.contains('/', regex=False), :]

Unnamed: 0,Team,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6


Map with Kaggle data

In [9]:
df_seeds = pd.read_csv(r'..\data\unprocessed\kaggle\WNCAATourneySeeds.csv')

df_seeds = df_seeds.loc[df_seeds['Season'] == season, :].reset_index(drop=True)

df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])
df_seeds['Seed'] = df_seeds['Seed'].str.extract('(\d+)').astype(int)

df_seeds

Unnamed: 0,Season,Seed,Region,Play In,TeamID
0,2024,1,W,False,3376
1,2024,2,W,False,3323
2,2024,3,W,False,3333
3,2024,4,W,False,3231
4,2024,5,W,False,3328
...,...,...,...,...,...
63,2024,12,Z,True,3435
64,2024,13,Z,False,3267
65,2024,14,Z,False,3238
66,2024,15,Z,False,3263


In [10]:
df_spellings = pd.read_csv(
    r'..\data\unprocessed\kaggle\WTeamSpellings.csv', 
    encoding='cp1252'  # fixes issue with fancy quotes
)

# df_spellings.loc[df_spellings.shape[0]] = ['fdu', 1192]
# df_spellings.loc[df_spellings.shape[0]] = ['sdsu', 1361]
# df_spellings.loc[df_spellings.shape[0]] = ['csu', 1161]

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,3394
1,a&m-corpus christi,3394
2,abilene chr,3101
3,abilene christian,3101
4,abilene-christian,3101
...,...,...
1153,youngstown st,3464
1154,youngstown st.,3464
1155,youngstown state,3464
1156,youngstown-st,3464


In [11]:
df_spellings = pd.merge(
    df_spellings,
    df_seeds[['TeamID', 'Seed']],
    how='inner',
    on=['TeamID']
)

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID,Seed
0,a&m-corpus chris,3394,16
1,a&m-corpus christi,3394,16
2,am-corpus-chris,3394,16
3,tam c,3394,16
4,tam c. christi,3394,16
...,...,...,...
166,vanderbilt,3435,12
167,virginia tech,3439,4
168,virginia-tech,3439,4
169,west virginia,3452,8


In [12]:
from fuzzywuzzy.fuzz import token_sort_ratio
from fuzzywuzzy import process
from tqdm.autonotebook import tqdm

team_spellings = df_spellings['TeamNameSpelling'].unique()
yahoo_teams = df.loc[~df['Team'].str.contains('^playin', regex=True), 'Team'].unique()

df_match = pd.DataFrame(
    [
        [
            yahoo_team,
            *process.extract(
                yahoo_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for yahoo_team in tqdm(yahoo_teams)
    ],
    columns=['Yahoo Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  from tqdm.autonotebook import tqdm


  0%|          | 0/64 [00:00<?, ?it/s]

Unnamed: 0,Yahoo Team,Team Spelling,Match Score
0,N. Carolina,north carolina,83
1,Fla Gulf Coast,fl gulf coast,96
2,Arizona,arizona,100
3,Mississippi,mississippi,100
4,N.C. State,n.c. state,100
5,Nebraska,nebraska,100
6,Norfolk St.,norfolk st,100
7,Notre Dame,notre dame,100
8,Ohio St.,ohio st,100
9,Oklahoma,oklahoma,100


In [13]:
yahoo_to_spelling = dict(zip(df_match['Yahoo Team'], df_match['Team Spelling']))
spelling_to_id = dict(zip(df_spellings['TeamNameSpelling'], df_spellings['TeamID']))

df.insert(1, 'TeamID', df['Team'].map(yahoo_to_spelling).map(spelling_to_id))

df

Unnamed: 0,Team,TeamID,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Arizona,3112,11,0.1524,0.0182,0.0066,0.0027,0.0011,0.0005
1,Alabama,3104,8,0.5112,0.0351,0.0151,0.0058,0.0023,0.0010
2,Baylor,3124,5,0.9412,0.5772,0.1278,0.0305,0.0067,0.0022
3,California Baptist,3465,15,0.0320,0.0136,0.0049,0.0019,0.0008,0.0004
4,Chattanooga,3151,14,0.0729,0.0226,0.0071,0.0027,0.0011,0.0006
...,...,...,...,...,...,...,...,...,...
59,USC,3425,1,0.9674,0.8767,0.7212,0.3971,0.0898,0.0303
60,Utah,3428,5,0.8490,0.3177,0.0555,0.0145,0.0039,0.0015
61,Vanderbilt,3435,12,0.0829,0.0248,0.0076,0.0033,0.0014,0.0008
62,Virginia Tech,3439,4,0.8706,0.3668,0.0691,0.0228,0.0065,0.0034


In [14]:
import numpy as np

id_to_playin = dict(zip(df_seeds['TeamID'], df_seeds['Play In']))
id_to_playin[np.nan] = True

df.insert(3, 'Play In', df['TeamID'].map(id_to_playin))

df

Unnamed: 0,Team,TeamID,Seed,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Arizona,3112,11,True,0.1524,0.0182,0.0066,0.0027,0.0011,0.0005
1,Alabama,3104,8,False,0.5112,0.0351,0.0151,0.0058,0.0023,0.0010
2,Baylor,3124,5,False,0.9412,0.5772,0.1278,0.0305,0.0067,0.0022
3,California Baptist,3465,15,False,0.0320,0.0136,0.0049,0.0019,0.0008,0.0004
4,Chattanooga,3151,14,False,0.0729,0.0226,0.0071,0.0027,0.0011,0.0006
...,...,...,...,...,...,...,...,...,...,...
59,USC,3425,1,False,0.9674,0.8767,0.7212,0.3971,0.0898,0.0303
60,Utah,3428,5,False,0.8490,0.3177,0.0555,0.0145,0.0039,0.0015
61,Vanderbilt,3435,12,True,0.0829,0.0248,0.0076,0.0033,0.0014,0.0008
62,Virginia Tech,3439,4,False,0.8706,0.3668,0.0691,0.0228,0.0065,0.0034


In [15]:
id_to_region = dict(zip(df_seeds['TeamID'], df_seeds['Region']))

df.insert(3, 'Region', df['TeamID'].map(id_to_region))

df

Unnamed: 0,Team,TeamID,Seed,Region,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Arizona,3112,11,Z,True,0.1524,0.0182,0.0066,0.0027,0.0011,0.0005
1,Alabama,3104,8,X,False,0.5112,0.0351,0.0151,0.0058,0.0023,0.0010
2,Baylor,3124,5,Z,False,0.9412,0.5772,0.1278,0.0305,0.0067,0.0022
3,California Baptist,3465,15,Y,False,0.0320,0.0136,0.0049,0.0019,0.0008,0.0004
4,Chattanooga,3151,14,X,False,0.0729,0.0226,0.0071,0.0027,0.0011,0.0006
...,...,...,...,...,...,...,...,...,...,...,...
59,USC,3425,1,Z,False,0.9674,0.8767,0.7212,0.3971,0.0898,0.0303
60,Utah,3428,5,X,False,0.8490,0.3177,0.0555,0.0145,0.0039,0.0015
61,Vanderbilt,3435,12,Z,True,0.0829,0.0248,0.0076,0.0033,0.0014,0.0008
62,Virginia Tech,3439,4,Z,False,0.8706,0.3668,0.0691,0.0228,0.0065,0.0034


In [16]:
df.loc[df['Region'].isna(), :]

Unnamed: 0,Team,TeamID,Seed,Region,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6


Redistribute play-in probabilities to the teams that won

Unfortunately, it is unclear which play-in game the probabilities are from outside of the seeding, so I will split it evenly

In [None]:
# for seed in df.loc[df['Play In'], 'Seed'].unique():
#     df.loc[
#         (~df['Team'].str.contains('^playin', regex=True)) & 
#         (df['Seed'] == seed) &
#         (df['Play In']), 
#         [f'Round {i}' for i in range(1, 7)]
#     ] += df.loc[
#         (df['Team'].str.contains('^playin', regex=True)) & 
#         (df['Seed'] == seed) &
#         (df['Play In']), 
#         [f'Round {i}' for i in range(1, 7)]
#     ].mean(axis=0)

# df = df.loc[
#     ~df['Team'].str.contains('^playin', regex=True), 
#     :
# ].reset_index(drop=True)

# df['TeamID'] = df['TeamID'].astype(int)

# df.loc[df['Play In'], :]

Redistribute percentages to account for rounding inaccuracies

In [None]:
# for i in range(1, 7):
#     df[f'Round {i}'] = df[f'Round {i}'] / df[f'Round {i}'].sum() * 2**(6 - i)

# df

In [17]:
df.sum()

Team       ArizonaAlabamaBaylorCalifornia BaptistChattano...
TeamID                                                211005
Seed                                                     544
Region     ZXZYXYZYYXZWWWXXXYWYXZZYWYYZWZXZWYWWXWXWZWWYWY...
Play In                                                    4
Round 1                                              31.9061
Round 2                                              15.9563
Round 3                                               7.9779
Round 4                                               3.9889
Round 5                                               1.9991
Round 6                                               1.0007
dtype: object

In [18]:
df.insert(df.columns.get_loc('Seed'), 'Region Seed', df['Region'] + df['Seed'].astype(str).str.zfill(2))

df

Unnamed: 0,Team,TeamID,Region Seed,Seed,Region,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Arizona,3112,Z11,11,Z,True,0.1524,0.0182,0.0066,0.0027,0.0011,0.0005
1,Alabama,3104,X08,8,X,False,0.5112,0.0351,0.0151,0.0058,0.0023,0.0010
2,Baylor,3124,Z05,5,Z,False,0.9412,0.5772,0.1278,0.0305,0.0067,0.0022
3,California Baptist,3465,Y15,15,Y,False,0.0320,0.0136,0.0049,0.0019,0.0008,0.0004
4,Chattanooga,3151,X14,14,X,False,0.0729,0.0226,0.0071,0.0027,0.0011,0.0006
...,...,...,...,...,...,...,...,...,...,...,...,...
59,USC,3425,Z01,1,Z,False,0.9674,0.8767,0.7212,0.3971,0.0898,0.0303
60,Utah,3428,X05,5,X,False,0.8490,0.3177,0.0555,0.0145,0.0039,0.0015
61,Vanderbilt,3435,Z12,12,Z,True,0.0829,0.0248,0.0076,0.0033,0.0014,0.0008
62,Virginia Tech,3439,Z04,4,Z,False,0.8706,0.3668,0.0691,0.0228,0.0065,0.0034


In [19]:
df.to_csv(f'../data/preprocessed/womens_yahoo/{season}_womens_yahoo_picks.csv', index=False)

'Done'

'Done'