# Get Yahoo Public Picks

Used to compare my picks with general public concensus. Assumes yahoo data has been exported to Excel. 

In [1]:
season = 2024

season

2024

In [2]:
import pandas as pd

pd.set_option('display.max_columns', 100)

df = pd.concat(
    [
        pd.read_excel(fr'..\data\unprocessed\yahoo\{season}_yahoo_picks.xlsx', sheet_name=f'round_{round_}')
        .assign(Round=round_)
        for round_ in range(1, 7)
    ],
    ignore_index=True,
)

df['Seed'] = df['Team (Seed)'].str.extract(r'\((\d+)\)').astype(int)
df['Team (Seed)'] = df['Team (Seed)'].str.replace(r'\(\d+\)', '', regex=True)

df.rename(columns={'Team (Seed)': 'Team'}, inplace=True)

df = df.pivot(index=['Team', 'Seed'], columns=['Round'], values='% Picked').reset_index()
df.columns = ['Team', 'Seed', 'Round 1', 'Round 2', 'Round 3', 'Round 4', 'Round 5', 'Round 6']

df

Unnamed: 0,Team,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Akron,14,0.0716,0.0178,0.0049,0.0019,0.0009,0.0004
1,Alabama,4,0.8711,0.5573,0.1355,0.0519,0.0141,0.0055
2,Arizona,2,0.9519,0.8492,0.5984,0.3025,0.1005,0.0478
3,Auburn,4,0.9017,0.6467,0.1216,0.0731,0.0364,0.0173
4,BSU/COLO,10,0.2297,0.0342,0.0077,0.0024,0.0016,0.0005
...,...,...,...,...,...,...,...,...
59,W. Kentucky,15,0.0533,0.0218,0.0048,0.0018,0.0007,0.0003
60,Wagner,16,0.0242,0.0108,0.0056,0.0031,0.0013,0.0006
61,Washington St.,7,0.4263,0.0871,0.0252,0.0051,0.0021,0.0010
62,Wisconsin,5,0.6625,0.2661,0.0766,0.0313,0.0113,0.0048


Fix play-ins

In [3]:
df.loc[df['Team'].str.contains('/', regex=False), :]

Unnamed: 0,Team,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
4,BSU/COLO,10,0.2297,0.0342,0.0077,0.0024,0.0016,0.0005
29,MTST/GRAM,16,0.0173,0.0067,0.0033,0.0018,0.0007,0.0003


In [4]:
df.loc[df['Team'] == 'BSU/COLO', 'Team'] = 'COLO'
df.loc[df['Team'] == 'HOW/WAG', 'Team'] = 'HOW'
df.loc[df['Team'] == 'MTST/GRAM', 'Team'] = 'MTST'
df.loc[df['Team'] == 'UVA/CSU', 'Team'] = 'CSU'

df.loc[df['Team'].str.contains('/', regex=False), :]

Unnamed: 0,Team,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6


Map with Kaggle data

In [5]:
df_seeds = pd.read_csv(r'..\data\unprocessed\kaggle\MNCAATourneySeeds.csv')

df_seeds = df_seeds.loc[df_seeds['Season'] == season, :].reset_index(drop=True)

df_seeds.insert(2, 'Play In', df_seeds['Seed'].str.endswith(('a', 'b')))
df_seeds.insert(2, 'Region', df_seeds['Seed'].str[0])
df_seeds['Seed'] = df_seeds['Seed'].str.extract('(\d+)').astype(int)

df_seeds

Unnamed: 0,Season,Seed,Region,Play In,TeamID
0,2024,1,W,False,1163
1,2024,2,W,False,1235
2,2024,3,W,False,1228
3,2024,4,W,False,1120
4,2024,5,W,False,1361
...,...,...,...,...,...
63,2024,12,Z,False,1241
64,2024,13,Z,False,1436
65,2024,14,Z,False,1324
66,2024,15,Z,False,1443


In [6]:
df_spellings = pd.read_csv(
    r'..\data\unprocessed\kaggle\MTeamSpellings.csv', 
    encoding='cp1252'  # fixes issue with fancy quotes
)

df_spellings.loc[df_spellings.shape[0]] = ['fdu', 1192]
df_spellings.loc[df_spellings.shape[0]] = ['sdsu', 1361]
df_spellings.loc[df_spellings.shape[0]] = ['csu', 1161]

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101
...,...,...
1163,youngstown-st,1464
1164,youngstown-state,1464
1165,fdu,1192
1166,sdsu,1361


In [7]:
df_spellings = pd.merge(
    df_spellings,
    df_seeds[['TeamID', 'Seed']],
    how='inner',
    on=['TeamID']
)

df_spellings

Unnamed: 0,TeamNameSpelling,TeamID,Seed
0,akron,1103,14
1,alabama,1104,4
2,alabama-birmingham,1412,12
3,uab,1412,12
4,arizona,1112,2
...,...,...,...
173,washington state,1450,7
174,washington-st,1450,7
175,washington-state,1450,7
176,wisconsin,1458,5


In [8]:
from fuzzywuzzy.fuzz import token_sort_ratio
from fuzzywuzzy import process
from tqdm.autonotebook import tqdm

team_spellings = df_spellings['TeamNameSpelling'].unique()
yahoo_teams = df.loc[~df['Team'].str.contains('^playin', regex=True), 'Team'].unique()

df_match = pd.DataFrame(
    [
        [
            yahoo_team,
            *process.extract(
                yahoo_team,
                team_spellings,
                scorer=token_sort_ratio,
                limit=1
            )[0][:2]
        ] for yahoo_team in tqdm(yahoo_teams)
    ],
    columns=['Yahoo Team', 'Team Spelling', 'Match Score']
).sort_values('Match Score', ignore_index=True)

df_match.head(25)

  from tqdm.autonotebook import tqdm


  0%|          | 0/64 [00:00<?, ?it/s]

Unnamed: 0,Yahoo Team,Team Spelling,Match Score
0,MTST,montana st,57
1,COLO,colorado,67
2,N. Carolina,north carolina,83
3,Akron,akron,100
4,N.C. State,n.c. state,100
5,Nebraska,nebraska,100
6,Nevada,nevada,100
7,New Mexico,new mexico,100
8,Northwestern,northwestern,100
9,Oakland,oakland,100


In [9]:
yahoo_to_spelling = dict(zip(df_match['Yahoo Team'], df_match['Team Spelling']))
spelling_to_id = dict(zip(df_spellings['TeamNameSpelling'], df_spellings['TeamID']))

df.insert(1, 'TeamID', df['Team'].map(yahoo_to_spelling).map(spelling_to_id))

df

Unnamed: 0,Team,TeamID,Seed,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Akron,1103,14,0.0716,0.0178,0.0049,0.0019,0.0009,0.0004
1,Alabama,1104,4,0.8711,0.5573,0.1355,0.0519,0.0141,0.0055
2,Arizona,1112,2,0.9519,0.8492,0.5984,0.3025,0.1005,0.0478
3,Auburn,1120,4,0.9017,0.6467,0.1216,0.0731,0.0364,0.0173
4,COLO,1160,10,0.2297,0.0342,0.0077,0.0024,0.0016,0.0005
...,...,...,...,...,...,...,...,...,...
59,W. Kentucky,1443,15,0.0533,0.0218,0.0048,0.0018,0.0007,0.0003
60,Wagner,1447,16,0.0242,0.0108,0.0056,0.0031,0.0013,0.0006
61,Washington St.,1450,7,0.4263,0.0871,0.0252,0.0051,0.0021,0.0010
62,Wisconsin,1458,5,0.6625,0.2661,0.0766,0.0313,0.0113,0.0048


In [10]:
import numpy as np

id_to_playin = dict(zip(df_seeds['TeamID'], df_seeds['Play In']))
id_to_playin[np.nan] = True

df.insert(3, 'Play In', df['TeamID'].map(id_to_playin))

df

Unnamed: 0,Team,TeamID,Seed,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Akron,1103,14,False,0.0716,0.0178,0.0049,0.0019,0.0009,0.0004
1,Alabama,1104,4,False,0.8711,0.5573,0.1355,0.0519,0.0141,0.0055
2,Arizona,1112,2,False,0.9519,0.8492,0.5984,0.3025,0.1005,0.0478
3,Auburn,1120,4,False,0.9017,0.6467,0.1216,0.0731,0.0364,0.0173
4,COLO,1160,10,True,0.2297,0.0342,0.0077,0.0024,0.0016,0.0005
...,...,...,...,...,...,...,...,...,...,...
59,W. Kentucky,1443,15,False,0.0533,0.0218,0.0048,0.0018,0.0007,0.0003
60,Wagner,1447,16,True,0.0242,0.0108,0.0056,0.0031,0.0013,0.0006
61,Washington St.,1450,7,False,0.4263,0.0871,0.0252,0.0051,0.0021,0.0010
62,Wisconsin,1458,5,False,0.6625,0.2661,0.0766,0.0313,0.0113,0.0048


In [11]:
id_to_region = dict(zip(df_seeds['TeamID'], df_seeds['Region']))

df.insert(3, 'Region', df['TeamID'].map(id_to_region))

df

Unnamed: 0,Team,TeamID,Seed,Region,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Akron,1103,14,Y,False,0.0716,0.0178,0.0049,0.0019,0.0009,0.0004
1,Alabama,1104,4,X,False,0.8711,0.5573,0.1355,0.0519,0.0141,0.0055
2,Arizona,1112,2,X,False,0.9519,0.8492,0.5984,0.3025,0.1005,0.0478
3,Auburn,1120,4,W,False,0.9017,0.6467,0.1216,0.0731,0.0364,0.0173
4,COLO,1160,10,Z,True,0.2297,0.0342,0.0077,0.0024,0.0016,0.0005
...,...,...,...,...,...,...,...,...,...,...,...
59,W. Kentucky,1443,15,Z,False,0.0533,0.0218,0.0048,0.0018,0.0007,0.0003
60,Wagner,1447,16,X,True,0.0242,0.0108,0.0056,0.0031,0.0013,0.0006
61,Washington St.,1450,7,W,False,0.4263,0.0871,0.0252,0.0051,0.0021,0.0010
62,Wisconsin,1458,5,Z,False,0.6625,0.2661,0.0766,0.0313,0.0113,0.0048


In [12]:
df.loc[df['Region'].isna(), :]

Unnamed: 0,Team,TeamID,Seed,Region,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6


Redistribute play-in probabilities to the teams that won

If using 2023, it is unclear which play-in teams are which

In [13]:
# for seed in df.loc[df['Play In'], 'Seed'].unique():
#     df.loc[
#         (~df['Team'].str.contains('^playin', regex=True)) & 
#         (df['Seed'] == seed) &
#         (df['Play In']), 
#         [f'Round {i}' for i in range(1, 7)]
#     ] += df.loc[
#         (df['Team'].str.contains('^playin', regex=True)) & 
#         (df['Seed'] == seed) &
#         (df['Play In']), 
#         [f'Round {i}' for i in range(1, 7)]
#     ].mean(axis=0)

# df = df.loc[
#     ~df['Team'].str.contains('^playin', regex=True), 
#     :
# ].reset_index(drop=True)

# df['TeamID'] = df['TeamID'].astype(int)

# df.loc[df['Play In'], :]

Redistribute percentages to account for rounding inaccuracies

In [14]:
# for i in range(1, 7):
#     df[f'Round {i}'] = df[f'Round {i}'] / df[f'Round {i}'].sum() * 2**(6 - i)

# df

In [15]:
df.sum()

Team       AkronAlabamaArizonaAuburnCOLOBYUBaylorCharlest...
TeamID                                                 81948
Seed                                                     544
Region     YXXWZWXXXXYWYXWZWZWYXZWWZYZXZYZYXXWXZZXXWZYYWW...
Play In                                                    4
Round 1                                              31.6327
Round 2                                              15.8578
Round 3                                               7.9392
Round 4                                               3.9744
Round 5                                               1.9989
Round 6                                               1.0002
dtype: object

In [16]:
df.insert(df.columns.get_loc('Seed'), 'Region Seed', df['Region'] + df['Seed'].astype(str).str.zfill(2))

df

Unnamed: 0,Team,TeamID,Region Seed,Seed,Region,Play In,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6
0,Akron,1103,Y14,14,Y,False,0.0716,0.0178,0.0049,0.0019,0.0009,0.0004
1,Alabama,1104,X04,4,X,False,0.8711,0.5573,0.1355,0.0519,0.0141,0.0055
2,Arizona,1112,X02,2,X,False,0.9519,0.8492,0.5984,0.3025,0.1005,0.0478
3,Auburn,1120,W04,4,W,False,0.9017,0.6467,0.1216,0.0731,0.0364,0.0173
4,COLO,1160,Z10,10,Z,True,0.2297,0.0342,0.0077,0.0024,0.0016,0.0005
...,...,...,...,...,...,...,...,...,...,...,...,...
59,W. Kentucky,1443,Z15,15,Z,False,0.0533,0.0218,0.0048,0.0018,0.0007,0.0003
60,Wagner,1447,X16,16,X,True,0.0242,0.0108,0.0056,0.0031,0.0013,0.0006
61,Washington St.,1450,W07,7,W,False,0.4263,0.0871,0.0252,0.0051,0.0021,0.0010
62,Wisconsin,1458,Z05,5,Z,False,0.6625,0.2661,0.0766,0.0313,0.0113,0.0048


In [17]:
df.to_csv(f'../data/preprocessed/yahoo/{season}_yahoo_picks.csv', index=False)

'Done'

'Done'