In [8]:
import pandas as pd
from data_cleaning import (
    add_victory_margin,
    clean_results_df,
    filter_results_df,
    final_format_all_am,
    final_format_top_two,
    get_meters,
    get_points,
    pivot_to_one_row,
)
from web_scrape_results import get_event_name, scrape_results

In [9]:
import warnings

warnings.filterwarnings('ignore')

In [10]:
# Result pages to process. The first sixteen are "compiled" results pages and the
# last two are "Scores" pages (printed below as Decathlon / Heptathlon Standings).
urls = [
    'https://flashresults.ncaa.com/Outdoor/2024/019-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/014-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/020-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/015-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/017-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/039-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/034-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/040-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/035-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/037-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/013-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/018-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/016-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/033-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/038-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/036-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/041_Scores.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/042_Scores.htm'
]

# Per-event summary frames are collected here and concatenated once after the
# loop. Growing a DataFrame with pd.concat on every iteration re-copies all
# previous rows each time, and concatenating onto an empty DataFrame is
# deprecated in recent pandas.
event_frames = []
for url in urls:
    df = scrape_results(url)
    event_name = get_event_name(url)
    print(event_name)
    df['event'] = event_name

    # Winner and runner-up rows.
    top_two = filter_results_df(df, ["1", "2"])
    top_two = clean_results_df(top_two, event_name)

    # Winner plus the last scoring place: place "8", or the highest place present
    # when that compares lower than '8'.
    # NOTE(review): both min() and .max() here compare strings, so ordering is
    # lexicographic (e.g. '10' < '8'). Presumably Pl holds place labels as
    # strings -- confirm this gives the intended place for every event.
    all_am_filter = min('8', df.dropna().Pl.max())
    all_am = filter_results_df(df, ["1", all_am_filter])
    if all_am_filter != '8':
        # Relabel the last available place as "8" -- presumably so the downstream
        # formatting still finds an 8th-place row; verify against final_format_all_am.
        all_am.loc[all_am["Pl"] == all_am_filter, "Pl"] = "8"
    all_am = clean_results_df(all_am, event_name)

    # Decathlon/heptathlon marks go through get_points; every other event goes
    # through get_meters.
    if "athlon" in event_name:
        top_two = get_points(top_two)
        all_am = get_points(all_am)
    else:
        top_two = get_meters(top_two)
        all_am = get_meters(all_am)

    top_two = add_victory_margin(top_two, col_name='Mark')
    all_am = add_victory_margin(all_am, 'Mark')
    pivot_df_top_two = pivot_to_one_row(top_two)
    pivot_df_all_am = pivot_to_one_row(all_am)
    final_df_top_two = final_format_top_two(pivot_df_top_two, col_name='Mark')
    final_df_all_am = final_format_all_am(pivot_df_all_am, col_name='Mark')

    # NOTE(review): this is an inner merge, so an event whose key columns do not
    # match exactly on both sides yields no row and silently disappears from
    # all_events -- confirm every printed event shows up in the final table.
    final_df = final_df_top_two.merge(
        final_df_all_am,
        on=['Event', '1st place name', '1st place mark'],
    )
    event_frames.append(final_df)

# One concat for all events. ignore_index=True gives each row a unique index
# label instead of repeating the per-event label. pd.concat raises on an empty
# list, so fall back to an empty frame when there were no URLs.
all_events = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()

Men Hammer
Men Pole Vault
Men Javelin
Men Long Jump
Men Shot Put
Women Hammer
Women Pole Vault
Women Javelin
Women Long Jump
Women Shot Put
Men High Jump
Men Discus
Men Triple Jump
Women High Jump
Women Discus
Women Triple Jump
Decathlon Standings
Heptathlon Standings


In [11]:
# 'Margin of victory (%)' is the margin of victory divided by the second-place mark.
# Worked example (decathlon): (8961.00 - 8131.00) / 8131.00 is about 0.102.
# Reading: adding 10.2% to Peyton Bair's (2nd place) point total gives
# Leo Neugebauer's (1st place) point total -- i.e. Leo beat Peyton by 10.2%.
all_events.sort_values('Margin of victory (%)', ascending=False)

Unnamed: 0,Event,1st place name,2nd place name,1st place mark,2nd place mark,Margin of victory (m),Margin of victory (%),8th place name,8th place mark,All American Spread (m),All American Spread (%)
0,Decathlon Standings,Leo Neugebaue,Peyton Bai,8961.0,8131.0,830.0,0.102078,Yariel Sot,7804.0,1157.0,0.148257
0,Women Javelin,Rhema Otabor,Lianna Davidson,64.19,60.7,3.49,0.057496,Deisiane Teixeira,54.54,9.65,0.176934
0,Women Shot Put,Jaida Ross,Gabby Morris,19.57,18.66,0.91,0.048767,Nina Ndubuisi,17.15,2.42,0.141108
0,Women Triple Jump,Ackelia Smith,Darja Sopova,14.52,14.01,0.51,0.036403,Ruta Lasmane,13.5,1.02,0.075556
0,Women Pole Vault,Chloe Timberg,Riley Felts,4.71,4.55,0.16,0.035165,Olivia Lueking,4.3,0.41,0.095349
0,Men Discus,Francois Prinsloo,Racquil Broderick,63.51,61.77,1.74,0.028169,Dallin Shurtsby,59.04,4.47,0.075711
0,Men Shot Put,Tarik Robinso,Jason Swarens,20.88,20.38,0.5,0.024534,Kevin Shubert,19.63,1.25,0.063678
0,Men Long Jump,Chrstyn,Jeremiah Davis,8.22,8.07,0.15,0.018587,Nikaoli Williams,7.76,0.46,0.059278
0,Men Javelin,Marc Minichello,Chandler Ault,80.7,79.31,1.39,0.017526,Cameron Batesby,72.09,8.61,0.119434
0,Heptathlon Standings,Timara Chapma,Jadin Obrie,6339.0,6234.0,105.0,0.016843,Annika William,5832.0,507.0,0.086934


In [12]:
# Write the per-event summary table to disk; index=False keeps the DataFrame
# index out of the CSV.
csv_file_path = 'field.csv'
all_events.to_csv(path_or_buf=csv_file_path, index=False)