In [7]:
import pandas as pd
from data_cleaning import (
    add_victory_margin,
    clean_results_df,
    filter_results_df,
    final_format_all_am,
    final_format_top_two,
    get_meters,
    get_points,
    pivot_to_one_row,
)
from web_scrape_results import get_event_name, scrape_results

In [8]:
import warnings

warnings.filterwarnings('ignore')

In [9]:
# Result pages to scrape, one per event. The last two entries are the
# "_Scores" pages rather than "_compiled" pages.
urls = [
    'https://flashresults.ncaa.com/Outdoor/2024/019-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/014-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/020-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/015-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/017-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/039-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/034-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/040-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/035-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/037-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/013-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/018-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/016-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/033-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/038-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/036-1_compiled.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/041_Scores.htm',
    'https://flashresults.ncaa.com/Outdoor/2024/042_Scores.htm'
]

# One summary frame per event is collected here and concatenated once after
# the loop. Growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows on every pass and starts from an empty frame.
event_frames = []
for url in urls:
    df = scrape_results(url)
    event_name = get_event_name(url)
    print(event_name)
    df['event'] = event_name

    # First vs. second place.
    top_two = filter_results_df(df, ["1", "2"])
    top_two = clean_results_df(top_two, event_name)

    # First place vs. the cut-off place. `Pl` labels are compared as strings
    # (lexicographically) against '8'; when the largest label sorts below '8'
    # that label becomes the cut-off and is relabelled "8" so the downstream
    # helpers see the same place labels for every event.
    all_am_filter = min('8', df.dropna().Pl.max())
    all_am = filter_results_df(df, ["1", all_am_filter])
    if all_am_filter != '8':
        all_am.loc[all_am["Pl"] == all_am_filter, "Pl"] = "8"
    all_am = clean_results_df(all_am, event_name)

    # Events whose name contains "athlon" go through get_points; every other
    # event goes through get_meters.
    if "athlon" in event_name:
        top_two = get_points(top_two)
        all_am = get_points(all_am)
    else:
        top_two = get_meters(top_two)
        all_am = get_meters(all_am)

    top_two = add_victory_margin(top_two, col_name='Mark')
    all_am = add_victory_margin(all_am, 'Mark')

    # Collapse each comparison to a single row, then join the two views on
    # the columns that identify the winner.
    pivot_df_top_two = pivot_to_one_row(top_two)
    pivot_df_all_am = pivot_to_one_row(all_am)
    final_df_top_two = final_format_top_two(pivot_df_top_two, col_name='Mark')
    final_df_all_am = final_format_all_am(pivot_df_all_am, col_name='Mark')
    final_df = final_df_top_two.merge(
        final_df_all_am,
        on=['Event', '1st place name', '1st place mark'],
    )
    event_frames.append(final_df)

# ignore_index=True gives every event its own row label instead of repeating
# the per-event label. pd.concat raises ValueError on an empty list, so fall
# back to an empty frame when there was nothing to scrape.
all_events = (
    pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()
)

Men Hammer
Men Pole Vault
Men Javelin
Men Long Jump
Men Shot Put
Women Hammer
Women Pole Vault
Women Javelin
Women Long Jump
Women Shot Put
Men High Jump
Men Discus
Men Triple Jump
Women High Jump
Women Discus
Women Triple Jump
Decathlon Standings
Heptathlon Standings


In [11]:
# Rank events from the largest to the smallest relative winning margin.
#
# How to read 'Margin of victory (%)':
#   margin of victory / second-place mark
#   e.g. Men Javelin: (79.31 - 75.14) / 75.14 = 0.0555
# The column holds a fraction, so 0.0555 means Chandler Ault beat
# Devoux Deysel by about 5.5%: adding 5.5% to Deysel's distance gives
# Ault's throw.
all_events.sort_values('Margin of victory (%)', ascending=False)

Unnamed: 0,Event,1st place name,2nd place name,1st place mark,2nd place mark,Margin of victory (m),Margin of victory (%),8th place name,8th place mark,All American Spread (m),All American Spread (%)
0,Men Javelin,Chandler Ault,Devoux Deysel,79.31,75.14,4.17,0.055496,Sam Hankins,71.77,7.54,0.105058
0,Women Javelin,Lianna Davidson,Mckyla Va,60.7,57.51,3.19,0.055469,Kayla Thorpe,53.44,7.26,0.135853
0,Men Triple Jump,Russell Robinson,Brandon Gree,17.13,16.63,0.5,0.030066,Terrol Wilson,16.21,0.92,0.056755
0,Women Shot Put,Gabby Morris,Axelina Johansson,18.66,18.24,0.42,0.023026,Marilyn Nwora,17.12,1.54,0.089953
0,Men Hammer,Kenneth Ikeji,Angelos Mantzouranis,77.12,75.5,1.62,0.021457,Jeremiah Nubbe,71.17,5.95,0.083603
0,Men Pole Vault,Clayton Simms,Christyan Sampy,5.62,5.52,0.1,0.018116,Conner Mcclure,5.37,0.25,0.046555
0,Heptathlon Standings,Jadin Obrie,Kristine Blazevic,6234.0,6126.0,108.0,0.01763,Jenna,5812.0,422.0,0.072608
0,Women Long Jump,Claire Bryant,Alyssa Jones,6.74,6.64,0.1,0.01506,Robbie Grace,6.31,0.43,0.068146
0,Men Discus,Racquil Broderick,Dimitrios Pavlidis,61.77,60.97,0.8,0.013121,Michael Pinckneyucl,58.98,2.79,0.047304
0,Women Pole Vault,Riley Felts,Hana Moll,4.55,4.5,0.05,0.011111,Payton Phillips,4.3,0.25,0.05814


In [12]:
# Write the combined per-event summary to a CSV file in the working
# directory, leaving the row index out of the file.
csv_file_path = 'field.csv'
all_events.to_csv(path_or_buf=csv_file_path, index=False)