## Imports

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

import pandas as pd

## Set up driver and dataframe

In [269]:
# Start the Chrome WebDriver session that the scraping cell below navigates with.
# NOTE(review): no driver.quit() is visible in this part of the notebook —
# confirm the browser is closed once scraping is finished.
driver = webdriver.Chrome()

In [314]:
# Empty results table: seven descriptive columns followed by 33 split columns
# named after successive multiples of 50 ("50_SPLIT" through "1650_SPLIT").
df = pd.DataFrame(
    columns=['YEAR', 'NAME', 'SCHOOL', 'EVENT_DIST', 'EVENT_TYPE', 'FINAL_PLACE', 'TOTAL']
    + [f"{distance}_SPLIT" for distance in range(50, 1651, 50)]
)

## Dataset for NCAA meet results

In [315]:
# Parallel lists: the same position in each list describes one meet.
# Meet identifiers inserted into the swimcloud.com results URL.
MEET_IDS = ['194775', '194787', '266583']
# Value written to the YEAR column for every row scraped from that meet.
YEARS = [2022, 2023, 2024]
# Event number of the first page opened for each meet; the links to all of the
# meet's events are then read from that page's event dropdown.
EVENT_NUMS = [1, 2, 1]

In [316]:
# Scrape every event of every configured meet and append one row per selected
# result to `df`. Relies on `driver`, `df`, MEET_IDS, YEARS and EVENT_NUMS from
# the cells above.


def _parse_place(text):
    """Convert the text of a results-table place cell to an int, or None.

    Hyphens and en dashes are stripped before parsing, so a cell that holds
    only a dash placeholder yields None instead of reaching int() (int('-')
    raises ValueError), and a number decorated with a dash still parses.
    """
    digits = text.strip().replace('–', '').replace('-', '')
    # isdecimal() accepts exactly the digit characters int() is able to parse.
    return int(digits) if digits.isdecimal() else None


NUM_SPLITS = 33  # one slot per *_SPLIT column of df (50_SPLIT ... 1650_SPLIT)
EVENT_MENU_TOGGLE = '.btn-link.u-text-large.u-link-text.dropdown-toggle'

# zip() would silently drop meets if the lists drifted apart, so check first.
assert len(MEET_IDS) == len(YEARS) == len(EVENT_NUMS), \
    "MEET_IDS, YEARS and EVENT_NUMS must have the same length"

for meet_id, year, event_num in zip(MEET_IDS, YEARS, EVENT_NUMS):
    # Open one event page of the meet and expand its event dropdown.
    driver.get("https://www.swimcloud.com/results/" + meet_id + "/event/" + str(event_num) + "/0/?")
    driver.find_element(By.CSS_SELECTOR, EVENT_MENU_TOGGLE).click()

    events_menu = driver.find_element(By.CSS_SELECTOR, '.dropdown.open').find_element(By.CLASS_NAME, "dropdown-menu")
    # Read every href up front: the loop below navigates away from this page.
    event_links = [anchor.get_attribute('href') for anchor in events_menu.find_elements(By.TAG_NAME, 'a')]

    for event_link in event_links:
        driver.get(event_link)
        # The dropdown label is split at its first space into distance and type.
        event_dist, event_type = driver.find_element(By.CSS_SELECTOR, EVENT_MENU_TOGGLE).text.split(' ', 1)
        # Distances containing 'M' (the saved output shows '3M' Diving rows)
        # use a different total selector and have no splits page.
        is_diving = 'M' in event_dist
        is_relay = 'Relay' in event_type

        table_rows = driver.find_elements(By.TAG_NAME, 'tr')
        # If the third row has text, take rows 1-8 and 10-17; otherwise take
        # every fifth row starting at rows 1 and 42 (eight rows each).
        # NOTE(review): presumably individual vs. relay table layouts — confirm.
        if table_rows[2].text:
            selected_rows = table_rows[1:9] + table_rows[10:18]
        else:
            selected_rows = table_rows[1:37:5] + table_rows[42:78:5]

        for row_idx, row in enumerate(selected_rows):
            # Events whose distance is '800' keep only the first eight rows.
            if event_dist == '800' and row_idx == 8:
                break

            place = _parse_place(
                row.find_element(By.CSS_SELECTOR, '.c-table-clean__col-fit.u-text-center.u-pr0').text
            )

            name_cell = row.find_element(By.CSS_SELECTOR, '.u-nowrap.u-text-semi')
            name = name_cell.find_element(By.TAG_NAME, 'a').text
            try:
                school = name_cell.find_element(By.CSS_SELECTOR, '.u-color-mute.u-text-xsmall.visible-xs-block').text
            except NoSuchElementException:
                # No separate school element: fall back to the name minus its
                # last four characters.
                # NOTE(review): presumably strips a relay-squad suffix — confirm.
                school = name[:-4]

            total_selector = '.u-text-end.u-color-mute' if is_diving else '.u-text-end.u-nowrap'
            total = row.find_element(By.CSS_SELECTOR, total_selector).text

            splits = [0] * NUM_SPLITS
            # Splits are fetched only for rows with a parsed place, and never
            # for diving events.
            if place and not is_diving:
                splits_path = row.find_element(By.CSS_SELECTOR, '.btn-link.u-p0.js-time-popover').get_attribute('data-url')
                driver.get('https://swimcloud.com' + splits_path)
                # The split value is the second 'u-text-end' cell of each group;
                # groups are four cells wide for relays and three otherwise.
                stride = 4 if is_relay else 3
                split_cells = driver.find_element(By.TAG_NAME, 'tbody').find_elements(By.CLASS_NAME, 'u-text-end')
                for cell_idx, cell in enumerate(split_cells):
                    if cell_idx % stride == 1:
                        cell_text = cell.text
                        # An en dash marks a missing split and is stored as 0.
                        splits[cell_idx // stride] = float(cell_text) if cell_text != '–' else 0
                # Return to the event page before reading the next row.
                driver.back()

            record = {
                'YEAR': year,
                'NAME': name,
                'SCHOOL': school,
                'EVENT_DIST': event_dist,
                'EVENT_TYPE': event_type,
                'FINAL_PLACE': place,
                'TOTAL': total,
            }
            for split_no, split_time in enumerate(splits, start=1):
                record[str(split_no * 50) + "_SPLIT"] = split_time
            # Append the record as a new row at the end of the shared table.
            df.loc[len(df)] = record
        

In [317]:
# Show the scraped results table (pandas elides the middle rows and columns).
df

Unnamed: 0,YEAR,NAME,SCHOOL,EVENT_DIST,EVENT_TYPE,FINAL_PLACE,TOTAL,50_SPLIT,100_SPLIT,150_SPLIT,...,1200_SPLIT,1250_SPLIT,1300_SPLIT,1350_SPLIT,1400_SPLIT,1450_SPLIT,1500_SPLIT,1550_SPLIT,1600_SPLIT,1650_SPLIT
0,2022,Kevin Gillooly,Rowan,50,Free,1,19.50,19.50,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022,Nick Goudie,Emory,50,Free,2,19.76,19.76,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022,Chris Schiavone,Franklin & Marshall,50,Free,3,19.96,19.96,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022,Trey Ike,Denison,50,Free,4,19.97,19.97,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022,JULIAN ITURBE,Calvin,50,Free,5,20.02,20.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
931,2024,Luke Albanese,RIT,3M,Diving,12,480.20,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
932,2024,Nick Fogle,Denison,3M,Diving,13,445.05,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
933,2024,Reid Omilian,Chapman,3M,Diving,14,425.65,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
934,2024,Thomas Cable,Merchant Marine,3M,Diving,15,424.35,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [318]:
# Persist the scraped table to a pickle file in the working directory.
df.to_pickle("ncaa_men_swive_results.pkl")

## Data Analytics

In [322]:
# Rebind `df` to the table stored in the pickle file written by the save cell
# (presumably so the analysis can run without repeating the scrape).
# Pickle files can execute code when loaded; only read a file this notebook produced.
df = pd.read_pickle("ncaa_men_swive_results.pkl")

In [326]:
# List the dtype pandas assigned to each column of the reloaded table.
df.dtypes

YEAR             int64
NAME            object
SCHOOL          object
EVENT_DIST      object
EVENT_TYPE      object
FINAL_PLACE     object
TOTAL           object
50_SPLIT       float64
100_SPLIT      float64
150_SPLIT      float64
200_SPLIT      float64
250_SPLIT      float64
300_SPLIT      float64
350_SPLIT      float64
400_SPLIT      float64
450_SPLIT      float64
500_SPLIT      float64
550_SPLIT      float64
600_SPLIT      float64
650_SPLIT      float64
700_SPLIT      float64
750_SPLIT      float64
800_SPLIT      float64
850_SPLIT      float64
900_SPLIT      float64
950_SPLIT      float64
1000_SPLIT     float64
1050_SPLIT     float64
1100_SPLIT     float64
1150_SPLIT     float64
1200_SPLIT     float64
1250_SPLIT     float64
1300_SPLIT     float64
1350_SPLIT     float64
1400_SPLIT     float64
1450_SPLIT     float64
1500_SPLIT     float64
1550_SPLIT     float64
1600_SPLIT     float64
1650_SPLIT     float64
dtype: object