In [169]:
#!pip install selenium
#!pip install webdriver-manager


In [170]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains

import time

import pandas as pd

In [171]:
# Chrome launch options for the scraping session.
options = webdriver.ChromeOptions()
#options.add_argument('--headless')  # Run Chrome in headless mode
# (useful once we are confident the scraping code works)


In [172]:
# Start the Chrome session with the options object built in the previous cell,
# so that flags enabled there (e.g. --headless) actually take effect.
driver = webdriver.Chrome(options=options)
driver.get('https://chicago-history.r.mikatiming.com/2023/?lang=EN_CAP&pid=start&pidp=start')
time.sleep(1)

# Search parameters passed to configure_filters() further down.
year = 2022
#age_group = "All"
gender = 'W'  # W / D (NB) / N (Not specified)
results_per_page = 1000

In [173]:
def extract_information(timeout=10):
    """Read one participant's detail page into a flat dictionary.

    The function uses the module-level ``driver`` and expects it to be
    showing (or navigating to) a participant detail page.  It first waits
    until the participant-name row is present, then reads every field
    listed in ``field_specs``.

    Args:
        timeout: Maximum number of seconds to wait for the name row to
            appear before giving up.

    Returns:
        A dict mapping column names ("event", "year", "bib", ..., "finish")
        to the text of the matching table cell, or ``None`` if the wait or
        any single lookup fails (the error is printed, not raised).
    """
    # (output key, class fragment identifying the table row, cell path in that row)
    # Rows read with "//td" take the first matching cell of the row; the split
    # rows read the third cell ("//td[3]").
    field_specs = (
        ("event", "f-event_name", "//td"),
        ("year", "f-event_date", "//td"),
        ("bib", "f-start_no", "//td"),
        ("name", "f-__fullname", "//td"),
        ("city/state", "f-__city_state", "//td"),
        ("division", "f-_type_age_class", "//td"),
        ("final_time", "f-time_finish_netto", "//td"),
        ("gender_position", "f-place_all", "//td"),
        ("overall_position", "f-place_nosex", "//td"),
        ("age_group_position", "f-place_age", "//td"),
        ("5k", "f-time_05", "//td[3]"),
        ("10k", "f-time_10", "//td[3]"),
        ("15k", "f-time_15", "//td[3]"),
        ("20k", "f-time_20", "//td[3]"),
        ("half", "f-time_52", "//td[3]"),
        ("25k", "f-time_25", "//td[3]"),
        ("30k", "f-time_30", "//td[3]"),
        ("35k", "f-time_35", "//td[3]"),
        ("finish", "f-time_finish_netto", "//td[3]"),
    )

    def row_xpath(row_class, cell_path):
        # Build the XPath for a cell inside the row whose class contains row_class.
        return f"//tr[contains(@class, '{row_class}')]{cell_path}"

    try:
        # Synchronize with the page before reading: without this the lookups
        # below run against whatever happens to be rendered at call time.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, row_xpath("f-__fullname", "//td")))
        )

        # Key order here defines the column order of the resulting DataFrame/CSV.
        return {
            key: driver.find_element(By.XPATH, row_xpath(row_class, cell_path)).text
            for key, row_class, cell_path in field_specs
        }

    except Exception as e:
        # Deliberately best-effort: one unreadable participant must not stop the run.
        print(f"Error extrayendo información: {e}")
        return None


In [174]:
def configure_filters(driver, year, gender, results_per_page):
    """
    Set the search-form filters and submit the form before scraping starts.

    Selects, in this order, the year, the gender and the page size in their
    dropdowns (pausing 5 s after the year and 3 s after the gender), clicks
    the submit button and pauses 1 s for the results to load.

    Args:
        driver: Selenium WebDriver currently showing the search form.
        year: Value to select in the year dropdown (converted with ``str``).
        gender: Value to select in the gender dropdown, passed as given.
        results_per_page: Value to select in the page-size dropdown
            (converted with ``str``).

    Raises:
        Exception: Any error raised while filling the form is printed, the
            driver is quit, and the same exception is re-raised.
    """

    def choose_option(dropdown_id, option_value, settle_seconds=0):
        # Pick the option with the given value in the dropdown identified by
        # its element id, then pause if a settle time was requested.
        dropdown = driver.find_element(By.ID, dropdown_id)
        Select(dropdown).select_by_value(option_value)
        if settle_seconds:
            time.sleep(settle_seconds)

    try:
        choose_option("default-lists-event_main_group", str(year), settle_seconds=5)
        choose_option("default-lists-sex", gender, settle_seconds=3)
        choose_option("default-num_results", str(results_per_page))

        # Submit the form ("Show Results") and give the results a moment to load.
        driver.find_element(By.ID, "default-submit").click()
        time.sleep(1)
    except Exception as error:
        print(f"Error configurando los filtros: {error}")
        driver.quit()
        raise


In [175]:
configure_filters(driver, year, gender, results_per_page)

# CSS selectors used on the results list page.
NAME_LINK_SELECTOR = "h4.list-field.type-fullname a"
NEXT_PAGE_SELECTOR = "ul.pagination li.pages-nav-button:last-child a"

# One dict per successfully scraped participant.
data = []

try:
    # Walk the result pages until no next-page link is found or a page-level
    # error occurs.
    while True:
        try:
            participant_count = len(driver.find_elements(By.CSS_SELECTOR, NAME_LINK_SELECTOR))

            # Open each participant by position, scrape it, then come back.
            for i in range(participant_count):
                left_results_page = False
                try:
                    # Look the links up again on every iteration: handles obtained
                    # before navigating away from the list are not reused.
                    names_elements = driver.find_elements(By.CSS_SELECTOR, NAME_LINK_SELECTOR)

                    # Scroll the link into view before clicking it.
                    ActionChains(driver).move_to_element(names_elements[i]).perform()
                    names_elements[i].click()
                    left_results_page = True

                    participant_data = extract_information()
                    if participant_data:
                        data.append(participant_data)
                except Exception as e:
                    print(f"Error procesando un participante: {e}")
                finally:
                    # Only go back if the click actually navigated away; going back
                    # after a failed lookup/click would leave the results list.
                    if left_results_page:
                        driver.back()

            # find_elements returns an empty list when nothing matches, so the
            # "no more pages" case is a normal exit instead of an exception.
            # NOTE(review): the selector takes the last nav button; confirm that on
            # the final page this is not the "previous" link.
            next_links = driver.find_elements(By.CSS_SELECTOR, NEXT_PAGE_SELECTOR)
            if not next_links:
                print("No hay más páginas disponibles.")
                break
            next_button = next_links[0]
            next_button.click()
            time.sleep(1)
        except Exception as e:
            print(f"Error procesando la página: {e}")
            break
finally:
    # Always persist what was collected and release the browser, even when the
    # run is interrupted part-way through.
    try:
        df = pd.DataFrame(data)
        df.to_csv('resultados_maraton.csv', index=False)
    finally:
        driver.quit()


Error extrayendo información: 'NoneType' object has no attribute 'text'
Error procesando un participante: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.265)
Stacktrace:
0   chromedriver                        0x000000010509e138 cxxbridge1$str$ptr + 3653888
1   chromedriver                        0x0000000105096988 cxxbridge1$str$ptr + 3623248
2   chromedriver                        0x0000000104afc968 cxxbridge1$string$len + 89228
3   chromedriver                        0x0000000104ad7e44 core::str::slice_error_fail::ha0e52dbcb60e6bae + 3780
4   chromedriver                        0x0000000104b66c84 cxxbridge1$string$len + 524200
5   chromedriver                        0x0000000104b79b60 cxxbridge1$string$len + 601732
6   chromedriver                        0x0000000104b35564 cxxbridge1$string$len + 321672
7   chromedriver                        0x0000000104b361b4 cxxbridge1$string$len + 324824
8   chromed

In [176]:
# Show the scraped participant records (the list built by the scraping loop) as a table.
pd.DataFrame(data)


Unnamed: 0,bib,name,city/state,division,final_time,gender_position,overall_position,age_group_position,5k,10k,15k,20k,half,25k,30k,35k,finish
0,101,"Chepngetich, Ruth (KEN)",Ngong,25-29,02:14:18,1,25,1,15:11,15:29,15:39,15:51,03:34,12:19,15:58,16:24,07:16
1,107,"Sisson, Emily (USA)",Phoenix,30-34,02:18:29,2,35,1,16:23,16:31,16:23,16:32,03:37,12:43,16:28,16:33,07:00


In [177]:
# def load_all_elements():
#     """Carga todos los elementos desplazándose hacia abajo."""
#     last_height = driver.execute_script("return document.body.scrollHeight")
#     while True:
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(5)  # Espera a que se carguen más elementos
#         new_height = driver.execute_script("return document.body.scrollHeight")
#         if new_height == last_height:
#             break
#         last_height = new_height