# Results history

**TODO:** handle athletes who change nationality (Gérer les changements de nationalité).

## Imports

In [1]:
import numpy as np
import polars as pl
import plotly.io as pio
from datetime import datetime


from colibri_src.utils import figures, dates


##Select plotly's "firefox" renderer as the default one used by fig.show()
pio.renderers.default = "firefox"


##Browser-like User-Agent header
##NOTE(review): not referenced by any visible cell - presumably meant for HTTP requests made elsewhere, confirm or remove
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

ModuleNotFoundError: No module named 'colibri_src'

## Functions

In [2]:
def get_results_alltime_athletes_current(
    list_years_current=(2025, 2024, 2023),
    sex="m",
    data_dir="/Users/qdouzery/Desktop/colibri/data/alpine_skiing",
):
    """
    Get all results of current athletes and concatenate them into a single dataframe.

    An athlete is "current" when their name appears in at least one of the
    overall-ranking files of the requested seasons.

    Parameters
    ----------
    list_years_current : iterable of int
        Seasons whose overall-ranking files define the current athletes.
    sex : str
        Suffix used in the ranking file names ("m" by default).
    data_dir : str or path-like
        Directory holding the ranking files and the "results_wc" sub-directory.
        Defaults to the location originally hard-coded in this function.

    Returns
    -------
    pl.DataFrame
        All per-athlete result files stacked vertically, with a "name" column
        set to the athlete the rows were loaded for.

    Raises
    ------
    ValueError
        If list_years_current is empty.
    """

    ##Fail early with a clear message instead of an opaque concat error
    list_years_current = list(list_years_current)
    if not list_years_current:
        raise ValueError("list_years_current must contain at least one season.")

    ##Get list of current athletes
    data = [
        pl.read_parquet(f"{data_dir}/df_ranking-overall_{year}_{sex}.parquet")
        for year in list_years_current
    ]
    df_ranking_overall_current_past = pl.concat(data)
    list_athletes_current = np.unique(df_ranking_overall_current_past["name"].to_list())

    ##Iterate through athletes
    data = []
    for name_athlete in list_athletes_current:
        ##Lazily load athlete results (file names use "-" where names use "_")
        df_results_alltime_athlete = pl.scan_parquet(
            f"{data_dir}/results_wc/df_results-wc_alltime_{name_athlete.replace('_', '-')}.parquet"
        )

        ##Tag every row with the athlete name, then aggregate
        data.append(
            df_results_alltime_athlete.with_columns(pl.lit(name_athlete).alias("name"))
        )

    ##Create a dataframe with all athletes results (files are read here, on collect)
    df_results_alltime_athletes_current = pl.concat(data).collect()

    return df_results_alltime_athletes_current


def compute_statistics_race(df_results_race):
    """
    Summarise, per athlete, a dataframe of race results (ex: all DH results at one venue).

    For every value of "name" the output holds:
        - n_starts: number of rows for that athlete
        - n_wins / n_podiums / n_tops_10: rows whose "result" is 1, within 1-3, within 1-10
        - percentage_wins / percentage_podiums / percentage_tops_10: the counts above
          as a rounded integer percentage of n_starts

    Rows are ordered by decreasing tops 10, then podiums, wins and the three percentages.
    """

    ##Counters computed inside each athlete group
    finishing_position = pl.col("result")
    counters = [
        pl.col("name").len().alias("n_starts"),
        (finishing_position == 1).sum().alias("n_wins"),
        finishing_position.is_between(1, 3).sum().alias("n_podiums"),
        finishing_position.is_between(1, 10).sum().alias("n_tops_10"),
    ]
    per_athlete = df_results_race.group_by("name").agg(counters)

    ##Each counter expressed as a whole-number share of the starts
    def share_of_starts(column_count, column_percentage):
        ratio = 100 * pl.col(column_count) / pl.col("n_starts")
        return ratio.round(0).cast(pl.Int8).alias(column_percentage)

    per_athlete = per_athlete.with_columns(
        [
            share_of_starts("n_wins", "percentage_wins"),
            share_of_starts("n_podiums", "percentage_podiums"),
            share_of_starts("n_tops_10", "percentage_tops_10"),
        ]
    )

    ##Best athletes first: tops 10 is the primary key, the rest break ties
    ranking_keys = [
        "n_tops_10",
        "n_podiums",
        "n_wins",
        "percentage_tops_10",
        "percentage_podiums",
        "percentage_wins",
    ]
    return per_athlete.sort(ranking_keys, descending=True)


def bar_plot_statistics_race(df_statistics_race, place, discipline, results_range):
    """
    Overlaid bar chart of starts, tops 10, podiums and wins per athlete for a given race.

    df_statistics_race must hold the columns "name", "n_starts", "n_tops_10",
    "n_podiums", "n_wins" and the three matching "percentage_*" columns.
    place and discipline are lists of strings joined with " & " in the subtitle;
    results_range is appended to the title. Returns the figure.
    """

    ##Texts shared by every layer
    title = f"Race statistics - {results_range}"
    subtitle = f"{(' & ').join(place)} - {(' & ').join(discipline)}"

    def athlete_labels():
        ##"first_last" -> "First Last"
        return (
            df_statistics_race["name"]
            .str.replace_all("_", " ")
            .str.to_titlecase()
            .to_numpy()
        )

    ##Layers from back to front: (count column, legend, color name, percentage column)
    layers = [
        ("n_starts", "Starts", "ghostwhite", None),
        ("n_tops_10", "Tops 10", "lightskyblue", "percentage_tops_10"),
        ("n_podiums", "Podiums", "silver", "percentage_podiums"),
        ("n_wins", "Wins", "gold", "percentage_wins"),
    ]

    ##Init. figure
    fig = figures.init_figure()

    for column_count, legend, color_name, column_percentage in layers:
        labels = athlete_labels()
        counts = df_statistics_race[column_count].to_numpy()

        if column_percentage is None:
            ##Starts layer: plain color name, no text on bars
            color = color_name
            options_text = {}
        else:
            ##Other layers: rgb color and the percentage written outside the bar
            color = figures.from_color_name_to_rgb_str(color_name, opacity=1)
            options_text = {
                "text": [
                    f"{p}%" if p > 0 else ""
                    for p in df_statistics_race[column_percentage].to_numpy()
                ],
                "textposition": "outside",
            }

        fig = figures.bar_plot(
            fig,
            labels,
            counts,
            legend,
            color,
            "",
            "Number",
            title,
            subtitle=subtitle,
            color_border="black",
            **options_text,
        )

    ##Overlay mode
    fig.update_layout(barmode="overlay")

    return fig

## Data

In [7]:
##Sex suffix forwarded to the loader ("f" here, the loader itself defaults to "m")
sex = "f"

##Load the all-time results of every current athlete into one dataframe
##(reused by the "Recent shape" and "Race statistics" sections below)
df_results_alltime_athletes_current = get_results_alltime_athletes_current(sex=sex)

## Recent shape

In [8]:
##Look-back window: n_months counted as 31 days each, going back from today
n_months = 15
date_oldest = dates.change_date(datetime.today(), "past", n_days=n_months * 31)

##Discipline(s) of interest
discipline = ["SG"]

##Recent results for the discipline(s), plus a "place | date" label column
df_results_discipline_athletes_current = df_results_alltime_athletes_current.filter(
    pl.col("discipline").is_in(discipline) & (pl.col("date") >= date_oldest)
).with_columns(
    pl.concat_str([pl.col("place"), pl.col("date")], separator=" | ").alias(
        "place_date"
    )
)

In [9]:
##Init. figure
fig = figures.init_figure()

##One dataframe per athlete (split on the "name" column)
list_df_results_discipline_recent_athletes = (
    df_results_discipline_athletes_current.partition_by("name")
)
for df_results_discipline_recent_athlete in list_df_results_discipline_recent_athletes:
    ##Sort the athlete's results by date
    df_results_discipline_recent_athlete = df_results_discipline_recent_athlete.sort(
        "date"
    )

    ##One trace per athlete: "place | date" labels against results
    ##NOTE(review): the role of the "bib" array depends on figures.scatter_plot (presumably hover text) - confirm
    fig = figures.scatter_plot(
        fig,
        df_results_discipline_recent_athlete["place_date"].to_numpy(),
        df_results_discipline_recent_athlete["result"].to_numpy(),
        df_results_discipline_recent_athlete["bib"].to_numpy(),
        "markers+lines",
        ##Trace name: athlete name with "_" replaced by " " and title-cased
        df_results_discipline_recent_athlete["name"]
        .str.replace_all("_", " ")
        .item(0)
        .title(),
        "black",
        "",
        "Result",
        "Last results",
        subtitle=f"{(' & ').join(discipline)}",
        ##Range given as [60, 0] - presumably to draw the best results at the top (TODO confirm)
        yrange=[60, 0],
        marker_size=10,
        marker_symbol="circle",
        marker_contour_color="ghostwhite",
        line_dash="solid",
    )

##Reference lines at results 1 (win), 3 (podium) and 10 (top 10)
fig.add_hline(y=1, line_color="gold", line_dash="dot", line_width=2)
fig.add_hline(y=3, line_color="silver", line_dash="dot", line_width=2)
fig.add_hline(y=10, line_color="lightskyblue", line_dash="dot", line_width=2)

##Hide every trace initially; they remain listed in the legend
fig.update_traces(visible="legendonly")

##Display figure
fig.show()

## Race statistics

In [10]:
##Set race place(s) and discipline(s); both are lists, so several values can be combined
##NOTE: this reassigns the `discipline` variable already set in the "Recent shape" section
place = ["Cortina"]
discipline = ["SG"]

##Keep only the results matching both one of the places and one of the disciplines
df_results_race_athletes_current = df_results_alltime_athletes_current.filter(
    pl.col("place").is_in(place) & pl.col("discipline").is_in(discipline)
)

### All time

In [11]:
##Compute per-athlete statistics on the given race, over all available results
df_statistics_race_alltime = compute_statistics_race(df_results_race_athletes_current)

##Keep athletes with at least 1 top 10, or with more starts than the median number of starts
n_starts_median = df_statistics_race_alltime["n_starts"].median()
aux_df_statistics_race_alltime = df_statistics_race_alltime.filter(
    (pl.col("n_tops_10") >= 1) | (pl.col("n_starts") > n_starts_median)
)

##Statistics chart
fig = bar_plot_statistics_race(
    aux_df_statistics_race_alltime, place, discipline, "All time"
)
fig.show()

### Recent years

In [12]:
##Look-back window of n_years * 365 + 100 days before today
##NOTE(review): the extra 100 days are presumably a margin so that the edition held n_years ago is included - confirm
n_years = 5
date_oldest = dates.change_date(datetime.today(), "past", n_days=n_years * 365 + 100)

##Compute per-athlete statistics on the given race, restricted to results on or after date_oldest
df_statistics_race_recent = compute_statistics_race(
    df_results_race_athletes_current.filter(pl.col("date") >= date_oldest)
)

##Keep only athletes with at least 1 top 10 (no "many starts" criterion here, unlike the all-time chart)
aux_df_statistics_race_recent = df_statistics_race_recent.filter(
    (pl.col("n_tops_10") >= 1)
)

##Statistics chart
fig = bar_plot_statistics_race(
    aux_df_statistics_race_recent, place, discipline, f"Last {n_years} years"
)
fig.show()

## EDT (L'Étape du Tour de France 2024)

In [4]:
import os
import time
import polars as pl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Load all results

In [None]:
# Start Chrome and open the event results page
driver = webdriver.Chrome()
driver.get("https://resultscui.active.com/events/L%C3%89tapeduTourdeFrance2024")

# Close the cookie banner. Wait until its button is clickable instead of
# looking it up immediately, since the banner may not be present right after
# the page load returns.
button_cookies = WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#onetrust-close-btn-container > button")
    )
)
button_cookies.click()

try:
    # Load every result by clicking the "load more" link until it disappears
    while True:
        # Give the page a moment to append the newly loaded results
        time.sleep(2)

        # Find and click the "load more" link
        # NOTE(review): if the link needs more than 2 seconds to come back, the
        # loop stops early and reports that everything is loaded - check the
        # number of scraped rows afterwards.
        try:
            load_more_button = driver.find_element(
                By.CSS_SELECTOR,
                "#root > div.app > div.app__content > div.page-body > div > div > div > div > div > div:nth-child(2) > div > div.view-more-list > div.view-more-list__footer > a",
            )
            load_more_button.click()
        except NoSuchElementException:
            print("Tous les résultats sont chargés.")
            break

except Exception as e:
    # Best effort: report the error and keep the browser open, so the results
    # loaded so far can still be scraped by the next cell (they may be partial).
    print(f"Erreur rencontrée : {e}")

Tous les résultats sont chargés.


### Get all results

**Webscraping**

In [4]:
# Generic CSS selector matching one element per athlete in the results list
athletes = driver.find_elements(
    By.CSS_SELECTOR,
    "#root > div.app > div.app__content > div.page-body > div > div > div > div > div > div:nth-child(2) > div > div.view-more-list > div:nth-child(1) > div",
)

# Collect one row per athlete
data = []
for athlete in athletes:
    ##Rank, bib and finish time are read from their dedicated elements
    rank = athlete.find_element(By.CLASS_NAME, "event-home__rank").text
    aux_bib = athlete.find_element(By.CLASS_NAME, "event-home__bib")
    bib = aux_bib.find_element(By.CLASS_NAME, "event-home__result").text

    ##The first link of the row gives both the athlete name and the url of their page
    link = athlete.find_element(By.TAG_NAME, "a")
    url = link.get_attribute("href")
    name = link.text

    ##Raw info text (split into sex and age during preprocessing)
    aux_name = athlete.find_element(By.CLASS_NAME, "event-home__person")
    info = aux_name.find_element(By.CLASS_NAME, "event-home__info").text
    time_race = athlete.find_element(By.CLASS_NAME, "event-home__finish").text

    ##Aggregate data
    data.append([rank, name, time_race, bib, info, url])

##Create a dataframe; each inner list is one row, so state the orientation
##explicitly instead of letting polars infer it from the data dimensions
df = pl.DataFrame(
    data, schema=["result", "name", "time", "bib", "info", "url"], orient="row"
)

##Quit driver
driver.quit()

**Preprocessing**

In [8]:
##The raw "info" text is split once on " | Âge ": first part -> sex, second part -> age
info_parts = pl.col("info").str.split(" | Âge ")

##Type every scraped text column in a single pass, then drop the raw "info" column
df_pp = df.with_columns(
    pl.col("result").cast(pl.Int16),
    ##Names become lower-case with "_" instead of spaces
    pl.col("name").str.to_lowercase().str.replace_all(" ", "_"),
    ##strip_chars removes any of the listed characters from both ends (not a fixed prefix)
    pl.col("time").str.strip_chars("\nArrivée").str.to_time("%H:%M:%S"),
    pl.col("bib").cast(pl.Int16),
    info_parts.list.get(0).alias("sex"),
    info_parts.list.get(1).cast(pl.Int8).alias("age"),
).drop("info")

## replace " "  by "_" again ?

# ##
# df_pp.write_parquet(
#     "/Users/qdouzery/Desktop/df_results_edt-2024.parquet"
# )

### Get athletes details

In [27]:
##Load the preprocessed results table; the download loop below reads its "name" and "url" columns
##NOTE(review): this path differs from the one in the commented-out write_parquet call of the
##preprocessing cell - presumably the file was moved by hand, confirm
df_results = pl.read_parquet(
    "/Users/qdouzery/Desktop/edt-2024_results/df_results_edt-2024.parquet"
)

In [52]:
def download_splits_athlete(
    driver,
    athlete_name,
    url,
    save=False,
    save_dir="/Users/qdouzery/Desktop/edt-2024_results/splits",
):
    """
    Scrape the split times of one athlete from their detailed results webpage.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used to open the page.
    athlete_name : str
        Name stored in the "athlete_name" column and used to build the file name.
    url : str
        Address of the athlete's detailed results webpage.
    save : bool
        When True, write the preprocessed dataframe to a parquet file.
    save_dir : str or path-like
        Directory of the parquet file. Defaults to the location originally
        hard-coded in this function.

    Returns
    -------
    pl.DataFrame
        One row per split (the "Start" entry is skipped) with columns
        "athlete_name", "split", "time_split" and "time_total"; both time
        columns are parsed with "%H:%M:%S" and are null where the page shows "--".

    Raises
    ------
    selenium TimeoutException
        If the list of splits is not found within 15 seconds.
    IndexError
        If a split entry has fewer text lines than expected.
    """

    ##Go to athlete detailed results webpage
    driver.get(url)

    ##Find detailed results (wait up to 15 s for the list to be present)
    aux_results_details = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located(
            (
                By.CSS_SELECTOR,
                "#root > div.app > div.app__content > div.page-body > div > div > div > div > div > div > ul",
            )
        )
    )
    results_details = aux_results_details.find_elements(By.TAG_NAME, "li")

    ##Iterate through each split
    data = []
    for split in results_details:
        list_info = split.text.split("\n")
        name = list_info[0]

        ##Skip the start entry (it carries no time information)
        if name == "Start":
            continue

        ##Split time is read from the 2nd text line and total time from the 4th one
        if len(list_info) < 4:
            raise IndexError(
                f"Unexpected split layout for {athlete_name}: {list_info!r}"
            )
        time_split = list_info[1]
        time_total_split = list_info[3]

        ##Aggregate data
        data.append([athlete_name, name, time_split, time_total_split])

    ##Create a dataframe; explicit string dtypes keep the columns typed as
    ##strings even when no split was scraped
    df_splits = pl.DataFrame(
        data,
        schema={
            "athlete_name": pl.Utf8,
            "split": pl.Utf8,
            "time_split": pl.Utf8,
            "time_total": pl.Utf8,
        },
        orient="row",
    )
    print(f"Splits of {athlete_name} downloaded.")

    ##Preprocess dataframe: "--" placeholders become null, then times are parsed
    df_splits_pp = df_splits.with_columns(
        [
            pl.when(pl.col(column) == "--")
            .then(None)
            .otherwise(pl.col(column))
            .str.to_time("%H:%M:%S")
            .alias(column)
            for column in ("time_split", "time_total")
        ]
    )

    ##Save dataframe (file names use "-" where names use "_")
    if save:
        df_splits_pp.write_parquet(
            f"{save_dir}/df_splits_{athlete_name.replace('_', '-')}.parquet"
        )

    return df_splits_pp

In [56]:
##Run Chrome without a visible window
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)
try:
    ##Open one participant page first so the cookie banner can be closed once
    driver.get(
        "https://resultscui.active.com/participants/46156532?pn=7a807c21466b7cf879dd60d4d73c6ef6a1eb2730967ba60d625d393eb1f6d1b1"
    )

    ##Close cookies (wait for the button rather than looking it up immediately)
    button_cookies = WebDriverWait(driver, 15).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#onetrust-close-btn-container > button")
        )
    )
    button_cookies.click()

    ##Iterate through athletes (name and url of their splits page)
    for athlete_name, url in zip(df_results["name"], df_results["url"]):
        ##Skip athletes whose detailed results have already been downloaded
        ##NOTE: the file name only depends on the athlete name, so an athlete
        ##sharing a name with an already downloaded one is skipped as well
        if not (
            os.path.isfile(
                f"/Users/qdouzery/Desktop/edt-2024_results/splits/df_splits_{athlete_name.replace('_', '-')}.parquet"
            )
        ):
            ##Download, preprocess and save the athlete's splits
            df_splits_pp = download_splits_athlete(driver, athlete_name, url, save=True)
finally:
    ##Quit driver even when a download fails, so no headless Chrome is left running
    driver.quit()

Splits of didier_mazel downloaded.
Splits of bastien_melotto downloaded.
Splits of julien_teyssier downloaded.
Splits of stephen_mcelwee downloaded.
Splits of raphaele_thevenin downloaded.
Splits of jon_orchard downloaded.
Splits of philipp_wittenhorst downloaded.
Splits of alain_jamain downloaded.
Splits of alexis_picot downloaded.
Splits of philippe_emonin downloaded.
Splits of emmanuel_foucher downloaded.
Splits of martin_meyer downloaded.
Splits of vincent_ligier downloaded.
Splits of gregoire_charpentier downloaded.
Splits of aymeric_barbarot downloaded.
Splits of alexandre_voigt downloaded.
Splits of philip_hu downloaded.
Splits of herve_glenisson downloaded.
Splits of estelle_le downloaded.
Splits of karina_kase downloaded.
Splits of johan_colliat downloaded.
Splits of gareth_griffiths downloaded.
Splits of bernard_husson downloaded.
Splits of david_krej?i downloaded.
Splits of peter_rutherford downloaded.
Splits of alexandre_boileau downloaded.
Splits of jennifer_blin downloade

# www.robotstxt.org/
# www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449

User-agent: *
Disallow: /admin
Disallow: /sonatadmin
Disallow: /api
Disallow: /graphql
Disallow: /login
Disallow: /*/ajax
Disallow: /*/block
Disallow: /*/login
Disallow: /*/sitemap