# Gathering data from procyclingstats.com

This notebook scrapes the input data from [procyclingstats.com](https://www.procyclingstats.com/) using the [**procyclingstats**](https://github.com/themm1/procyclingstats) scraping library. I add some high-level cleaning and assembling functionality on top to make the scraping easier.

It collects:
- For a large number of riders from the best teams...
- Metadata for each rider, but most importantly...
- Their results in one-day or multi-stage...
- High-level races...
- For up to a few years in the past

The data is transformed into a simple matrix (pandas DataFrame) format, so that it can be used in the next step's algorithm to find hidden factors (called embeddings) determining a racer's and a race's profile. All of this while having to specify very little about the type of race! Ready, set, go!

A script version of this notebook is in `scripts/scrape.py`.

## Imports

In [None]:
import re
import numpy as np
import pandas as pd
from unidecode import unidecode
from sklearn.feature_extraction import DictVectorizer
from procyclingstats import (
    Race,          # Race("race/tour-de-france/2022/overview").parse()
    Rider,         # Rider("rider/tadej-pogacar").parse()
    Stage,         # Stage("race/tour-de-france/2018/stage-18").parse()
    Team,          # Team("team/bora-hansgrohe-2021").parse()
    RiderResults,  # RiderResults("rider/alberto-contador/results").parse()
    RaceStartlist,
    RaceClimbs,
    Ranking        # Ranking("rankings/me/individual").parse() --> Summation of PCS points over a 12-month + 2 weeks overlap period
)

The scraping classes I focus on are: `Race`, `Rider`, `Stage`, and `Team`.

## Functions

In [None]:
def try_to_parse(obj, slug, printit=False):
    """Build a scraper for ``slug`` and return its parsed content, or None on failure.

    Args:
        obj: Callable (for example a procyclingstats scraper class) that takes
            the slug and returns an object exposing a ``parse()`` method.
        slug: Identifier passed to ``obj``.
        printit: When true, announce the slug before parsing starts.

    Returns:
        Whatever ``obj(slug).parse()`` returns, or ``None`` if constructing the
        scraper or parsing raised an ``Exception``.
    """
    if printit:
        print(f"Parsing > {slug} ...")

    try:
        return obj(slug).parse()
    except Exception:
        # Best effort: one failing page must not abort the whole scrape.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be stopped.
        print(f"Oopsie! This one failed: {slug}")
        return None

def parse_results_from_stage(slug, parsed):
    """Extract ``(rider_name, rank)`` pairs from a parsed stage.

    The last path segment of ``slug`` selects which entry of ``parsed`` is
    read: ``gc`` reads ``parsed["gc"]`` and ``result`` reads
    ``parsed["results"]``.

    Returns:
        A list of ``(rider_name, rank)`` tuples, or ``None`` when ``parsed`` is
        None, the slug ends in anything else, or the selected entry is None.
    """
    if parsed is None:
        return None

    key_by_suffix = {"gc": "gc", "result": "results"}
    suffix = slug.split("/")[-1]
    if suffix not in key_by_suffix:
        return None

    entries = parsed[key_by_suffix[suffix]]
    if entries is None:
        return None

    # e.g. [(WVA, 1), (MVDP, 2), (Pogiboy, 3), ...]
    return [(entry["rider_name"], entry["rank"]) for entry in entries]

def clean_rider_name(name):
    """Remove tab characters, then collapse each whitespace run into one space."""
    without_tabs = name.replace("\t", "")
    return re.sub(r"\s+", " ", without_tabs)

def convert_name_to_slug(name):
    """Turn 'FAMILY NAME First Name' into the slug 'first-name-family-name'.

    Names listed in the override table are returned from that table directly.
    Otherwise the space-separated tokens are split into all-uppercase tokens
    (treated as family name) and the rest (treated as given name); the given
    name comes first, everything is lower-cased, joined with hyphens and
    transliterated to ASCII.
    """
    # Important riders whose slug differs from the name they are listed under.
    manual_slugs = {
        "CORT Magnus": "magnus-cort-nielsen",
        "AYUSO Juan": "juan-ayuso-pesquera",
        "FROOME Chris": "christopher-froome",
        "DUNBAR Eddie": "edward-irl-dunbar",
        "RODRÍGUEZ Carlos": "carlos-rodriguez-cano",
        "BARTA Will": "william-barta",
        "HONORÉ Mikkel Frølich": "mikkel-honore",
        "HERRADA Jesús": "jesus-herrada-lopez",
        "GROßSCHARTNER Felix": "felix-grossschartner",
        "DREßLER Luca": "luca-dressler",
        "BUITRAGO Santiago": "santiago-buitrago-sanchez",
        "CHAVES Esteban": "johan-esteban-chaves",
        "MÜLLER Tobias": "tobias-muller1",
        "RÜEGG Timon": "timin-ruegg",
        "SCULLY Tom": "thomas-scully",
        "SKJELMOSE Mattias": "mattias-skjelmose-jensen",
        "VALGREN Michael": "michael-valgren-andersen",
        "VINGEGAARD Jonas": "jonas-vingegaard-rasmussen",
        "WRIGHT Fred": "alfred-wright",
    }
    override = manual_slugs.get(name)
    if override is not None:
        return override

    # Single pass over the tokens, preserving their relative order per group.
    given_parts, family_parts = [], []
    for token in name.split(" "):
        target = family_parts if token.isupper() else given_parts
        target.append(token.lower())

    slug = "-".join(given_parts + family_parts)
    slug = slug.replace("--", "-").replace("'", "-")
    return unidecode(slug)

## Config

In [None]:
# Seasons to scrape: every race of the calendar is looked up once per year listed here.
YEARS = [2022, 2023]

I use the 2023 races as the base calendar, including only UCI WorldTour, UCI ProSeries, and Europe Tour races. Of course, races (and race names) change over the years, but not by much. U23 (xU) and championship (NN/CC) races are dropped. I also had to remove a few duplicates. The idea is that we deduce the most important riders based on who participated in these races. Doing the inverse seems less straightforward with the API package.

In [None]:
# Races whose end date is later than this day are skipped when the stage list is built.
# The value is compared as a plain string, hence the sortable YYYY-MM-DD form.
CUTOFFDATE = "2023-04-30"
print(CUTOFFDATE)

In [None]:
# Race calendar: semicolon-separated, Latin-1 encoded; rows with any missing value are discarded.
df_races = pd.read_csv("../data/races.csv", sep=";", encoding="latin-1").dropna()

In [None]:
# Distinct race classes present in the calendar.
df_races.Class.unique().tolist()  # 1.x = one-day race, 2.x = multi-day race & .UWT > .Pro > .1 > .2

## Parse results

In [None]:
# Build one row per result page to scrape: for every year and every race in the
# calendar, either the single one-day result or the general classification plus
# each stage result.
df_races_out_list = []
for year in YEARS:
    races, classes, stages = [], [], []
    for _, row in df_races.iterrows():
        # Assumes the calendar has exactly four columns in this order;
        # the second one is not used here.
        race_key, _, race_class, race_slug = row

        stage_slug_base = f"race/{race_slug}/{year}"  # has general classification if multi-stage race
        race_p = try_to_parse(Race, f"{stage_slug_base}/overview")
        if race_p is None:
            continue

        # do not process if race end date is beyond dataset cutoff date
        # but keep going, because races are not ordered chronologically
        if race_p["enddate"] > CUTOFFDATE:
            continue

        if race_p["is_one_day_race"] is True:
            stage_slugs = [f"{stage_slug_base}/result"]  # one-day race
        elif race_p.get("stages") is not None:
            stage_slugs = [f"{stage_slug_base}/gc"] + [f"{s['stage_url']}/result" for s in race_p["stages"]]  # multiple stages
        else:
            # Neither a one-day race nor a usable stage list. Skip it rather
            # than fall through with `stage_slugs` undefined or still holding
            # the previous race's pages.
            continue

        races += [race_key] * len(stage_slugs)
        classes += [race_class] * len(stage_slugs)
        stages += stage_slugs

    df_races_out_list.append(pd.DataFrame({"year": year, "race": races, "class": classes, "stage_slug": stages}))

# ignore_index gives the combined frame a unique row index instead of repeating
# each year's 0..n-1 labels.
df_races_out = pd.concat(df_races_out_list, ignore_index=True)

In [None]:
# Sanity check: number of result pages collected, plus a preview of the first rows.
print(len(df_races_out))
df_races_out.head(15)

In [None]:
# Parse every collected result page. `try_to_parse` yields None for a page that
# fails, so this column can contain missing values.
df_races_out["parsed"] = df_races_out["stage_slug"].apply(lambda x: try_to_parse(Stage, x))

In [None]:
# handy to keep track of issues
stages_not_parsed = df_races_out[df_races_out.parsed.isnull()]["stage_slug"].tolist()
print(f"{len(stages_not_parsed)} out of {len(df_races_out)} race results were not parsed")

In [None]:
# Modifies df_races_out in place; only rows with a missing "parsed" value are removed.
df_races_out.dropna(subset=["parsed"], inplace=True)  # drop stages that couldn't be parsed

In [None]:
# Row-wise: turn each parsed page into a list of (rider name, rank) tuples.
# `parse_results_from_stage` gives None when the slug is neither a "gc" nor a
# "result" page, or when the page holds no results.
df_races_out["results"] = df_races_out[["stage_slug", "parsed"]].apply(lambda x: parse_results_from_stage(*x), axis=1)

In [None]:
# (rows, columns) of the frame after dropping the pages that could not be parsed.
df_races_out.shape

In [None]:
# Peek at a single field of the first parsed page.
df_races_out["parsed"].iloc[0]["race_startlist_quality_score"]

In [None]:
# List the fields available in the first parsed page.
df_races_out["parsed"].iloc[0].keys()

In [None]:
# Pivot the per-page result lists into one wide matrix:
# one row per result page, one column per rider, cell value = rank.
vec = DictVectorizer()

# One {rider_name: rank} mapping per page; pages without results become empty mappings.
measurements = df_races_out["results"].apply(lambda x: {} if x is None else dict(x))

# Strip only the leading "race/" from the slug. The pattern is anchored because
# an unanchored replace also removes later occurrences, e.g. in a race slug
# that itself ends in "-race" followed by "/<year>".
stage_labels = df_races_out["stage_slug"].str.replace(r"^race/", "", regex=True)

df_results = pd.DataFrame(
    vec.fit_transform(measurements).toarray(),
    columns=vec.get_feature_names_out(),
    # set year, stage slug, and class as indices
    index=pd.MultiIndex.from_frame(pd.concat([df_races_out["year"],
                                              stage_labels,
                                              df_races_out["class"]],
                                             axis=1))
)

df_results.replace(0, np.nan, inplace=True)  # initially NaN = did not finish race, 0 = did not participate; this replace() drops distinction

In [None]:
# Random spot check of five rows of the results matrix.
df_results.sample(5)

In [None]:
# Example: keep the columns whose label matches this rider, drop the rows that
# contain a NaN in them, and show the entries indexed under year 2022.
# NOTE(review): this runs before the column labels are whitespace-cleaned;
# confirm the raw label matches the pattern.
df_results.filter(regex="VAN AERT Wout").dropna().loc[2022]

In [None]:
# Print the shape before and after removing the rows that hold no rank at all.
print(df_results.shape)
df_results = df_results.dropna(axis=0, how="all")  # drop races that were cancelled or couldn't be parsed
print(df_results.shape)

In [None]:
# Normalise whitespace in the rider names used as column labels
# (tabs removed, whitespace runs collapsed to one space).
df_results.columns = [clean_rider_name(c) for c in df_results.columns]

## Parse riders data

In [None]:
# Look up birth date and nationality for every rider that appears as a column
# of the results matrix. The three lists stay index-aligned: position i of
# `birth_dates` and `nationalities` belongs to `riders_all[i]`.
riders_all = sorted(df_results.columns)

birth_dates, nationalities = [], []
for rider_name in riders_all:
    rider_slug = convert_name_to_slug(rider_name)
    try:
        rider = Rider(f"rider/{rider_slug}")
        # Fetch both fields before appending either one, so that a failure on
        # the second lookup cannot leave the two lists with different lengths.
        birth_date, nationality = rider.birthdate(), rider.nationality()
    except (ValueError, AttributeError):
        print(f"Damn! Rider not found: {rider_name} --> {rider_slug}")
        birth_date, nationality = None, None
    birth_dates.append(birth_date)
    nationalities.append(nationality)

In [None]:
# One row per rider column of the results matrix; birth date and nationality
# are None where the rider lookup failed.
df_riders = pd.DataFrame({
    "name": riders_all,
    "birth_date": birth_dates,
    "nationality": nationalities
})

In [None]:
# Display the rider metadata table.
df_riders

In [None]:
# Drop, in place, every rider with a missing field; shapes are printed before and after.
print(df_riders.shape)
df_riders.dropna(inplace=True)
print(df_riders.shape)

## Merge and store data

In [None]:
# Keep only the result columns of riders that still have metadata.
# The names are collected into a set once, so each membership test is O(1)
# instead of rebuilding and scanning a list for every column.
known_riders = set(df_riders["name"])
df_results = df_results[[r for r in df_results.columns if r in known_riders]]

In [None]:
# Sanity check: number of riders with metadata vs. number of result columns
# (expected to be equal after the column filter above).
df_riders.shape[0], df_results.shape[1]

In [None]:
# Persist the rider metadata; the positional row index is not written.
df_riders.to_csv("../data/riders_data.csv", index=False)

In [None]:
# Persist the results matrix; index=True writes the row index alongside the rider columns.
df_results.to_csv("../data/matrix_race_results.csv", index=True)