# Compute embeddings from race ranking data

Here's the next step! The algorithm finds hidden factors (called embeddings) that summarize a racer's and a race's profile. Inspired by this [Kaggle notebook](https://www.kaggle.com/code/sborms/collaborative-filtering-deep-dive).

A script version of part of this notebook is in `scripts/train.py`.

In [None]:
# Smoke-test cell: prints "Let's go" wrapped in ANSI escape sequences
# (\33 is the octal escape for ESC; [1m = bold, [33m = yellow, [0m = reset).
print(f"\33[1m\33[33mLet's go\33[0m!")

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastai.collab import *
from fastai.tabular.all import *
from torch import nn

## Functions

In [None]:
def normalize_results_by_race(df, how):
    """Normalize a matrix of race finishing positions.

    Args:
        df: DataFrame of finishing positions. The row-wise ranking below treats
            each row as one race and each column as one rider; NaN means the
            rider did not participate/finish.
        how: Normalization scheme, one of:
            - "0-1": percentile rank within each row; the best (lowest)
              position gets 1.0 and the worst gets 1/n, NaN is kept.
            - "1-20": positions are capped at 20. The logic is inverted here:
              higher values indicate lower performance.
            - "bins": positions are bucketed into 5 (podium), 4 (top-5),
              3 (top-10), 2 (top-20) and 1 (position 21-200, not in
              contention). Positions outside [1, 200] become NaN.

    Returns:
        A DataFrame shaped like ``df`` holding the normalized results. With
        "bins" the columns are categorical; cast with ``astype(float)`` if
        numeric values are needed.

    Raises:
        ValueError: If ``how`` is not one of the supported schemes (previously
            the function silently returned ``None``).
    """
    if how == "0-1":
        return df.rank(axis=1, pct=True, ascending=False, na_option="keep")
    if how == "1-20":
        return df.clip(upper=20)
    if how == "bins":
        return df.apply(
            lambda col: pd.cut(
                col,
                bins=[1, 3, 5, 10, 20, 200],  # podium, top-5, top-10, top-20, not in contention
                labels=[5, 4, 3, 2, 1],  # from best to worst race result
                include_lowest=True,  # make the first bin closed on the left so position 1 is included
            )
        )
    raise ValueError(f"Unknown normalization {how!r}; expected '0-1', '1-20' or 'bins'.")

def get_year_weight(year, decay=0.25, curr_year=None):
    """Give more weight to current and more recent years.

    The weight decays exponentially with the age of the result:
    ``exp(-decay * (curr_year - year))``.

    Args:
        year: Year of the race (integer).
        decay: Decay factor; if set higher, earlier years receive less weight.
        curr_year: Reference year that gets weight 1. Defaults to the
            notebook-level ``CURR_YEAR`` constant, looked up at call time.

    Returns:
        The multiplicative weight for results of ``year``.
    """
    # Original author's note: the learned bias seems to be impacted by how
    # long riders are active.
    if curr_year is None:
        curr_year = CURR_YEAR
    return np.exp(-decay * (curr_year - year))

def get_race_class_weight(race_class):
    """Give more weight to most important races.

    Looks up the weight for a race-class label ("UWT", "Pro", "1" or "2").
    Any other label raises ``KeyError``.
    """
    weight_by_class = {
        "UWT": 2,
        "Pro": 1.5,
        "1": 0.75,
        "2": 0.5,
    }
    return weight_by_class[race_class]

def get_stage_weight(stage: bool):
    """Give less weight to stages from a multi-stage race.

    Args:
        stage: Whether the result belongs to a single stage of a multi-stage
            race. Both Python ``bool`` and ``numpy.bool_`` are accepted.

    Returns:
        0.8 for a stage, 1 otherwise (including for any non-boolean input,
        e.g. NaN).
    """
    # `stage is True` is an identity check and is False for numpy.bool_(True),
    # so a NumPy boolean would silently get the default weight. Check the type
    # explicitly instead, which still rejects truthy non-booleans such as NaN.
    is_stage = isinstance(stage, (bool, np.bool_)) and bool(stage)
    return 0.8 if is_stage else 1

def get_gc_weight(gc: bool):
    """Give more weight to general classification outcomes.

    Args:
        gc: Whether the result is a general classification (overall) outcome.
            Both Python ``bool`` and ``numpy.bool_`` are accepted.

    Returns:
        1.25 for a general classification result, 1 otherwise (including for
        any non-boolean input, e.g. NaN).
    """
    # `gc is True` is an identity check and is False for numpy.bool_(True),
    # so a NumPy boolean would silently get the default weight. Check the type
    # explicitly instead, which still rejects truthy non-booleans such as NaN.
    is_gc = isinstance(gc, (bool, np.bool_)) and bool(gc)
    return 1.25 if is_gc else 1

def extract_factors(learn, dim):
    """Return the embedding weights of the learner's model for one dimension.

    "rider" reads ``learn.model.u_weight.weight`` and "stage" reads
    ``learn.model.i_weight.weight``; any other ``dim`` yields ``None``.
    """
    if dim == "rider":
        return learn.model.u_weight.weight
    if dim == "stage":
        return learn.model.i_weight.weight
    return None

def extract_bias(learn, dim):
    """Return the squeezed bias weights of the learner's model for one dimension.

    "rider" reads ``learn.model.u_bias.weight`` and "stage" reads
    ``learn.model.i_bias.weight``; the result has ``.squeeze()`` applied.
    Any other ``dim`` yields ``None``.
    """
    if dim == "rider":
        bias_layer = learn.model.u_bias
    elif dim == "stage":
        bias_layer = learn.model.i_bias
    else:
        return None
    return bias_layer.weight.squeeze()

def extract_most_similar_elements(learn, dim="rider", element="VAN AERT Wout", n=20):
    """Return the ``n`` elements whose embeddings are most similar to ``element``.

    Similarity is the cosine similarity between embedding vectors of the
    given dimension.

    Args:
        learn: Trained learner; ``learn.dls.classes[dim]`` provides the
            label vocabulary and its ``o2i`` label-to-index mapping.
        dim: "rider" or "stage".
        element: Label of the rider/stage to compare against.
        n: Number of similar elements to return.

    Returns:
        The vocabulary entries of the ``n`` most similar elements, most
        similar first, excluding ``element`` itself.

    Raises:
        ValueError: If ``dim`` is invalid or ``element`` is not in the
            vocabulary.
    """
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if dim not in ("rider", "stage"):
        raise ValueError("Dimension should be 'rider' or 'stage'.")

    vocab = learn.dls.classes[dim]
    # NOTE(review): fastai's CategoryMap appears to back `o2i` with a defaultdict
    # when an "#na#" entry is added, so an unknown label would silently resolve
    # to index 0 instead of failing - confirm. A membership test never inserts.
    if element not in vocab.o2i:
        raise ValueError(f"Unknown {dim}: {element!r}.")
    idx = vocab.o2i[element]

    factors = extract_factors(learn, dim)
    sim = nn.CosineSimilarity(dim=1)(factors, factors[idx][None])
    # pd.Series(sim.detach()).sort_values(ascending=False).reset_index(drop=True).plot()  # twisted S-shape

    # Drop the query element by index rather than assuming it sorts first:
    # another element with an identical embedding could otherwise displace it.
    ranked = sim.argsort(descending=True)
    idx_topn = ranked[ranked != idx][:n]
    return vocab[idx_topn]

def plot_pca(df, learn, dim, n_plot=50):
    """Scatter-plot embeddings of the most frequent elements in PCA space.

    Elements of ``dim`` ("rider" or "stage") are ordered by their number of
    non-null ``result`` values in ``df``. Their embeddings are reduced to
    three principal components, and the first ``n_plot`` elements are plotted
    on the first and third component, labelled in random colours.

    Args:
        df: Long-format DataFrame with a ``dim`` column and a ``result`` column.
        learn: Trained learner providing the embeddings and the vocabulary.
        dim: Column of ``df`` / embedding dimension to plot.
        n_plot: Number of elements to draw; indexing fails if it exceeds the
            number of distinct elements.
    """
    n_results = df.groupby(dim)["result"].count()
    # ordered by riders with most races, or races with most participants
    labels = n_results.sort_values(ascending=False).index.values
    label_to_idx = learn.dls.classes[dim].o2i
    top_idxs = tensor([label_to_idx[label] for label in labels])

    embeddings = extract_factors(learn, dim)[top_idxs].cpu().detach()

    # `pca` is not a plain torch method - presumably patched onto tensors by fastai
    components = embeddings.pca(3)
    fac0, _, fac2 = components.t()
    shown = list(range(n_plot))
    xs = fac0[shown]
    ys = fac2[shown]

    plt.figure(figsize=(7, 7))
    plt.scatter(xs, ys)
    for label, x, y in zip(labels[shown], xs, ys):
        plt.text(x, y, label, color=np.random.rand(3)*0.7, fontsize=9)
    plt.show()

## Config

In [None]:
# Notebook-wide configuration constants, read by the cells and helpers of this notebook.
BS = 64  # batch size
NORMALIZE_HOW = "bins"  # "0-1", "1-20", "bins"
MIN_N_PARTICIPATIONS = 20  # a rider is considered only if they did at least this amount of race participations
N_FACTORS = 10  # number of hidden factors
# NOTE(review): the targets are scaled by the product of the year, class, stage and GC
# weights, not by the class weight alone. With "bins" a current-year "UWT" general
# classification podium becomes 5 * 2 * 1.25 = 12.5, above the 10.5 upper bound here - confirm.
Y_RANGE = (0, 5.25 * 2)  # (0, 1) or (1, 20.5) or (0, 5.25), multiply by max. of race class weighting
CURR_YEAR = 2023  # reference year used by get_year_weight (results of this year get weight 1)

## Compute embeddings

In [None]:
# Load the race-results matrix. The first three CSV columns become the row index
# (referred to below as year / stage_slug / class); the remaining columns are riders.
df_results = pd.read_csv("../data/matrix_race_results.csv",
                         index_col=[0, 1, 2],
                         dtype={"year": str, "stage_slug": str, "class": str})  # read the key columns as strings

In [None]:
# Keep only riders (columns) with at least MIN_N_PARTICIPATIONS non-null results.
df_results = df_results[df_results.columns[df_results.count(axis=0) >= MIN_N_PARTICIPATIONS]]
df_results.columns = df_results.columns.str.strip()  # some columns have trailing whitespaces

In [None]:
# Spot check: raw results of one rider (columns whose name matches the regex), first non-null rows.
df_results.filter(regex="VAN AERT Wout").dropna().head()

In [None]:
# Normalize the results with the configured scheme.
df_results = normalize_results_by_race(df_results, how=NORMALIZE_HOW)
df_results = df_results.astype(float)  # force a numeric dtype so the results can be multiplied by the weights below

In [None]:
# Spot check: the same rider's results after normalization.
df_results.filter(regex="VAN AERT Wout").dropna().head()

In [None]:
# One row per race, in the same order as df_results, with the index levels as plain columns.
df_reweight = df_results.index.to_frame().reset_index(drop=True)

In [None]:
# Build the per-race weight "w" as the product of four components.
df_reweight["w_year"] = df_reweight["year"].astype(int).apply(get_year_weight)
# the part of the class label after the first "." ("UWT", "Pro", "1", "2") selects the class weight
df_reweight["w_class"] = df_reweight["class"].str.partition(".")[2].apply(get_race_class_weight)
df_reweight["w_stage"] = df_reweight["stage_slug"].str.contains("/stage-").apply(get_stage_weight)
# General classification = overall result (slug ends with "/") of a multi-stage race.
# The class must START with "2." for that: the previous `str.contains("2")` also matched
# one-day classes such as "1.2", which then wrongly received the GC bonus.
df_reweight["w_gc"] = ((df_reweight["class"].str.startswith("2.")) & (df_reweight["stage_slug"].str.endswith("/"))).apply(get_gc_weight)
df_reweight["w"] = df_reweight["w_year"] * df_reweight["w_class"] * df_reweight["w_stage"] * df_reweight["w_gc"]
df_reweight.set_index(["year", "stage_slug", "class"], inplace=True)

In [None]:
# Overview of the distinct combined weights (rounded to 2 decimals, ascending).
print(sorted(np.round(df_reweight["w"].unique(), 2)))

In [None]:
# scale race results by weights
# The (n_races, 1) weight column broadcasts across all rider columns. The product is
# positional, so it relies on df_reweight having the same row order as df_results.
df_results.loc[:, :] = df_results.to_numpy() * df_reweight[["w"]].to_numpy()

In [None]:
# Spot check: five random weighted results of the same rider.
df_results.filter(regex="VAN AERT Wout").dropna().sample(5)

In [None]:
# Reshape the race x rider matrix into long format, one row per (rider, stage) pair:
# rider = user, stage (race) = item, result = rating.
df = (
    df_results.reset_index(drop=False)
    .drop(columns=["year", "class"])
    .melt(id_vars="stage_slug")
    .rename(columns={"stage_slug": "stage", "variable": "rider", "value": "result"})
    [["rider", "stage", "result"]]
)

In [None]:
# Keep only observed (rider, stage) pairs, then show how many distinct riders and stages remain.
df_ = df.dropna()
df_.rider.nunique(), df_.stage.nunique()

In [None]:
# Build the collaborative-filtering data loaders from the long-format frame.
# No column names are passed, so this relies on the column order rider / stage / result
# (presumably read by fastai as user / item / rating - confirm).
dls = CollabDataLoaders.from_df(df_, bs=BS)
dls.show_batch()

In [None]:
# Collaborative-filtering learner with N_FACTORS latent factors; predictions are limited to Y_RANGE.
learn = collab_learner(dls, n_factors=N_FACTORS, y_range=Y_RANGE)

In [None]:
# Run the learning-rate finder to help choose the rate used for training below.
learn.lr_find()

In [None]:
# Train for 3 epochs with a maximum learning rate of 0.05 and weight decay 0.1.
learn.fit_one_cycle(3, 0.05, wd=0.1)

In [None]:
# Display the model architecture (the embedding and bias layers read by the extract_* helpers).
learn.model

In [None]:
# Save the trained learner to disk for later reuse.
learn.export("../data/learner.pkl")

## Interpretation

### Bias

In [None]:
# Ten riders with the highest learned bias.
rider_bias = extract_bias(learn, "rider")
[dls.classes["rider"][i] for i in rider_bias.argsort(descending=True)[:10]]  # "best"/most consistent riders across all seasons

In [None]:
# Ten stages/races with the highest learned bias.
race_bias = extract_bias(learn, "stage")
[dls.classes["stage"][i] for i in race_bias.argsort(descending=True)[:10]]

### PCA

In [None]:
# PCA plot of the 20 riders with the most results. `df` still contains the NaN rows,
# which is harmless because plot_pca counts only non-null results.
plot_pca(df, learn, "rider", n_plot=20)

In [None]:
# plot_pca(df, learn, "stage", n_plot=10)

### Similarity

In [None]:
# Spot check: for a handful of well-known riders, list their 7 most similar riders.
spotcheck_riders = [
    "VAN AERT Wout",
    "VAN DER POEL Mathieu",
    "VAN AVERMAET Greg",
    "ALAPHILIPPE Julian",
    "POGAČAR Tadej",
    "EVENEPOEL Remco",
    "MAS Enric",
]
for r in spotcheck_riders:
    similar_riders = extract_most_similar_elements(learn, "rider", r, 7)
    print(r.ljust(20), "|", similar_riders)

In [None]:
# Spot check: for a few races/stages, print the slug followed by its 10 most similar races.
# NOTE(review): "liege-bastogne-liege/2019" has no trailing "/" while the other one-day
# slug ("paris-roubaix/2022/") does. If the slug is not in the vocabulary, the lookup may
# silently fall back to a placeholder entry instead of failing - confirm against the data.
spotcheck_stages = ["paris-roubaix/2022/", "dauphine/2019/stage-6", "liege-bastogne-liege/2019"]
for r in spotcheck_stages:
    print(r, "\n", extract_most_similar_elements(learn, "stage", r, 10), "\n", sep="")