# Compute embeddings from race ranking data

Here's the next step! The algorithm finds hidden factors (called embeddings) that summarize the profile of each rider and each race. Inspired by this [Kaggle notebook](https://www.kaggle.com/code/sborms/collaborative-filtering-deep-dive).

A script version of part of this notebook is in `scripts/train.py`.

In [None]:
# Banner using ANSI escape sequences: "\33" is the octal escape for ESC,
# "[1m" switches to bold, "[33m" to yellow and "[0m" resets the styling.
# The literal has no placeholders, so a plain string is used instead of an f-string.
print("\33[1m\33[33mLet's go\33[0m!")

## Imports

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fastai.collab import *
from fastai.tabular.all import *
from torch import nn

sys.path.append("../")
from src.utils import *

## Functions

## Config

In [None]:
# NOTE(review): CURR_YEAR is not referenced in the visible cells of this notebook —
# presumably kept in sync with scripts/train.py; confirm before removing.
CURR_YEAR = 2023
BS = 64  # batch size handed to the dataloaders
NORMALIZE_HOW = "bins"  # normalization scheme passed to normalize_results_by_race: "0-1", "1-20" or "bins"
MIN_N_PARTICIPATIONS = 20  # a rider is kept only if they have at least this many race results
N_FACTORS = 10  # number of hidden factors (embedding size) passed to collab_learner
# Base range depends on NORMALIZE_HOW: (0, 1) for "0-1", (1, 20.5) for "1-20", (0, 5.25) for "bins";
# the upper bound is then multiplied by the maximum race class weight (2 here).
Y_RANGE = (0, 5.25 * 2)  # keep in sync with NORMALIZE_HOW when changing the scheme

## Compute embeddings

In [None]:
# Load the race results table. The first three CSV columns become the row
# MultiIndex, and the year / stage_slug / class columns are read as strings.
index_col_dtypes = dict.fromkeys(("year", "stage_slug", "class"), str)
df_results = pd.read_csv(
    "../data/df_race_results.csv",
    index_col=[0, 1, 2],
    dtype=index_col_dtypes,
)

In [None]:
# Keep only the rider columns with at least MIN_N_PARTICIPATIONS non-missing results.
n_results_per_rider = df_results.count(axis=0)
eligible_riders = df_results.columns[n_results_per_rider >= MIN_N_PARTICIPATIONS]
df_results = df_results[eligible_riders]

In [None]:
# Spot check before normalization: select the column(s) whose name matches
# "VAN AERT Wout", drop the races without a result and show the first rows.
df_results.filter(regex="VAN AERT Wout").dropna().head()

In [None]:
# Normalize the results per race with the configured scheme, then cast every
# column to float so the later weighting is purely numeric.
normalized = normalize_results_by_race(df_results, how=NORMALIZE_HOW)
df_results = normalized.astype(float)

In [None]:
# Same spot check after normalization, to compare against the raw values.
df_results.filter(regex="VAN AERT Wout").dropna().head()

In [None]:
# Turn the row MultiIndex into plain columns (one row per race, default integer
# index) so weights can be derived from the index levels.
df_reweight = df_results.index.to_frame(index=False)

In [None]:
# Build one weight per race as the product of four factors, each derived from
# an index level through a helper from src.utils.
slug = df_reweight["stage_slug"]
race_class = df_reweight["class"]

# Boolean flags fed to the stage / gc weight helpers.
stage_mask = slug.str.contains("/stage-")
# NOTE(review): contains("2") is a substring test, so a class such as "1.2"
# would match too — confirm whether "class starts with 2" is what is meant.
gc_mask = race_class.str.contains("2") & slug.str.endswith("/")

df_reweight["w_year"] = df_reweight["year"].astype(int).apply(get_year_weight)
# Only the part of the class after the first "." is passed to the class weight helper.
df_reweight["w_class"] = race_class.str.partition(".")[2].apply(get_race_class_weight)
df_reweight["w_stage"] = stage_mask.apply(get_stage_weight)
df_reweight["w_gc"] = gc_mask.apply(get_gc_weight)

# Final weight: product of the individual factors.
df_reweight["w"] = (
    df_reweight["w_year"]
    * df_reweight["w_class"]
    * df_reweight["w_stage"]
    * df_reweight["w_gc"]
)

# Restore the same three-level index that df_results uses.
df_reweight = df_reweight.set_index(["year", "stage_slug", "class"])

In [None]:
# Show the distinct final weights, rounded to two decimals, in ascending order.
distinct_weights = df_reweight["w"].unique()
print(sorted(np.round(distinct_weights, 2)))

In [None]:
# Scale every race (row) by its weight. The weights are shaped as a column
# vector so they broadcast across all rider columns. This is a positional
# multiplication: it relies on df_reweight having the same row order as
# df_results, which holds because df_reweight was built from df_results.index.
race_weights = df_reweight["w"].to_numpy()[:, np.newaxis]
df_results.loc[:, :] = df_results.to_numpy() * race_weights

In [None]:
# Spot check after weighting: five randomly drawn races with a result.
# sample() is unseeded here, so the rows shown differ between runs.
df_results.filter(regex="VAN AERT Wout").dropna().sample(5)

In [None]:
# Reshape the wide race x rider table into long format: one row per
# (rider, stage) pair. Year and class are dropped; only the stage slug
# identifies the race from here on.
df = (
    df_results.reset_index(drop=False)
    .drop(columns=["year", "class"])
    .melt(id_vars="stage_slug")
    .rename(columns={"stage_slug": "stage", "variable": "rider", "value": "result"})
)
# Column order matters downstream: rider = user, stage (race) = item, result = rating.
df = df[["rider", "stage", "result"]]

In [None]:
# Keep only the (rider, stage) pairs with an actual result, then report how
# many distinct riders and stages remain.
df_ = df.dropna()
df_["rider"].nunique(), df_["stage"].nunique()

In [None]:
# Build the collaborative-filtering dataloaders from the long table, whose
# columns are ordered (rider, stage, result), i.e. user, item, rating.
dls = CollabDataLoaders.from_df(df_, bs=BS)
# Display a few example rows of a batch as a sanity check.
dls.show_batch()

In [None]:
# Create the collaborative-filtering learner with N_FACTORS hidden factors;
# y_range=Y_RANGE is the interval the predicted ratings are kept within.
learn = collab_learner(dls, n_factors=N_FACTORS, y_range=Y_RANGE)

In [None]:
# Run the learning-rate finder with four suggestion functions; the resulting
# suggestions are kept in lrs (lrs.valley is the one used for training).
lrs = learn.lr_find(suggest_funcs=(minimum, steep, valley, slide))

In [None]:
# Train for 5 epochs with the one-cycle policy, using the "valley" learning
# rate suggestion and a weight decay of 0.1.
learn.fit_one_cycle(5, lrs.valley, wd=0.1)

In [None]:
# Display the trained model's architecture.
learn.model

In [None]:
# Persist the trained learner for later reuse.
# NOTE(review): the file name is presumably resolved relative to learn.path — confirm
# that this ends up in the repository's data/ folder.
learn.export("../data/learner.pkl")

## Interpretation

### Bias

In [None]:
# Rank riders by their learned bias term and list the ten highest:
# the "best"/most consistent riders across all seasons.
rider_bias = extract_bias(learn, "rider")
rider_names = dls.classes["rider"]
top_rider_idxs = rider_bias.argsort(descending=True)[:10]
[rider_names[i] for i in top_rider_idxs]

In [None]:
# Rank stages (races) by their learned bias term and list the ten highest.
race_bias = extract_bias(learn, "stage")
stage_names = dls.classes["stage"]
top_stage_idxs = race_bias.argsort(descending=True)[:10]
[stage_names[i] for i in top_stage_idxs]

### PCA

In [None]:
# Plot a PCA projection of the rider embeddings for 20 riders.
# NOTE(review): this passes df (rows without a result included) rather than the
# NaN-free df_ used for training — confirm that is what plot_pca expects.
plot_pca(df, learn, "rider", n_plot=20)

In [None]:
# plot_pca(df, learn, "stage", n_plot=10)

### Similarity

In [None]:
# For a handful of well-known riders, print the 7 riders whose embeddings are
# most similar. Names must match the rider column names exactly; "POGAČAR" was
# previously stored as UTF-8 mojibake, which cannot match the real rider name.
spotcheck_riders = [
    "VAN AERT Wout",
    "VAN DER POEL Mathieu",
    "VAN AVERMAET Greg",
    "ALAPHILIPPE Julian",
    "POGAČAR Tadej",
    "EVENEPOEL Remco",
    "MAS Enric",
]
for r in spotcheck_riders:
    # Left-pad the name to 20 characters so the "|" separators line up.
    print(r.ljust(20), "|", extract_most_similar_elements(learn, "rider", r, 7))

In [None]:
# For a few reference races, print the 10 stages whose embeddings are most
# similar: the stage slug on one line, its neighbours on the next, then a
# blank line.
spotcheck_stages = [
    "paris-roubaix/2022/result",
    "dauphine/2022/stage-6/result",
    "liege-bastogne-liege/2022/result",
]
for stage_slug in spotcheck_stages:
    similar_stages = extract_most_similar_elements(learn, "stage", stage_slug, 10)
    print(stage_slug, "\n", similar_stages, "\n", sep="")