In [None]:
%load_ext jupyter_black

In [None]:
import os
import time
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn import datasets
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor


import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Case study: CatBoost for Ranking

### Ranking problem

* Let $Q = \{q_1, \dots, q_n\}$ be the set of queries
* $D_q = \{d_{q1}, \dots, d_{qm}\}$ -- set of objects retrieved for a group $q$
* $L_q = \{l_{q1}, \dots, l_{qm}\}$ -- relevance labels for the objects from the set $D_q$


### Ranking quality metrics:
* __Precision__
    $$ \mbox{P}=\frac{|\{\mbox{relevant docs}\}\cap\{\mbox{retrieved docs}\}|}{|\{\mbox{retrieved docs}\}|} $$
* __Recall__
    $$ \mbox{R}=\frac{|\{\mbox{relevant docs}\}\cap\{\mbox{retrieved docs}\}|}{|\{\mbox{relevant docs}\}|} $$
    
    The notation $@k$ means that the metric is calculated on the first $k$ documents of the ranking list.

    For example, if 1, 2, 5, 7, 9 are the ranks of the relevant documents among the retrieved ones (numbering starts from 1), then $P@5$ is $\frac{3}{5}$.

* __Mean average precision (MAP)__
    $$\frac{1}{|Q|}\sum_{q \in Q} \frac{1}{|\mbox{relevant docs in } D_q|} \sum_{k} P@k(q) \times rel(q, k) $$
    
    Where $rel(q, k)$ is the relevance label of the document at the $k$-th position in our ranking of $D_q$. This metric calculates the average precision for each query, weighted by the document relevances, and then takes the mean over all queries.
    
* __Discounted cumulative gain (DCG)__
    $$\sum_{k=1}^{m} \frac{2 ^ {l_{qk}}}{\log_2(k+1)}$$
    
    This metric takes user behavior into account: user attention is highest at the top of the list and then decreases nonlinearly towards the end.
    
* __NDCG__ - normalized DCG = DCG $~ / ~$ IDCG, where IDCG is a maximum possible value of DCG with given set of relevance labels.

* __AverageGain__ - represents the average value of the label values for objects with the defined top  label values.
   
More on wiki: https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)

The $@k$ cutoff of every metric can be specified through the metric parameter "top"; for example, "NDCG:top=10" means NDCG@10.

### Download part of [MSRank](https://www.microsoft.com/en-us/research/project/mslr/) dataset from CatBoost datasets storage

In [None]:
from catboost.datasets import msrank_10k

train_df, test_df = msrank_10k()

# Column 0 is used as the relevance label and column 1 as the query (group) id;
# every remaining column is used as a feature.
y_train, queries_train = train_df[0].values, train_df[1].values
X_train = train_df.drop(columns=[0, 1]).values

y_test, queries_test = test_df[0].values, test_df[1].values
X_test = test_df.drop(columns=[0, 1]).values

In [None]:
# Dimensions of the training feature matrix: (documents, features).
X_train.shape

In [None]:
# Relevance label of each training document (column 0 of train_df).
y_train

In [None]:
# Query (group) id of each training document (column 1 of train_df).
queries_train

In [None]:
# Number of documents, number of features, number of distinct queries.
print(*X_train.shape, len(np.unique(queries_train)))

In [None]:
from collections import Counter

# How many training documents carry each relevance label.
Counter(y_train).items()

### Normalize to [0,1]

In [None]:
# Scale the labels so that the largest training relevance becomes 1.
# Out-of-place division is deliberate: `y_train`/`y_test` come from `.values`,
# so `/=` would write into memory that may be shared with the DataFrames, and
# it raises outright when that array is read-only (pandas Copy-on-Write) or
# has an integer dtype.
max_relevance = np.max(y_train)
y_train = y_train / max_relevance
y_test = y_test / max_relevance

### Creation of CatBoost pools

In [None]:
# Pools bundle features, labels and the query id of every document;
# group_id tells CatBoost which documents belong to the same query.
train = cb.Pool(
    data=X_train,
    label=y_train,
    group_id=queries_train,
)

test = cb.Pool(
    data=X_test,
    label=y_test,
    group_id=queries_test,
)

### You can also create pools from files

In [None]:
data_dir = "./msrank"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of testing os.path.exists() before os.makedirs().
os.makedirs(data_dir, exist_ok=True)

train_file = os.path.join(data_dir, "train.csv")
test_file = os.path.join(data_dir, "test.csv")

# Written without index and header so the files hold only the raw columns.
train_df.to_csv(train_file, index=False, header=False)
test_df.to_csv(test_file, index=False, header=False)

In [None]:
# Column description file: one tab-separated "<column index>\t<role>" line per
# special column (here the label and the query id).
description_file = os.path.join(data_dir, "dataset.cd")
with open(description_file, "w") as cd_file:
    cd_file.writelines(["0\tLabel\n", "1\tQueryId\n"])

In [None]:
# Build a pool directly from the CSV file; the column description file marks
# column 0 as the label and column 1 as the query id.
cb.Pool(data=train_file, column_description=description_file, delimiter=",")

### <span style="color:#ce2029">Attention:</span> all objects in dataset must be sorted by group_id

For example, if the dataset consists of five documents 
\[d1, d2, d3, d4, d5\] with corresponding queries \[q1, q2, q2, q1, q2\] then the dataset should look like:

$$\begin{pmatrix}
    d_1, q_1, f_1\\
    d_4, q_1, f_4\\
    d_2, q_2, f_2\\
    d_3, q_2, f_3\\
    d_5, q_2, f_5\\
\end{pmatrix} \hspace{6px} \texttt{or} \hspace{6px}
\begin{pmatrix}
    d_2, q_2, f_2\\
    d_3, q_2, f_3\\
    d_5, q_2, f_5\\
    d_1, q_1, f_1\\
    d_4, q_1, f_4\\
\end{pmatrix}$$

where $f_i$ is feature vector of i-th document.

### RMSE

The first and simplest idea is to try predicting document relevance $l_q$ minimizing RMSE.

$$\sqrt{ \frac{1}{N} \sum_q \sum_{d_{qk}} \left(f(d_{qk}) - l_{qk} \right)^2 }$$

In [None]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a ``CatBoostRanker`` with the given objective and return it.

    Parameters
    ----------
    loss_function : str
        Objective handed to CatBoost; it is also used as the default
        ``train_dir``.
    additional_params : dict, optional
        Extra constructor parameters. They are applied last, so they override
        the defaults set here (including ``train_dir``).
    train_pool, test_pool
        Pools used for fitting and as the evaluation set. The defaults are the
        ``train`` / ``test`` objects that exist when this ``def`` is executed.

    Returns
    -------
    The fitted ranker.
    """
    parameters = {
        "iterations": 2000,
        "custom_metric": [
            "NDCG",
            "MAP:top=10",
            "PrecisionAt:top=1",
            "PrecisionAt:top=10",
        ],
        "verbose": False,
        "random_seed": 0,
        "loss_function": loss_function,
        "train_dir": loss_function,
    }

    # Caller-supplied settings take precedence over everything above.
    if additional_params is not None:
        parameters.update(additional_params)

    ranker = cb.CatBoostRanker(**parameters)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker

Let's train the simplest model and also demonstrate the precision/recall metrics from the introduction.

In [None]:
# Baseline: regress the relevance labels directly with the RMSE objective.
model = fit_model("RMSE")

### Group weights parameter


In [None]:
def create_weights(queries, seed=None):
    """Draw one random weight per query and assign it to each of its documents.

    Parameters
    ----------
    queries : array-like
        Query (group) id of each document.
    seed : int, optional
        When given, the weights are drawn from ``np.random.default_rng(seed)``
        so the result is reproducible. When omitted, NumPy's global random
        state is used.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``queries``; documents that share a query id
        share one weight drawn uniformly from [0, 1).
    """
    queries = np.asarray(queries)
    # `inverse` gives, for every document, the position of its query id among
    # the sorted unique ids -- this replaces one boolean mask per query.
    queries_set, inverse = np.unique(queries, return_inverse=True)
    n_queries = queries_set.shape[0]

    if seed is None:
        queries_weight = np.random.uniform(size=n_queries)
    else:
        queries_weight = np.random.default_rng(seed).uniform(size=n_queries)

    # reshape guards against NumPy versions that return a flattened `inverse`.
    return queries_weight[inverse].reshape(queries.shape)


# Example output: one weight per training document, constant within a query.
create_weights(queries_train)

In [None]:
# Same features and labels as the plain pools, plus a random weight per query.
train_with_weights = cb.Pool(
    data=X_train,
    label=y_train,
    group_id=queries_train,
    group_weight=create_weights(queries_train),
)

test_with_weights = cb.Pool(
    data=X_test,
    label=y_test,
    group_id=queries_test,
    group_weight=create_weights(queries_test),
)

# A separate train_dir keeps this run's logs apart from the unweighted RMSE run.
fit_model(
    loss_function="RMSE",
    additional_params=dict(train_dir="RMSE_weigths"),
    train_pool=train_with_weights,
    test_pool=test_with_weights,
)

### Reducing problem, step 2

Now let's look at an example of document relevance labels:

$$ 
    \begin{align}
    labels(q_1) &= \begin{bmatrix}
           4 \\
           3 \\
           3 \\
           1
         \end{bmatrix},
    labels(q_2) &= \begin{bmatrix}
           2 \\
           1 \\
           1 \\
           0
         \end{bmatrix}
   \end{align}
$$

This means that with RMSE loss function we pay more attention to q1 than q2. 

To avoid this problem we introduce into RMSE a coefficient $c_q$ which depends only on the query (and in fact equals the mean of the difference between prediction and label).

$$\sqrt{ \frac{1}{N} \sum_q \sum_{d_{qk}} \left(f(d_{qk}) - l_{qk} - \color{red}{c_{q}} \right)^2 }$$

In [None]:
# Same setup as the RMSE run, with the per-query-corrected objective.
fit_model("QueryRMSE")

### Reducing problem, step 3

Since the goal of ranking is to predict an ordered list of documents (which can be generated from the given document relevances), the RMSE loss function doesn't take into account the relations between documents: the first is better than the second, the second is better than the third and the fifth, etc.

We can easily bring this information into the loss function by reducing the problem not to regression but to classification of a pair of documents $(d_i, d_j)$ -- is the $i$-th better than the $j$-th or not.

So we minimize the negative loglikelihood:

$$ - \sum_{i,j \in Pairs} \log \left( \frac{1}{1 + e^{-(f(d_i) - f(d_j))}} \right) $$

Methods based on pair comparisons are called __pairwise__; in CatBoostRanker this objective is called __PairLogit__.

There's no need to change the dataset: CatBoost generates the pairs for us. The number of generated pairs is controlled via the parameter max_pairs.

In [None]:
# Pairwise objective; the pairs are derived from the labels inside each query.
fit_model("PairLogit")

### Reducing problem, step 4, ~~LambdaRank~~ YetiRank

The previous loss function treats all pairs equally, regardless of where the two documents end up in the ranking.
The __YetiRank__ method takes this effect into account and generates weights for pairs according to their positions ([paper](https://cache-mskstoredata08.cdn.yandex.net/download.yandex.ru/company/to_rank_challenge_with_yetirank.pdf)).

YetiRank:
$$ - \sum_{i,j \in Pairs} \color{red}{w_{ij}} \log \left( \frac{1}{1 + e^{-(f(d_i) - f(d_j))}} \right) $$


In [None]:
# Pairwise objective with position-dependent pair weights.
fit_model("YetiRank")

### A special case: top-1 prediction with __QuerySoftMax__

$$
- \frac{
\sum_{\text{Group} \in \text{Groups}} \sum_{i \in \text{Group}} w_i t_i \log\left( \frac{w_i e^{\beta a_i}}{\sum_{j \in \text{Group}} w_j e^{\beta a_j}} \right)
}{
\sum_{\text{Group} \in \text{Groups}} \sum_{i \in \text{Group}} w_i t_i
}
$$


In [None]:
def get_best_documents(labels, queries):
    """Flag the most relevant document of every query.

    Returns a float array shaped like ``labels`` that holds 1.0 at the position
    of each query's highest label (the first such position on ties) and 0.0
    everywhere else.
    """
    best_position = {}
    for position, query in enumerate(queries):
        incumbent = best_position.get(query)
        # Strict comparison: an equal label never displaces the earlier document.
        if incumbent is None or labels[position] > labels[incumbent]:
            best_position[query] = position

    top_flags = np.zeros(shape=labels.shape)
    top_flags[np.fromiter(best_position.values(), dtype=int)] = 1.0
    return top_flags


# Example output: 1.0 marks the top-labelled document of each test query.
get_best_documents(y_test, queries_test)

In [None]:
# Binary targets: 1.0 for the best document of each query, 0.0 otherwise.
best_docs_train = get_best_documents(y_train, queries_train)
best_docs_test = get_best_documents(y_test, queries_test)

# Note: these names are rebound here to pools built on the binary targets.
train_with_weights = cb.Pool(
    data=X_train,
    label=best_docs_train,
    group_weight=create_weights(queries_train),
    group_id=queries_train,
)

test_with_weights = cb.Pool(
    data=X_test,
    label=best_docs_test,
    group_weight=create_weights(queries_test),
    group_id=queries_test,
)

fit_model(
    loss_function="QuerySoftMax",
    train_pool=train_with_weights,
    test_pool=test_with_weights,
)

### Step 4.1

As in step 3.1 __YetiRankPairwise__ is slower than __YetiRank__, but gives more accurate results.

In [None]:
# Pairwise variant of the YetiRank objective.
fit_model("YetiRankPairwise")

In [None]:
# Compare the metrics logged in each run's train_dir.
# MetricVisualizer is reached through the imported `cb` alias because the bare
# name is not imported anywhere in the visible cells.
# NOTE(review): no visible cell trains "PairLogitPairwise" -- confirm that the
# directory exists, or drop it from the list.
widget = cb.MetricVisualizer(
    [
        "RMSE",
        "QueryRMSE",
        "PairLogit",
        "PairLogitPairwise",
        "YetiRank",
        "YetiRankPairwise",
    ]
)
widget.start()

Look at the NDCG metric of the YetiRank method $-$ the model is underfitted.

In [None]:
# Retrain with a larger learning rate; a separate train_dir keeps both runs
# available for comparison.
fit_model(
    "YetiRank",
    {
        "train_dir": "YetiRank-lr-0.3",
        "learning_rate": 0.3,
    },
)

In [None]:
# Compare the two YetiRank runs by their train_dir names.
# MetricVisualizer is reached through the imported `cb` alias because the bare
# name is not imported anywhere in the visible cells.
widget = cb.MetricVisualizer(["YetiRank", "YetiRank-lr-0.3"])
widget.start()

### Additional parameters

__Metric period__

The period, in iterations, at which metrics are calculated. Increasing this parameter can speed up the training process.

In [None]:
# Evaluate metrics only every 50 iterations.
fit_model("YetiRank", {"metric_period": 50})

__Task type__

You can significantly speed up the training procedure by switching to GPU.

In [None]:
# Left disabled on purpose; uncomment to train on a GPU.
# NOTE(review): presumably requires a CatBoost-supported GPU -- confirm.
# fit_model('YetiRank', {'task_type': 'GPU'})

----
This notebook is heavily based on official [catboost tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/ranking/ranking_tutorial.ipynb).