### &#x1F3C5; FIRST TRAINING WITH XGBRANKER

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [87]:
# 'Unnamed: 0' / 'Unnamed: 0.1' look like index columns left behind by earlier
# to_csv round-trips (NOTE(review): confirm). Which of them are present depends
# on how the files were last written, so absent ones are ignored instead of
# raising KeyError and breaking a fresh run of the notebook.
INDEX_ARTIFACTS = ['Unnamed: 0.1', 'Unnamed: 0']
df_train = pd.read_csv('../data/train_10k.csv').drop(columns=INDEX_ARTIFACTS, errors='ignore')
df_val = pd.read_csv('../data/val_10k.csv').drop(columns=INDEX_ARTIFACTS, errors='ignore')

In [97]:
# Persist the cleaned frames. index=False keeps the row index out of the CSV, so
# the files do not gain a fresh 'Unnamed: 0' column on every save/load round-trip.
# NOTE(review): this overwrites the input files in place - confirm that is intended.
df_train.to_csv('../data/train_10k.csv', index=False)
df_val.to_csv('../data/val_10k.csv', index=False)

In [89]:
import xgboost as xgb

In [90]:
from sklearn.metrics import make_scorer

In [91]:
# metric
from bisect import bisect

def count_inversions(a):
    """Count the pairs (i, j) with i < j and a[i] > a[j].

    Args:
        a: iterable of mutually comparable values.

    Returns:
        The number of out-of-order pairs (0 for an empty or non-decreasing input).
    """
    seen_sorted = []  # values consumed so far, kept in ascending order
    total = 0
    for value in a:
        # Insertion point to the right of any equal values: everything at or
        # after it is strictly greater than `value`, i.e. forms an inversion.
        slot = bisect(seen_sorted, value)
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Kendall-tau-style agreement between predicted and true orderings, pooled over instances.

    Args:
        ground_truth: iterable of orderings (e.g. one list of cell ids per notebook).
        predictions: iterable of predicted orderings, paired positionally with
            ``ground_truth``; each should hold the same items as its counterpart.

    Returns:
        ``1 - 4 * inversions / sum(n * (n - 1))``: 1.0 when every prediction
        matches its ground truth, -1.0 when every prediction is fully reversed.

    Raises:
        ValueError: if a predicted item is missing from its ground-truth list.
        ZeroDivisionError: if no instance contains at least two items.
    """
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        try:
            # One pass builds item -> first position, so each lookup is O(1)
            # instead of a linear gt.index() scan per predicted item.
            first_pos = {}
            for pos, item in enumerate(gt):
                first_pos.setdefault(item, pos)
            ranks = [first_pos[x] for x in pred]  # rank predicted order in terms of ground truth
        except (TypeError, KeyError):
            # Unhashable items, or an item absent from gt: use the plain linear
            # scan, which also keeps raising ValueError for unknown items.
            ranks = [gt.index(x) for x in pred]
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

#def kendall_tau_for_xgb(y_pred, dtrain):
    #labels = dtrain.get_label()
    #return 'kendall_tau', kendall_tau(labels, y_pred)

# Wrap the metric as a scikit-learn scorer object.
# NOTE(review): kendall_tau expects paired sequences of orderings (one ordering
# per instance); confirm that a scorer, which is handed y_true and the
# estimator's predictions, actually feeds it data of that shape.
kendall_tau_metric = make_scorer(score_func=kendall_tau)

In [92]:
# XGBRanker reads `group` as the sizes of consecutive row blocks of X. A stable
# sort on the notebook id makes each notebook's rows contiguous and puts them in
# the same order as the group sizes computed below; without it, sizes taken in
# sorted-id order can be applied to rows that are still in file order.
train_sorted = df_train.sort_values('id', kind='stable')

# Features: every column except the target and the two identifier columns.
X_train = train_sorted.drop(columns=['rank', 'id', 'cell_id'])
# Validation rows keep df_val's order so predictions line up with df_val row by row.
X_val = df_val.drop(columns=['rank', 'id', 'cell_id'])

y_train = train_sorted['rank']
y_val = df_val['rank']

# Group sizes for training (one entry per notebook id, in row order)
groups_train = train_sorted.groupby('id', sort=False).size().to_numpy()
# NOTE(review): sizes are in sorted-id order and are not passed to fit() in this
# cell; sort df_val by id first if they are ever used alongside X_val.
groups_val = df_val.groupby('id').size().to_numpy()

model = xgb.XGBRanker(objective='rank:pairwise', min_child_weight=10, subsample=0.5, tree_method='hist')

# Train the model
model.fit(X_train, y_train, group=groups_train)

#### &#x1F3C7; L'entraînement est très rapide car on introduit intentionnellement une fuite de données (data leakage) sur 66 % des données : on fournit le rang des cellules de code.

#### 66% code cells, 34% markdown

In [93]:
# Score every validation row with the trained ranker; the scores are sorted per
# notebook further down to obtain a predicted cell order.
y_pred = model.predict(X_val)

In [94]:
# One row per validation cell: notebook id (as the index), cell id and model score.
predict = (
    df_val[['cell_id', 'id']]
    .assign(pred=y_pred)
    .set_index('id')
)

In [95]:
# Predicted order: per notebook, cell ids sorted by ascending model score.
# Built from df_val and y_pred rather than by re-binding the intermediate
# `predict` frame, so this cell can be re-run without failing on an
# already-grouped Series.
predict = (
    pd.DataFrame({'id': df_val['id'], 'cell_id': df_val['cell_id'], 'pred': y_pred})
    .sort_values(by=['id', 'pred'])
    .groupby('id')['cell_id']
    .apply(list)
)
# True order: the same grouping, sorted by the ground-truth rank.
actual = df_val.sort_values(by=['id', 'rank']).groupby('id')['cell_id'].apply(list)
# groupby returns both Series ordered by id, so zip() inside kendall_tau pairs
# each notebook's true order with its own predicted order.
baseline = kendall_tau(actual, predict)

In [96]:
# Display the validation Kendall tau of the XGBRanker predictions computed above.
baseline

0.5726563188617604

#### &#x1F622; Le score de validation (tau de Kendall ≈ 0,57) est très mauvais