In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

---

Загружу данные и посмотрю на них

In [2]:
df = pd.read_csv('intern_task.csv')
df.head()

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0


In [3]:
df.shape

(235258, 146)

Проверю, есть ли пропуски

In [4]:
df.isnull().sum().unique()

array([0], dtype=int64)

Пропусков нет - приятно. Посмотрю на множества значения признаков

In [5]:
for col in df.columns:
    print(f"{col}:", len(df[col].unique()))

rank: 5
query_id: 2000
feature_0: 16
feature_1: 10
feature_2: 15
feature_3: 10
feature_4: 16
feature_5: 44
feature_6: 25
feature_7: 32
feature_8: 235258
feature_9: 44
feature_10: 5124
feature_11: 156
feature_12: 281
feature_13: 60
feature_14: 5174
feature_15: 1903
feature_16: 1790
feature_17: 1809
feature_18: 1813
feature_19: 1901
feature_20: 235258
feature_21: 54
feature_22: 73
feature_23: 13
feature_24: 532
feature_25: 215
feature_26: 26
feature_27: 24
feature_28: 10
feature_29: 216
feature_30: 387
feature_31: 35
feature_32: 60
feature_33: 11
feature_34: 386
feature_35: 235258
feature_36: 125
feature_37: 153
feature_38: 53
feature_39: 1464
feature_40: 8204
feature_41: 226
feature_42: 290
feature_43: 59
feature_44: 8696
feature_45: 47103
feature_46: 555
feature_47: 700
feature_48: 235258
feature_49: 50128
feature_50: 22222
feature_51: 298
feature_52: 360
feature_53: 77
feature_54: 24042
feature_55: 37626
feature_56: 435
feature_57: 563
feature_58: 111
feature_59: 39618
feature_60: 391

Вижу, что многие признаки категориальные (ничего делать не нужно, так как я буду использовать CatBoost), а также есть признаки с единственным значением - уберу их

In [6]:
for col in df.columns:
    if len(df[col].unique()) == 1:
        df.drop([col], axis=1, inplace=True)

Удалю уникальные 'query_id', если такие есть

In [7]:
def drop_unique_queries(df):
    group_sizes = df.groupby('query_id').size()
    valid_groups = group_sizes[group_sizes > 1].index
    
    return df[df['query_id'].isin(valid_groups)]

In [8]:
df = drop_unique_queries(df)
df.shape

(235255, 142)

Нашлось целых три - и на том спасибо. Также отмечу, что не нужны ни нормализация, ни стандартизация, так как CatBoost под капотом использует градиентный бустинг на решающих деревьях, которые устойчивы к различиям в масштабе признаков.

Теперь можно перейти к обучению модели. Для начала разделю выборки на обучающую и тестовую (валидационной не будет, потому что я не знаком с гиперпараметрами CatBoostRanker и не смогу тюнить их умнее, чем перебором по сетке, что было бы долго и неинтересно)

In [9]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=26)

# группировка объектов по сессиям:
train_df.sort_values(by=['query_id'], inplace=True)
test_df.sort_values(by=['query_id'], inplace=True)

X_train, y_train, queries_train = train_df.drop(['rank', 'query_id'], axis=1), train_df['rank'], train_df['query_id']
X_test, y_test, queries_test = test_df.drop(['rank', 'query_id'], axis=1), test_df['rank'], test_df['query_id']

Возможно здесь тоже получились уникальные 'query_id'. Если их будет немного (менее 0.1%), то просто удалю их (не стану придумывать сложных методов для разделения данных)

In [10]:
print(train_df.shape[0], test_df.shape[0])

train_df = drop_unique_queries(train_df)
test_df = drop_unique_queries(test_df)

print(train_df.shape[0], test_df.shape[0])

188204 47051
188203 47032


Построю модель

In [11]:
train_pool = Pool(
    data=X_train.values,
    label=y_train.values,
    group_id=queries_train.values
)

test_pool = Pool(
    data=X_test.values,
    label=y_test.values,
    group_id=queries_test.values
)

In [12]:
model = CatBoostRanker()
model.fit(train_pool)
predictions = model.predict(test_pool)

0:	total: 382ms	remaining: 6m 21s
1:	total: 614ms	remaining: 5m 6s
2:	total: 849ms	remaining: 4m 42s
3:	total: 1.08s	remaining: 4m 30s
4:	total: 1.32s	remaining: 4m 21s
5:	total: 1.55s	remaining: 4m 16s
6:	total: 1.77s	remaining: 4m 11s
7:	total: 2.02s	remaining: 4m 10s
8:	total: 2.24s	remaining: 4m 6s
9:	total: 2.46s	remaining: 4m 3s
10:	total: 2.68s	remaining: 4m 1s
11:	total: 2.92s	remaining: 4m
12:	total: 3.15s	remaining: 3m 59s
13:	total: 3.37s	remaining: 3m 57s
14:	total: 3.61s	remaining: 3m 56s
15:	total: 3.84s	remaining: 3m 56s
16:	total: 4.07s	remaining: 3m 55s
17:	total: 4.3s	remaining: 3m 54s
18:	total: 4.52s	remaining: 3m 53s
19:	total: 4.75s	remaining: 3m 52s
20:	total: 5s	remaining: 3m 52s
21:	total: 5.24s	remaining: 3m 52s
22:	total: 5.47s	remaining: 3m 52s
23:	total: 5.76s	remaining: 3m 54s
24:	total: 6.02s	remaining: 3m 54s
25:	total: 6.25s	remaining: 3m 54s
26:	total: 6.49s	remaining: 3m 53s
27:	total: 6.72s	remaining: 3m 53s
28:	total: 6.96s	remaining: 3m 53s
29:	tot

Посчитаю NDCG@5

In [16]:
def calculate_ndcg5(true_relevance, predictions, groups):
    unique_groups = np.unique(groups)
    score, count = 0, 0
    
    for group in unique_groups:
        group_mask = groups == group
        if len(true_relevance[group_mask]) > 1:
            count += 1
            score += ndcg_score([true_relevance[group_mask]], [predictions[group_mask]])
    
    return score / count

In [17]:
print("NDCG@5 score:", calculate_ndcg5(y_test, predictions, queries_test))

NDCG@5 score: 0.7366217489349192
