# Stage 5 (3.3 catboost_2_stage_model_recs)

# Импортируем библиотеки

In [1]:
import warnings


# ----------------
# Data processing
# ----------------

import numpy as np
import polars as pl

from tqdm import tqdm

# ---------------------
# RecSys models imports
# ---------------------

from catboost import CatBoostRanker, Pool


# -------------------
# Utils
# -------------------
from utils import get_catboost_group_id, add_score_and_rank


# Silence every warning category for the rest of the session (keeps cell outputs clean,
# but also hides deprecation notices from the libraries imported above).
warnings.filterwarnings("ignore")

# Seed constant. NOTE(review): it is not referenced in the cells visible in this
# notebook — confirm whether it is still needed.
RANDOM_STATE = 42

# Импортируем пути

In [2]:
# Input data folder and saved-models folder, relative to the notebook.
data_path = "../data/closed/"
models_path = "../data/models/"
# Sub-folder of the models folder; the per-batch ranker predictions are written here.
candidates_data_path = f"{models_path}candidates_data/"


# CatBoost Ranker


In [3]:
# Column lists used by the ranker.
# The model inputs exclude the target, the id columns and the data
# on which the first-level models were trained.

# Numerical model inputs.
NUMERICAL_FEATURES = [
    # <source>_score / <source>_rank pairs, in the order cos, bm25, tfidf, lfm
    *(
        f"{source}_{kind}"
        for source in ("cos", "bm25", "tfidf", "lfm")
        for kind in ("score", "rank")
    ),
    # user_* columns
    "user_hist",
    "user_avg_pop",
    "user_last_pop",
    # item_* columns
    "item_pop",
    "item_avg_hist",
    # title / description length columns
    "title_len",
    "descr_len",
    "title_word_len",
    "descr_word_len",
    # txt_emb_pca_0 .. txt_emb_pca_9
    *(f"txt_emb_pca_{component}" for component in range(10)),
    # img_emb_pca_0 .. img_emb_pca_9
    *(f"img_emb_pca_{component}" for component in range(10)),
]

# Categorical model inputs (passed to CatBoost as cat_features).
CATEGORIAL_FEATURES = [
    "brand",
    "color",
    "closure",
    "country",
    "cut",
    "height",
    "length",
    "material",
    "model",
    "neckline",
    "pattern",
    "pocket",
    "purpose",
    "sleeve",
]

# Full list of model inputs: numerical columns first, then categorical ones.
FEATURES = NUMERICAL_FEATURES + CATEGORIAL_FEATURES

# Columns kept in the saved prediction files for later analysis.
FEATURES_FOR_ANALYSIS = [
    "user_id",
    "item_id",
    "user_hist",
    "user_avg_pop",
    "user_last_pop",
    "title_len",
    "descr_len",
    "title_word_len",
    "descr_word_len",
    *CATEGORIAL_FEATURES,
    "item_pop",
    "item_avg_hist",
    # The next two columns are added after prediction,
    # once the model's score and rank have been calculated.
    "listwise_hybrid_score",
    "listwise_hybrid_rank",
]

## Load 

In [4]:
# Load the trained listwise ranker from disk.
# load_model() fills the freshly created instance in place.
ranker_model = CatBoostRanker()
ranker_model.load_model(models_path + "CB_ranker_model_best_4.cbm")

In [5]:
# Display the parameters stored in the loaded model to confirm which configuration it carries.
ranker_model.get_params()

{'max_leaves': 40,
 'bootstrap_type': 'Bernoulli',
 'od_wait': 32,
 'verbose': 16,
 'iterations': 750,
 'custom_metric': ['NDCG:top=10', 'NDCG:top=15', 'NDCG:top=5', 'NDCG:top=3'],
 'grow_policy': 'Lossguide',
 'loss_function': 'YetiRank',
 'l2_leaf_reg': 1.5,
 'subsample': 0.9,
 'task_type': 'GPU',
 'depth': 8,
 'min_data_in_leaf': 124,
 'learning_rate': 0.195,
 'random_seed': 42}

## Выдаем рекомендации


In [6]:
# Lazily scan the ranker dataset; rows are only materialised when a query is collected.
ranker_data_bNr = pl.scan_parquet(f"{data_path}ranker_data_bNr.parquet")


In [7]:
# Number of user batches; each batch is scored and saved separately.
n_splits = 5

# Unique user ids, sorted. LazyFrame.unique() does not guarantee row order, so
# without the sort the composition of every batch (and therefore the contents of
# each per-batch prediction file) could differ from run to run.
unique_user_ids = (
    ranker_data_bNr.select("user_id")
    .unique()
    .sort("user_id")
    .collect()
    .get_column("user_id")
    .to_numpy()
)

# np.array_split tolerates a user count that is not divisible by n_splits.
batches = np.array_split(unique_user_ids, n_splits)

In [8]:
# Score the candidates batch by batch and store one parquet file per batch.
for i, batch_user_ids in enumerate(tqdm(batches)):

    # Lazy query selecting the rows of the users in this batch; it is reused
    # for the feature matrix, the group ids and the final scored frame.
    batch_lf = ranker_data_bNr.filter(pl.col("user_id").is_in(batch_user_ids))

    # Build the CatBoost pool for the batch
    test_pool = Pool(
        data=batch_lf.select(FEATURES).collect().to_pandas(),
        group_id=get_catboost_group_id(batch_lf),
        cat_features=CATEGORIAL_FEATURES,
    )

    # Ranker scores, one per row of the pool
    y_pred: np.ndarray = ranker_model.predict(test_pool)

    # Attach scores and ranks, keep the analysis columns and save the batch
    batch_scored = add_score_and_rank(
        df=batch_lf,
        y_pred_scores=y_pred,
        name="listwise",
    )
    batch_scored.select(FEATURES_FOR_ANALYSIS).write_parquet(
        candidates_data_path + f"CB_ranker_predictions_bNr_{i}.parquet"
    )

100%|██████████| 5/5 [01:53<00:00, 22.76s/it]


2 min 9 s for 540K users with batches of 54K users (~1.5 GB RAM)

1 min 50 s for 540K users with batches of 108K users (~3 GB RAM)
