# Stage 5 (3.3 catboost_2_stage_model_recs)

# Импортируем библиотеки

In [1]:
import warnings


# ----------------
# Data processing
# ----------------
import dill

import numpy as np
import pandas as pd

from tqdm.auto import tqdm


# ---------------------
# RecSys models imports
# ---------------------

from catboost import CatBoostRanker, Pool


# -------------------
# Utils
# -------------------
from utils import get_catboost_group_id, add_score_and_rank


# Install a blanket "ignore" filter so no warning category is shown
# for the rest of the notebook.
warnings.simplefilter("ignore")

# Fixed random state constant.
RANDOM_STATE = 42

# Импортируем пути

In [2]:
# Directory the ranker input data is read from. The trailing slash is
# required: file paths are built from it by plain string concatenation.
data_path = "../data_closed/"

In [3]:
# Directory the serialized model is read from. The trailing slash is
# required: file paths are built from it by plain string concatenation.
models_path = "../models/"

In [4]:
# Sub-directory of models_path where the per-batch ranker predictions are written.
candidates_data_path = f"{models_path}candidates_data/"

# CatBoost Ranker


In [5]:
# Categorical columns; passed to CatBoost as `cat_features` and cast to int
# before building the pool.
CATEGORIAL_FEATURES = [
    "brand",
    "color",
    "closure",
    "country",
    "cut",
    "height",
    "length",
    "material",
    "model",
    "neckline",
    "pattern",
    "pocket",
    "purpose",
    "sleeve",
]

# Column groups shared by the feature lists below.
_FIRST_STAGE_FEATURES = [
    f"{prefix}_{suffix}"
    for prefix in ("cos", "bm25", "tfidf", "lfm")
    for suffix in ("score", "rank")
]
_USER_FEATURES = ["user_hist", "user_avg_pop", "user_last_pop"]
_TEXT_LENGTH_FEATURES = ["title_len", "descr_len", "title_word_len", "descr_word_len"]
_ITEM_STAT_FEATURES = ["item_pop", "item_avg_hist"]
_TEXT_PCA_FEATURES = [f"txt_emb_pca_{idx}" for idx in range(10)]
_IMAGE_PCA_FEATURES = [f"img_pca_{idx}" for idx in range(10)]

# Model inputs: the target, the ids and the data the first-level models
# were trained on are left out.
FEATURES = [
    *_FIRST_STAGE_FEATURES,
    *_USER_FEATURES,
    *_TEXT_LENGTH_FEATURES,
    *_TEXT_PCA_FEATURES,
    *CATEGORIAL_FEATURES,
    *_IMAGE_PCA_FEATURES,
    *_ITEM_STAT_FEATURES,
]

# Columns kept in the dumped per-batch frames.
# The commented-out names in the original list were:
#   "listwise_hybrid_score", "listwise_hybrid_rank"
# (to be added after prediction and calculation of the model's rank and score).
FEATURES_FOR_ANALYSIS = [
    "user_id",
    "item_id",
    *_USER_FEATURES,
    *_TEXT_LENGTH_FEATURES,
    *CATEGORIAL_FEATURES,
    *_ITEM_STAT_FEATURES,
]

## Load 

In [15]:
# Restore the trained ranker model from disk.
# NOTE: dill (like pickle) can execute arbitrary code while loading —
# only open files from a trusted source.
with open(f"{models_path}CB_ranker_model.dill", "rb") as model_file:
    model = dill.load(model_file)

# Read the ranker training frame and keep only its unique item ids;
# they are used later to filter the test candidates.
with open(f"{data_path}ranker_train.dill", "rb") as train_file:
    valid_items = dill.load(train_file)["item_id"].unique()

In [7]:
# Display the parameters of the loaded model as a sanity check.
model.get_params()

{'iterations': 3500,
 'learning_rate': 0.07,
 'loss_function': 'YetiRank',
 'verbose': 16,
 'custom_metric': ['NDCG:top=10', 'NDCG:top=15', 'NDCG:top=5', 'NDCG:top=3'],
 'task_type': 'GPU',
 'max_depth': 6,
 'random_state': 42,
 'reg_lambda': 0.9,
 'objective': 'YetiRank',
 'early_stopping_rounds': 32,
 'grow_policy': 'Lossguide',
 'min_child_samples': 100,
 'num_leaves': 24}

## Выдаем рекомендации


In [18]:
# Score every test batch with the loaded ranker and dump the result.
# The batch files are addressed by the numeric suffix in their file name.
for i in tqdm([0, 20, 40, 60, 80]):

    # Load data; the file is only held open for the read itself.
    # NOTE: dill (like pickle) can execute arbitrary code while loading —
    # only open files from a trusted source.
    with open(data_path + f"ranker_test_bNr_{i}.dill", "rb") as f:
        ranker_test = dill.load(f)

    # Keep only candidates whose item is present in the ranker training data
    # and cast the categorical columns to int. `astype` returns a new frame,
    # so nothing is written into a filtered slice of the loaded frame.
    ranker_test = ranker_test[ranker_test["item_id"].isin(valid_items)].astype(
        {cat: int for cat in CATEGORIAL_FEATURES}
    )

    # create test pool
    test_pool = Pool(
        data=ranker_test[FEATURES],
        group_id=get_catboost_group_id(ranker_test),
        cat_features=CATEGORIAL_FEATURES,
    )

    # Get predictions
    y_pred: np.ndarray = model.predict(test_pool)

    # Add scores and ranks. Remember which columns existed before the call so
    # the columns added by the helper can be identified without hard-coding
    # their names.
    columns_before = set(ranker_test.columns)
    ranker_test = add_score_and_rank(ranker_test, y_pred, "listwise")
    prediction_columns = [
        col for col in ranker_test.columns if col not in columns_before
    ]

    # Dump listwise_model predictions for users. The columns produced by
    # add_score_and_rank are written together with FEATURES_FOR_ANALYSIS;
    # selecting FEATURES_FOR_ANALYSIS alone would drop the predictions that
    # were just computed.
    with open(candidates_data_path + f"CB_ranker_predictions_bNr_{i}.dill", "wb") as f:
        dill.dump(ranker_test[FEATURES_FOR_ANALYSIS + prediction_columns], f)

  0%|          | 0/4 [00:00<?, ?it/s]