# Stage 5 (3.1 bNr_1_stage_models)

In [1]:
import warnings


# ----------------
# Data processing
# ----------------
import dill

import numpy as np
import pandas as pd


from tqdm.auto import tqdm


# ---------------------
# RecSys models imports
# ---------------------
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    LightFMWrapperModel,
    implicit_knn,
)
from implicit import nearest_neighbours


# Suppress every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries imported above — consider narrowing it.
warnings.filterwarnings("ignore")

# Seed constant; presumably meant to be passed to stochastic models — TODO confirm.
RANDOM_STATE = 42

## Data paths

In [2]:
# Directory (relative to the notebook) that holds the input .dill data files
data_path = "../data_closed/"

In [3]:
# Directory (relative to the notebook) where fitted models are stored as .dill files
models_path = "../models/"

In [4]:
# Sub-directory of models_path that stores the per-model candidate tables
candidates_data_path = models_path + "candidates_data/"

# Load Data & create Rectools Dataset

Выборки по пользователям (USER_ID), участвующим в этапе обучения моделей первого уровня

In [5]:
# Users from test_df who will receive targeted recommendations
with open(data_path + "bNr2t_users.dill", "rb") as users_file:
    bNr2t_users = dill.load(users_file)

# Display the loaded ids together with their shape
bNr2t_users, bNr2t_users.shape

(array([4194304, 2097153, 3145730, ..., 3145714, 3145717, 2097143]), (540967,))

In [6]:
# Columns taken from both source tables
interaction_columns = ["user_id", "item_id", "dt", "cum_weight"]

# Source column name -> column name used by RecTools
rectools_column_names = {
    "user_id": Columns.User,
    "item_id": Columns.Item,
    "dt": Columns.Datetime,
    "cum_weight": Columns.Weight,
}

with (
    # Data table for the first-level models
    open(data_path + "base_models_data.dill", "rb") as base_f,
    # Data table for the ranker
    open(data_path + "ranker_data.dill", "rb") as ranker_f,
):
    # Build the interactions dataset from both tables stacked row-wise
    current_dataset = RTDataset.construct(
        interactions_df=pd.concat(
            [dill.load(source)[interaction_columns] for source in (base_f, ranker_f)],
            axis=0,
        ).rename(columns=rectools_column_names)
    )

# Обучение моделей первого уровня для обучения ранкера

## Rectools Implicit


In [7]:
# Neighbourhood size shared by the three item-KNN models below
N_NEIGHBOURS = 50

# Item-KNN with cosine similarity
knn_impl_cos_k50 = implicit_knn.ImplicitItemKNNWrapperModel(
    model=nearest_neighbours.CosineRecommender(K=N_NEIGHBOURS),
)

# Item-KNN with BM25 weighting
knn_impl_bm25_k50 = implicit_knn.ImplicitItemKNNWrapperModel(
    model=nearest_neighbours.BM25Recommender(K=N_NEIGHBOURS),
)

# Item-KNN with TF-IDF weighting
knn_impl_tfidf_k50 = implicit_knn.ImplicitItemKNNWrapperModel(
    model=nearest_neighbours.TFIDFRecommender(K=N_NEIGHBOURS),
)

In [8]:
# Fit every item-KNN model on the combined interactions dataset.
# `%clear` was dropped: it is a terminal magic and, inside a notebook, only
# writes raw escape codes into the saved cell output.
for knn_model in (knn_impl_cos_k50, knn_impl_bm25_k50, knn_impl_tfidf_k50):
    knn_model.fit(current_dataset)

[H[2J

In [9]:
# Save models: one .dill file per fitted item-KNN model
knn_models_by_file = {
    "knn_impl_cos_k50.dill": knn_impl_cos_k50,
    "knn_impl_bm25_k50.dill": knn_impl_bm25_k50,
    "knn_impl_tfidf_k50.dill": knn_impl_tfidf_k50,
}

for file_name, fitted_model in knn_models_by_file.items():
    with open(models_path + file_name, "wb") as model_file:
        dill.dump(fitted_model, model_file)

## Rectools LightFM

In [10]:
# Define the LightFM model (WARP loss, 64 latent components) wrapped for RecTools
lfm_model = LightFMWrapperModel(
    LightFM(
        no_components=64,
        learning_rate=0.1,
        loss="warp",
        max_sampled=7,
        # Seed the model with the notebook-wide constant so repeated runs start
        # from the same random state.
        # NOTE(review): with num_threads > 1 training may still not be
        # bit-for-bit reproducible — confirm against the LightFM docs.
        random_state=RANDOM_STATE,
    ),
    epochs=20,
    num_threads=6,
    verbose=1,
)

In [11]:
# Train LightFM on the combined interactions dataset.
# The trailing semicolon keeps any returned value out of the cell output;
# `%clear` was dropped because, inside a notebook, it only writes raw terminal
# escape codes into the saved output.
lfm_model.fit(dataset=current_dataset);

Epoch: 100%|██████████| 20/20 [09:25<00:00, 28.29s/it]

[H[2J




In [12]:
# Persist the fitted LightFM wrapper next to the KNN models
with open(models_path + "lfm_model.dill", "wb") as model_file:
    dill.dump(lfm_model, model_file)

## Получим кандидатов для переранжирования ранкером

In [None]:
# Same value as in the "Data paths" section.
# NOTE(review): presumably repeated so this section can be resumed after a
# kernel restart — confirm, or drop the duplicate.
models_path = "../models/"

In [11]:
# Same value as in the "Data paths" section (duplicate definition).
candidates_data_path = models_path + "candidates_data/"

### Cosine Recommender

In [12]:
# Cosine Recommender: reload the fitted model from disk
with open(models_path + "knn_impl_cos_k50.dill", "rb") as model_file:
    knn_impl_cos_k50 = dill.load(model_file)

In [None]:
# Candidates from the cosine item-KNN model, with model-specific column names
candidates_cos = knn_impl_cos_k50.recommend(
    bNr2t_users,
    current_dataset,
    k=20,  # return 20 candidates
    filter_viewed=False,  # already viewed items may be recommended
).rename(columns={"score": "cos_score", "rank": "cos_rank"})

candidates_cos

In [14]:
# Save Cosine Model candidates
with open(candidates_data_path + "candidates_cos.dill", "wb") as candidates_file:
    dill.dump(candidates_cos, candidates_file)

### BM25 Recommender


In [15]:
# BM25 Recommender: reload the fitted model from disk
with open(models_path + "knn_impl_bm25_k50.dill", "rb") as model_file:
    knn_impl_bm25_k50 = dill.load(model_file)

In [None]:
# Candidates from the BM25 item-KNN model, with model-specific column names
candidates_bm25 = knn_impl_bm25_k50.recommend(
    bNr2t_users,
    current_dataset,
    k=20,  # return 20 candidates
    filter_viewed=False,  # already viewed items may be recommended
).rename(columns={"score": "bm25_score", "rank": "bm25_rank"})

candidates_bm25

In [17]:
# Save BM25 Model candidates
with open(candidates_data_path + "candidates_bm25.dill", "wb") as candidates_file:
    dill.dump(candidates_bm25, candidates_file)

### TFIDF Recommender


In [18]:
# TFIDF Recommender: reload the fitted model from disk
with open(models_path + "knn_impl_tfidf_k50.dill", "rb") as model_file:
    knn_impl_tfidf_k50 = dill.load(model_file)

In [None]:
# Candidates from the TF-IDF item-KNN model, with model-specific column names
candidates_tfidf = knn_impl_tfidf_k50.recommend(
    bNr2t_users,
    current_dataset,
    k=20,  # return 20 candidates
    filter_viewed=False,  # already viewed items may be recommended
).rename(columns={"score": "tfidf_score", "rank": "tfidf_rank"})

candidates_tfidf

In [20]:
# Save TFIDF Model candidates
with open(candidates_data_path + "candidates_tfidf.dill", "wb") as candidates_file:
    dill.dump(candidates_tfidf, candidates_file)

### LightFM Recommender


In [21]:
# LightFM Recommender: reload the fitted model from disk
with open(models_path + "lfm_model.dill", "rb") as model_file:
    lfm_model = dill.load(model_file)

In [None]:
# Candidates from the LightFM model, with model-specific column names
candidates_lfm = lfm_model.recommend(
    bNr2t_users,
    current_dataset,
    k=20,  # return 20 candidates
    filter_viewed=False,  # already viewed items may be recommended
).rename(columns={"score": "lfm_score", "rank": "lfm_rank"})

candidates_lfm

In [23]:
# Save LightFM Model candidates
with open(candidates_data_path + "candidates_lfm.dill", "wb") as candidates_file:
    dill.dump(candidates_lfm, candidates_file)

# Сливаем всех кандидатов в одну таблицу

In [5]:
# Same value as in the "Data paths" section.
# NOTE(review): presumably repeated so the merge section can be run from a
# fresh kernel — confirm, or drop the duplicate.
models_path = "../models/"

In [6]:
# Same value as in the "Data paths" section (duplicate definition).
candidates_data_path = models_path + "candidates_data/"

In [7]:
# Exclusive upper bound on a model's rank for a candidate to be kept.
# NOTE(review): `rank < 15` keeps ranks 1..14 if ranks start at 1, which matches
# the ~14 rows per user in the printed shapes — confirm 14 (not 15) is intended.
RANK_CUTOFF = 15


def _load_top_candidates(file_name, rank_column, rank_cutoff=RANK_CUTOFF):
    """Load one saved candidates table and keep only its top-ranked rows.

    Parameters
    ----------
    file_name : str
        File name inside ``candidates_data_path``.
    rank_column : str
        Name of the model-specific rank column to filter on.
    rank_cutoff : int
        Rows with ``rank_column >= rank_cutoff`` are dropped.

    Returns
    -------
    The loaded table restricted to rows whose rank is below ``rank_cutoff``.
    """
    # dill.load can execute arbitrary code: only read files this notebook wrote.
    with open(candidates_data_path + file_name, "rb") as candidates_file:
        loaded = dill.load(candidates_file)
    return loaded[loaded[rank_column] < rank_cutoff]


candidates_cos = _load_top_candidates("candidates_cos.dill", "cos_rank")
candidates_bm25 = _load_top_candidates("candidates_bm25.dill", "bm25_rank")
candidates_tfidf = _load_top_candidates("candidates_tfidf.dill", "tfidf_rank")
candidates_lfm = _load_top_candidates("candidates_lfm.dill", "lfm_rank")

In [8]:
# Per-model candidate tables, in the order they are merged below
candidates_list = [candidates_cos, candidates_bm25, candidates_tfidf, candidates_lfm]

In [9]:
# Print (rows, columns) of every per-model candidates table
for candidates_table in candidates_list:
    print(candidates_table.shape)

(7564556, 4)
(7564556, 4)
(7564556, 4)
(7573538, 4)


## Concatenate

In [10]:
# Key columns shared by every candidates table
merge_keys = ["user_id", "item_id"]

# Start from a copy of the first table, then outer-join the others one by one
# on (user_id, item_id), so a pair proposed by any model is kept.
candidates = candidates_list[0].copy()

for model_candidates in candidates_list[1:]:
    merged_so_far = candidates.set_index(merge_keys)
    next_table = model_candidates.set_index(merge_keys)
    candidates = pd.concat(
        [merged_so_far, next_table],
        join="outer",
        axis=1,
    ).reset_index()

In [11]:
# Check the (rows, columns) shape of the merged candidates table
candidates.shape

(15969218, 10)

In [None]:
# Preview the merged table; pairs a model did not propose hold NaN in that
# model's score/rank columns because of the outer join.
candidates.head(10)


### Fill NaN

In [13]:
score_columns = ["cos_score", "bm25_score", "tfidf_score", "lfm_score"]
rank_columns = ["cos_rank", "bm25_rank", "tfidf_rank", "lfm_rank"]

# Fill values for pairs a model did not propose: a score just below the
# model's observed minimum and a rank one past its observed maximum.
default_values_merged = {}

for score_column in score_columns:
    default_values_merged[score_column] = candidates[score_column].min() - 0.01

for rank_column in rank_columns:
    default_values_merged[rank_column] = candidates[rank_column].max() + 1

In [None]:
# Replace missing scores/ranks with the per-column defaults
candidates = candidates.fillna(default_values_merged)
candidates.head(10)

In [15]:
# Checkpoint: persist the merged, NaN-free candidates table
with open(candidates_data_path + "candidates_bNr_full.dill", "wb") as checkpoint_file:
    dill.dump(candidates, checkpoint_file)