# Stage 5 (3.1 bNr_1_stage_models)

In [1]:
import warnings


# ----------------
# Data processing
# ----------------
import dill

import polars as pl

# ---------------------
# RecSys models imports
# ---------------------
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    LightFMWrapperModel,
    implicit_knn,
)
from implicit import nearest_neighbours


# Suppress every warning for the rest of the session (blanket "ignore" filter).
warnings.simplefilter("ignore")

# Shared seed constant; by its name, meant for the stochastic parts of the notebook.
RANDOM_STATE = 42

## Data paths

In [2]:
# Relative locations of the input data and of the persisted models.
data_path = "../data/closed/"
models_path = "../data/models/"
# Candidate tables are written to a sub-folder of the models directory.
candidates_data_path = f"{models_path}candidates_data/"


# Load Data & create Rectools Dataset

Выборки по пользователям (USER_ID), участвующим в этапе обучения моделей первого уровня

In [3]:
# Users from test_df who will receive targeted recommendations.
# NOTE: dill.load can execute arbitrary code; only load files you trust.
with open(f"{data_path}bNr2t_users.dill", mode="rb") as users_file:
    bNr2t_users = dill.load(users_file)

# Display the loaded user ids together with the array shape.
(bNr2t_users, bNr2t_users.shape)

(array([4194304, 2097153, 3145730, ..., 3145714, 3145717, 2097143]), (540967,))

In [4]:
# Lazily scan both interaction tables; data is only read when collected.
base_models_data, ranker_data = (
    pl.scan_parquet(f"{data_path}{table_name}.parquet")
    for table_name in ("base_models_data", "ranker_data")
)

In [5]:
# Source column names mapped to the column names used by RecTools.
rectools_column_map = {
    "user_id": Columns.User,
    "item_id": Columns.Item,
    "dt": Columns.Datetime,
    "cum_weight": Columns.Weight,
}

# Stack the base-model and ranker interactions, keeping only the mapped
# columns, and rename them to the RecTools standard.
interactions_lazy = pl.concat(
    [
        source.select(list(rectools_column_map))
        for source in (base_models_data, ranker_data)
    ],
    how="vertical",
).rename(rectools_column_map)

# Build the interactions dataset from the materialised frame,
# converted to pandas before it is handed to RecTools.
current_dataset = RTDataset.construct(
    interactions_df=interactions_lazy.collect().to_pandas()
)

8 s

# Обучение моделей первого уровня для обучения ранкера

## Rectools Implicit


In [6]:
# Three implicit item-KNN recommenders (cosine, BM25, TF-IDF), each built
# with K=50 and wrapped for the RecTools model interface.
knn_impl_cos_k50, knn_impl_bm25_k50, knn_impl_tfidf_k50 = (
    implicit_knn.ImplicitItemKNNWrapperModel(model=recommender_cls(K=50))
    for recommender_cls in (
        nearest_neighbours.CosineRecommender,
        nearest_neighbours.BM25Recommender,
        nearest_neighbours.TFIDFRecommender,
    )
)

In [7]:
# Fit every KNN model on the full interactions dataset.
knn_impl_cos_k50.fit(current_dataset)
knn_impl_bm25_k50.fit(current_dataset)
# The trailing semicolon keeps the value of the last call out of the cell
# output. It replaces `%clear`, which does not clear notebook output and
# only writes a raw terminal escape sequence into it.
knn_impl_tfidf_k50.fit(current_dataset);

[H[2J

1 m 4 s for 3 models

~21 s for 1 model

In [8]:
# Persist each fitted KNN model with dill, then drop the in-memory object
# so it does not keep occupying kernel memory.

with open(f"{models_path}knn_impl_cos_k50.dill", mode="wb") as cos_file:
    dill.dump(knn_impl_cos_k50, cos_file)
del knn_impl_cos_k50

with open(f"{models_path}knn_impl_bm25_k50.dill", mode="wb") as bm25_file:
    dill.dump(knn_impl_bm25_k50, bm25_file)
del knn_impl_bm25_k50

with open(f"{models_path}knn_impl_tfidf_k50.dill", mode="wb") as tfidf_file:
    dill.dump(knn_impl_tfidf_k50, tfidf_file)
del knn_impl_tfidf_k50


3 s

## Rectools LightFM

In [9]:
# Define the LightFM model (WARP loss, 64 components) wrapped for RecTools.
lfm_model = LightFMWrapperModel(
    LightFM(
        no_components=64,
        learning_rate=0.1,
        loss="warp",
        max_sampled=7,
        # Seed the model so repeated runs start from the same initialisation.
        # NOTE(review): with num_threads > 1 training may still show some
        # run-to-run variation — confirm against the LightFM docs.
        random_state=RANDOM_STATE,
    ),
    epochs=20,
    num_threads=6,
    verbose=1,
)

In [10]:
# Train LightFM on the full interactions dataset. The trailing semicolon
# keeps the call's return value out of the cell output; it replaces
# `%clear`, which only wrote a raw terminal escape sequence into the notebook.
lfm_model.fit(dataset=current_dataset);

Epoch: 100%|██████████| 20/20 [10:59<00:00, 32.98s/it]

[H[2J




11 m 

In [11]:
# Persist the fitted LightFM model with dill, then drop the in-memory object.
with open(f"{models_path}lfm_model.dill", mode="wb") as lfm_file:
    dill.dump(lfm_model, lfm_file)
del lfm_model

3 s

## Получим кандидатов для переранжирования ранкером

In [12]:
# Same values as in the data-paths cell; presumably repeated so that the
# candidate-generation section can be run after a kernel restart.
models_path = "../data/models/"
candidates_data_path = f"{models_path}candidates_data/"


### Cosine Recommender

In [13]:
# Cosine Recommender: load the saved model, generate candidates for the
# target users and store them with model-specific column names.
cos_column_names = {"score": "cos_score", "rank": "cos_rank"}
cos_candidates_file = candidates_data_path + "candidates_cos.parquet"

with open(f"{models_path}knn_impl_cos_k50.dill", mode="rb") as model_file:
    pl.from_pandas(
        dill.load(model_file).recommend(
            bNr2t_users,
            current_dataset,
            k=15,  # request 15 candidates
            filter_viewed=False,  # already-viewed items may be recommended
        )
    ).rename(cos_column_names).write_parquet(cos_candidates_file)

1 m 5 s

### BM25 Recommender


In [14]:
# BM25 Recommender: load the saved model, generate candidates for the
# target users and store them with model-specific column names.
bm25_column_names = {"score": "bm25_score", "rank": "bm25_rank"}
bm25_candidates_file = candidates_data_path + "candidates_bm25.parquet"

with open(f"{models_path}knn_impl_bm25_k50.dill", mode="rb") as model_file:
    pl.from_pandas(
        dill.load(model_file).recommend(
            bNr2t_users,
            current_dataset,
            k=15,  # request 15 candidates
            filter_viewed=False,  # already-viewed items may be recommended
        )
    ).rename(bm25_column_names).write_parquet(bm25_candidates_file)

1 m 4 s

### TFIDF Recommender


In [15]:
# TFIDF Recommender: load the saved model, generate candidates for the
# target users and store them with model-specific column names.
tfidf_column_names = {"score": "tfidf_score", "rank": "tfidf_rank"}
tfidf_candidates_file = candidates_data_path + "candidates_tfidf.parquet"

with open(f"{models_path}knn_impl_tfidf_k50.dill", mode="rb") as model_file:
    pl.from_pandas(
        dill.load(model_file).recommend(
            bNr2t_users,
            current_dataset,
            k=15,  # request 15 candidates
            filter_viewed=False,  # already-viewed items may be recommended
        )
    ).rename(tfidf_column_names).write_parquet(tfidf_candidates_file)

1 m 3 s

### LightFM Recommender


In [6]:
# LightFM Recommender: load the saved model, generate candidates for the
# target users and store them with model-specific column names.
lfm_column_names = {"score": "lfm_score", "rank": "lfm_rank"}
lfm_candidates_file = candidates_data_path + "candidates_lfm.parquet"

with open(f"{models_path}lfm_model.dill", mode="rb") as model_file:
    pl.from_pandas(
        dill.load(model_file).recommend(
            bNr2t_users,
            current_dataset,
            k=15,  # request 15 candidates
            filter_viewed=False,  # already-viewed items may be recommended
        )
    ).rename(lfm_column_names).write_parquet(lfm_candidates_file)

6 m 

# Сливаем всех кандидатов в одну таблицу

In [11]:
# Folder that holds the persisted first-level models and the candidate tables.
models_path = "../data/models/"

In [12]:
# Candidate parquet files live in a sub-folder of the models directory.
candidates_data_path = f"{models_path}candidates_data/"

In [13]:
# Fill expressions for (user, item) pairs that a given model did not produce:
# a score 0.01 below that model's minimum and a rank one past its maximum.
# Key order: all *_score entries first, then all *_rank entries.
default_values_merged = {
    **{
        f"{model_name}_score": pl.col(f"{model_name}_score").min() - 0.01
        for model_name in ("cos", "bm25", "tfidf", "lfm")
    },
    **{
        f"{model_name}_rank": pl.col(f"{model_name}_rank").max() + 1
        for model_name in ("cos", "bm25", "tfidf", "lfm")
    },
}

In [14]:
# Merge the per-model candidate tables into one table keyed by
# (user_id, item_id) and fill the gaps with per-model default values.
candidates_list = ["cos", "bm25", "tfidf", "lfm"]

# Every model was asked for k=15 candidates per user. RecTools ranks start
# at 1, so the cut-off has to be inclusive: the previous `rank < 15` filter
# silently dropped each model's 15th candidate.
max_rank = 15

# Start from the first model's candidates.
candidates = pl.scan_parquet(
    candidates_data_path + f"candidates_{candidates_list[0]}.parquet"
).filter(pl.col(f"{candidates_list[0]}_rank") <= max_rank)

# Outer-join the remaining models one by one.
for cand in candidates_list[1:]:
    candidates = (
        candidates.join(
            other=pl.scan_parquet(
                candidates_data_path + f"candidates_{cand}.parquet"
            ).filter(pl.col(f"{cand}_rank") <= max_rank),
            # NOTE(review): newer polars releases spell this join "full";
            # confirm against the pinned polars version before changing it.
            how="outer",
            on=["user_id", "item_id"],
        )
        # Rows present only on the right side have null left-hand keys;
        # take the keys from the *_right columns, then drop those columns.
        .with_columns(
            pl.col("user_id").fill_null(pl.col("user_id_right")),
            pl.col("item_id").fill_null(pl.col("item_id_right")),
        )
        .drop(["user_id_right", "item_id_right"])
    )


# Materialise the merged table, replace missing scores/ranks with the
# per-model defaults and write the final candidates table.
candidates.collect().with_columns(
    [
        pl.col(col_name).fill_null(fill_expr)
        for col_name, fill_expr in default_values_merged.items()
    ]
).write_parquet(candidates_data_path + "candidates_full.parquet")

12.5 s