# Stage 5 (2.1 data_preprocessing_for_2_stage_model)

# Импортируем библиотеки

In [1]:
from typing import Dict, Any
import warnings


# ----------------
# Data processing
# ----------------
import dill

import numpy as np
import pandas as pd
# import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from tqdm.auto import tqdm


# Silence every warning raised by the libraries used in this notebook.
# NOTE(review): a blanket "ignore" also hides deprecation and dtype warnings.
warnings.filterwarnings("ignore")

# Seed passed to train_test_split further down, so the user split is reproducible.
RANDOM_STATE = 42

# Импортируем пути

In [2]:
# Directory with the input and intermediate data tables (relative path).
data_path = "../data_closed/"

In [3]:
# Directory where fitted artifacts (e.g. the item encoder) are stored.
models_path = "../models/"

In [4]:
# Sub-directory of models_path that holds the first-stage candidate tables.
candidates_data_path = models_path + "candidates_data/"

## Transform ITEMS data for RANKER

Немного информации из df_items, а также преобразуем данную таблицу (закодируем категориальные признаки)

In [5]:
# Load the items table.
# NOTE(review): dill.load executes arbitrary code from the file — only load trusted files.
with open(data_path + "df_items.dill", "rb") as f:
    df_items = dill.load(f)

In [None]:
# Inspect which columns the items table provides.
df_items.columns

In [7]:
# Categorical item columns that are ordinal-encoded in the following cells.
items_cat_cols = [
    "brand",
    "color",
    "closure",
    "country",
    "cut",
    "height",
    "length",
    "material",
    "model",
    "neckline",
    "pattern",
    "pocket",
    "purpose",
    "sleeve",
]

In [8]:
# Fit an ordinal encoder (int64 codes) on the categorical item columns;
# fit() returns the fitted encoder itself.
items_cat_enc = OrdinalEncoder(dtype=np.int64).fit(df_items[items_cat_cols])

# Persist the fitted encoder so that later stages can reuse the same mapping
with open(models_path + "df_items_encoder.dill", "wb") as encoder_file:
    dill.dump(items_cat_enc, encoder_file)

In [None]:
# Replace the categorical item columns with their integer codes, using the
# encoder that was fitted (and saved) in the previous cell.
# The encoder must not be re-created here: a fresh, unfitted OrdinalEncoder
# raises NotFittedError as soon as transform() is called.
df_items[items_cat_cols] = items_cat_enc.transform(df_items[items_cat_cols])
display(df_items)

In [None]:
# Save the encoded items table under a new name (the raw df_items.dill is kept).
with open(data_path + "df_items_mod.dill", "wb") as f:
    dill.dump(df_items, f)

# Feature Engineering

In [None]:
# Load the data table used by the first-level models
with open(data_path + "base_models_data.dill", "rb") as f:
    base_models_data = dill.load(f)

# Load the (already encoded) items table
with open(data_path + "df_items_mod.dill", "rb") as f:
    df_items = dill.load(f)


In [None]:
# Rename the interaction-count column for convenience
base_models_data = base_models_data.rename(columns={"u_total_inter": "user_hist"})

# Item popularity: number of rows (non-null user_id) recorded for each item
base_models_data["item_pop"] = (
    base_models_data.groupby("item_id")["user_id"].transform("count")
)

# Mean popularity of the items each user interacted with
base_models_data["user_avg_pop"] = (
    base_models_data.groupby("user_id")["item_pop"].transform("mean")
)

# Mean history length of the users who interacted with each item
base_models_data["item_avg_hist"] = (
    base_models_data.groupby("item_id")["user_hist"].transform("mean")
)

# Popularity of each user's latest item: order every user's rows by "dt"
# descending so that "first" picks the row with the largest "dt"
base_models_data = base_models_data.sort_values(
    by=["user_id", "dt"],
    ascending=[True, False],
    ignore_index=True,
)
base_models_data["user_last_pop"] = (
    base_models_data.groupby("user_id")["item_pop"].transform("first")
)


base_models_data.head(3)

In [None]:
# Attach the new item-level features to the items table
item_features = base_models_data[
    ["item_id", "item_pop", "item_avg_hist"]
].drop_duplicates()
df_items = df_items.merge(item_features, how="left", on="item_id")

# Build the user feature table.
# NOTE(review): this yields one row per user only if "user_hist" is constant
# within a user — confirm against the upstream stage.
user_feature_cols = ["user_id", "user_hist", "user_avg_pop", "user_last_pop"]
df_users = base_models_data[user_feature_cols].drop_duplicates()

In [None]:
# Save the updated tables: the items table (overwrites df_items_mod.dill)
# and the new user feature table.

with open(data_path + "df_items_mod.dill", "wb") as f:
    dill.dump(df_items, f)

with open(data_path + "df_users.dill", "wb") as f:
    dill.dump(df_users, f)

## Load data

In [None]:
# Load the interaction table reserved for the ranker
with open(data_path + "ranker_data.dill", "rb") as f:
    ranker_data = dill.load(f)


# Load the table of first-stage candidates
with open(candidates_data_path + "candidates_full.dill", "rb") as f:
    candidates_full = dill.load(f)

In [None]:
# Users who need predictions for ranker training, i.e. users present in both
# base_models_data and ranker_data (base-to-ranker users)
with open(data_path + "b2r_users.dill", "rb") as f:
    b2r_users = dill.load(f)


# Users from test_df who will receive targeted recommendations
with open(data_path + "bNr2t_users.dill", "rb") as f:
    bNr2t_users = dill.load(f)


In [None]:
# Fill values for (user, item) pairs that a first-stage model did not produce:
# the lowest observed score and the highest (worst) observed rank per model.
default_values_candidates = {
    **{
        score_col: candidates_full[score_col].min()
        for score_col in ("cos_score", "bm25_score", "tfidf_score", "lfm_score")
    },
    **{
        rank_col: candidates_full[rank_col].max()
        for rank_col in ("cos_rank", "bm25_rank", "tfidf_rank", "lfm_rank")
    },
}

# Модель второго уровня (ранкер)

## Ranker Data

### Remove unnecessary

In [None]:
# Keep only the columns the ranker needs:
#   - "user_id", "item_id": interaction keys;
#   - "ui_inter": number of interactions with the item, used later to
#     build the ranker target.
# Dropped on purpose:
#   - "dt": the train/val split is not time-based, so it is not needed;
#   - "weight", "cum_weight", "rel_weight": they were derived from similar
#     considerations and depend on roughly the same features, and
#     "rel_weight" was what the first-level model was trained on;
#   - everything else: either used to derive the weight columns or already
#     present among the user / item features.
ranker_columns = ["user_id", "item_id", "ui_inter"]

# The ranker is trained only on users with a long interaction history
has_long_history = ranker_data["u_total_inter"] > 75
ranker_data = ranker_data.loc[has_long_history, ranker_columns]

### Train \ Val \ Test Split

In [None]:
# Теперь ranker_data разбиваем по юзерам
# на train и val для обучения и валидации ранкера
train_size = 0.8
val_size = 0.2


ranker_train_users, ranker_val_users = train_test_split(
    ranker_data[ranker_data["user_id"].isin(b2r_users)]["user_id"],
    random_state=RANDOM_STATE,
    test_size=val_size,
)

# test-выборка у нас уже имеется 
# выборка пользователей присутствующих в base & ranker & test
# на них и будем проводить первичный тест системы
ranker_test_users = bNr2t_users

%clear

In [None]:
# Build the ranker table for a set of users from their interactions
# and their first-stage candidates
def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame,
    df: pd.DataFrame,
    max_rank: int = 15,
    candidate_defaults: Dict[str, Any] = None,
) -> pd.DataFrame:
    """
    Combine interactions and candidate recommendations of the given users.

    Both tables are restricted to ``user_list`` and outer-joined on
    ('user_id', 'item_id'), so the result holds interacted items as well as
    recommended-but-not-interacted items. Missing values are filled with
    defaults and only rows where at least one model ranks the item better
    than ``max_rank`` are returned.

    Args:
        user_list (np.ndarray): User IDs to include.
        candidates_df (pd.DataFrame): Candidate item recommendations with
            'user_id', 'item_id' and the rank columns
            ('cos_rank', 'bm25_rank', 'lfm_rank', 'tfidf_rank').
        df (pd.DataFrame): User-item interactions with 'user_id', 'item_id'
            and 'ui_inter'.
        max_rank (int): Exclusive upper bound on the rank; a row is kept when
            at least one rank column is below it. Defaults to 15.
        candidate_defaults (Dict[str, Any]): Fill values for the candidate
            score / rank columns. When None, the module-level
            ``default_values_candidates`` is used.

    Returns:
        pd.DataFrame: Merged table sorted by ('user_id', 'item_id') with
            missing values filled and 'ui_inter' cast to int, restricted to
            rows with at least one rank < ``max_rank``.
    """
    if candidate_defaults is None:
        candidate_defaults = default_values_candidates

    # Fill values for the merged table; keys that are not columns of the
    # merged frame are ignored by fillna
    fill_values = {
        "ui_inter": 0,
        "weight": 0.0,
        "cum_weight": 0.0,
        **candidate_defaults,
    }

    # Restrict both tables to the requested users
    interactions = df[df["user_id"].isin(user_list)]
    candidates = candidates_df[candidates_df["user_id"].isin(user_list)]

    # Outer join: keeps interactions without a candidate row and candidates
    # without an interaction (the latter get ui_inter = 0 below)
    merged = interactions.merge(
        candidates,
        how="outer",
        on=["user_id", "item_id"],
    ).fillna(fill_values)
    merged["ui_inter"] = merged["ui_inter"].astype(int)

    merged = merged.sort_values(by=["user_id", "item_id"])

    # Keep rows that at least one first-stage model ranks better than max_rank
    return merged[
        (merged["cos_rank"] < max_rank)
        | (merged["bm25_rank"] < max_rank)
        | (merged["lfm_rank"] < max_rank)
        | (merged["tfidf_rank"] < max_rank)
    ]


In [None]:
# Build the ranker table for the train users
ranker_train = users_filter(ranker_train_users, candidates_full, ranker_data)

# Save
with open(data_path + "ranker_train.dill", "wb") as f:
    dill.dump(ranker_train, f)

In [None]:
# Preview the first rows of the train table
ranker_train.head(3)

In [None]:
# Build the ranker table for the validation users
ranker_val = users_filter(ranker_val_users, candidates_full, ranker_data)

# Save
with open(data_path + "ranker_val.dill", "wb") as f:
    dill.dump(ranker_val, f)

In [None]:
# Preview the first rows of the validation table
ranker_val.head(3)

In [None]:
# Build the ranker table for the test users
ranker_test = users_filter(ranker_test_users, candidates_full, ranker_data)

# Save
with open(data_path + "ranker_test.dill", "wb") as f:
    dill.dump(ranker_test, f)

In [None]:
# Preview the first rows of the test table
ranker_test.head(3)

In [None]:
# Show the resulting train table
ranker_train

In [None]:
# Show the resulting validation table
ranker_val

## Добавим фичи предметов и пользователей

### Пользователей

In [None]:
# Load the user feature table
with open(data_path + "df_users.dill", "rb") as f:
    df_users = dill.load(f)

# Fill values for users that have no row in df_users:
# zero history and the median of the two popularity features
default_values_users = {
    "user_hist": 0,
    "user_avg_pop": df_users["user_avg_pop"].median(),
    "user_last_pop": df_users["user_last_pop"].median(),
}

In [None]:
# Attach user features
def add_users_features(
    df: pd.DataFrame,
    users: pd.DataFrame,
) -> pd.DataFrame:
    """
    Left-join user features onto an interaction table and fill the gaps.

    Args:
        df (pd.DataFrame): Interaction DataFrame with a 'user_id' column.
        users (pd.DataFrame): User features DataFrame keyed by 'user_id'.

    Returns:
        pd.DataFrame: ``df`` extended with the user feature columns; missing
            values are filled from the module-level ``default_values_users``.
    """
    # Only the users that actually occur in df take part in the join
    matched_users = users.loc[users["user_id"].isin(df["user_id"])]
    enriched = df.merge(matched_users, how="left", on=["user_id"])

    # Users without a feature row end up with NaNs after the left join;
    # replace them with the configured defaults
    return enriched.fillna(default_values_users)

In [None]:
# Reload the ranker_train table
with open(data_path + "ranker_train.dill", "rb") as f:
    ranker_train = dill.load(f)

# Attach user features; item features are added in a later cell
ranker_train = add_users_features(ranker_train, df_users)

# Save (overwrites ranker_train.dill with the enriched table)
with open(data_path + "ranker_train.dill", "wb") as f:
    dill.dump(ranker_train, f)

In [None]:
# Reload the ranker_val table
with open(data_path + "ranker_val.dill", "rb") as f:
    ranker_val = dill.load(f)

# Attach user features; item features are added in a later cell
ranker_val = add_users_features(ranker_val, df_users)

# Save (overwrites ranker_val.dill with the enriched table)
with open(data_path + "ranker_val.dill", "wb") as f:
    dill.dump(ranker_val, f)

In [None]:
# Reload the ranker_test table
with open(data_path + "ranker_test.dill", "rb") as f:
    ranker_test = dill.load(f)

# Attach user features; item features are added in a later cell
ranker_test = add_users_features(ranker_test, df_users)

# Save (overwrites ranker_test.dill with the enriched table)
with open(data_path + "ranker_test.dill", "wb") as f:
    dill.dump(ranker_test, f)

### Предметов

In [None]:
# Load the items table (encoded, with the engineered item features)
with open(data_path + "df_items_mod.dill", "rb") as f:
    df_items = dill.load(f)

# Fill values for the engineered item features: the median of each feature
default_values_items = {
    "item_pop": df_items["item_pop"].median(),
    "item_avg_hist": df_items["item_avg_hist"].median(),
}

In [None]:
# Attach item features
def add_items_features(
    df: pd.DataFrame,
    items: pd.DataFrame,
) -> pd.DataFrame:
    """
    Left-join item features onto an interaction table and fill the gaps.

    Args:
        df (pd.DataFrame): Interaction DataFrame with an 'item_id' column.
        items (pd.DataFrame): Item features DataFrame keyed by 'item_id'.

    Returns:
        pd.DataFrame: ``df`` extended with the item columns; missing values
            in the columns listed in the module-level
            ``default_values_items`` are filled with those defaults.
    """
    # Only the items that actually occur in df take part in the join
    wanted_item_ids = df["item_id"].unique()
    matched_items = items.loc[items["item_id"].isin(wanted_item_ids)]
    enriched = df.merge(matched_items, how="left", on=["item_id"])

    # Items without a feature row end up with NaNs after the left join;
    # replace them with the configured defaults
    return enriched.fillna(default_values_items)

In [None]:
# Reload ranker_train (saved with user features by an earlier cell)
with open(data_path + "ranker_train.dill", "rb") as f:
    ranker_train = dill.load(f)

ranker_train = add_items_features(ranker_train, df_items)

# Save the fully enriched table under a separate "_final" name
with open(data_path + "ranker_train_final.dill", "wb") as f:
    dill.dump(ranker_train, f)

In [None]:
# Reload ranker_val (saved with user features by an earlier cell)
with open(data_path + "ranker_val.dill", "rb") as f:
    ranker_val = dill.load(f)

# User features are already in the loaded table; only item features are added here
ranker_val = add_items_features(ranker_val, df_items)

# Save the fully enriched table under a separate "_final" name
with open(data_path + "ranker_val_final.dill", "wb") as f:
    dill.dump(ranker_val, f)

In [None]:
# Reload ranker_test (saved with user features by an earlier cell)
with open(data_path + "ranker_test.dill", "rb") as f:
    ranker_test = dill.load(f)

# User features are already in the loaded table; only item features are added here
ranker_test = add_items_features(ranker_test, df_items)

# Save the fully enriched table under a separate "_final" name
with open(data_path + "ranker_test_final.dill", "wb") as f:
    dill.dump(ranker_test, f)

## Добавим таргет

In [None]:
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add an integer 'target' column derived from the 'ui_inter' counts.

    Mapping (the highest matching threshold wins):
        ui_inter > 6 -> 10, > 4 -> 8, > 2 -> 4, > 1 -> 2, otherwise 1.

    The column is written into ``df`` itself, which is also returned.

    Args:
        df (pd.DataFrame): Table with a 'ui_inter' column.

    Returns:
        pd.DataFrame: The same DataFrame object with the 'target' column set.
    """
    # NOTE(review): ui_inter == 0 and ui_inter == 1 both map to target 1 —
    # confirm that rows without any interaction are meant to share that label.
    grades = 1
    # Thresholds in ascending order, so a later (higher) one overrides earlier ones
    for threshold, grade in ((1, 2), (2, 4), (4, 8), (6, 10)):
        grades = np.where(df["ui_inter"] > threshold, grade, grades)

    df["target"] = grades
    df["target"] = df["target"].astype(int)

    return df

In [None]:
# Load the ranker_train table
with open(data_path + "ranker_train_final.dill", "rb") as f:
    ranker_train = dill.load(f)

# Load the ranker_val table
with open(data_path + "ranker_val_final.dill", "rb") as f:
    ranker_val = dill.load(f)

# Load the ranker_test table
with open(data_path + "ranker_test_final.dill", "rb") as f:
    ranker_test = dill.load(f)

In [None]:
# Add the 'target' column to every split (add_target writes into the frame it receives)
ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

In [None]:
# Save the train table (overwrites the "_final" file, now including 'target')
with open(data_path + "ranker_train_final.dill", "wb") as f:
    dill.dump(ranker_train, f)

# Save the validation table
with open(data_path + "ranker_val_final.dill", "wb") as f:
    dill.dump(ranker_val, f)

# Save the test table
with open(data_path + "ranker_test_final.dill", "wb") as f:
    dill.dump(ranker_test, f)