# Stage 5 (2.1 data_preprocessing_for_2_stage_model)

# Импортируем библиотеки

In [1]:
from typing import Tuple, List, Optional

import warnings

# ----------------
# Data processing
# ----------------
import dill

import numpy as np
import polars as pl

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# Seed passed as `random_state` to train_test_split so the split is repeatable.
RANDOM_STATE = 42

# Импортируем пути

In [2]:
# Base directories used by the notebook; every value ends with a slash so
# file names can be appended directly.
data_path, models_path = "../data/closed/", "../data/models/"
candidates_data_path = f"{models_path}candidates_data/"


## Transform ITEMS data for RANKER

Немного информации из df_items, а также преобразуем данную таблицу (закодируем категориальные признаки)

In [3]:
# Lazily open the items table (nothing is read until it is collected)
df_items = pl.scan_parquet(f"{data_path}df_items.parquet")
df_items

In [4]:
# Show the column names and dtypes of the items table.
df_items.schema

Schema([('item_id', Int64),
        ('brand', String),
        ('closure', String),
        ('country', String),
        ('cut', String),
        ('height', String),
        ('length', String),
        ('material', String),
        ('model', String),
        ('neckline', String),
        ('pattern', String),
        ('pocket', String),
        ('purpose', String),
        ('sleeve', String),
        ('color', Int64),
        ('title_len', UInt32),
        ('descr_len', UInt32),
        ('title_word_len', UInt32),
        ('descr_word_len', UInt32),
        ('txt_emb_pca_0', Float64),
        ('txt_emb_pca_1', Float64),
        ('txt_emb_pca_2', Float64),
        ('txt_emb_pca_3', Float64),
        ('txt_emb_pca_4', Float64),
        ('txt_emb_pca_5', Float64),
        ('txt_emb_pca_6', Float64),
        ('txt_emb_pca_7', Float64),
        ('txt_emb_pca_8', Float64),
        ('txt_emb_pca_9', Float64),
        ('img_emb_pca_0', Float64),
        ('img_emb_pca_1', Float64),
        ('img

In [5]:
def encode_cat_features(df: pl.LazyFrame, cat_cols: List) -> Tuple[pl.DataFrame, OrdinalEncoder]:
    """
    Ordinal-encode the categorical columns of a lazy table.

    The ``cat_cols`` columns are replaced by their integer codes and moved
    to the end of the table; all other columns are passed through unchanged.

    Args:
        df: Lazy table that contains every column listed in ``cat_cols``.
        cat_cols: Names of the columns to encode.

    Returns:
        A tuple ``(encoded_df, encoder)``: the eager DataFrame with the
        untouched columns first and the encoded (int64) columns appended,
        and the fitted ``OrdinalEncoder``.
    """
    # Materialize once so the source is not re-scanned separately for
    # fitting, for transforming and for the passthrough columns.
    materialized = df.collect()
    cat_frame = materialized.select(cat_cols)

    # Fit the encoder and make it emit polars frames
    encoder = OrdinalEncoder(dtype=np.int64)
    encoder.set_output(transform="polars")
    encoder.fit(cat_frame)

    # Passthrough columns first, encoded columns last
    return (
        pl.concat(
            [
                materialized.drop(cat_cols),
                encoder.transform(cat_frame),
            ],
            how="horizontal",
        ),
        encoder,
    )

In [6]:
# Item columns that are treated as categorical and ordinal-encoded.
items_cat_cols = [
    "brand", "color", "closure", "country", "cut", "height", "length",
    "material", "model", "neckline", "pattern", "pocket", "purpose", "sleeve",
]

In [7]:
def encode_items_data(
    items_path: str,
    save_items_path: str,
    save_enc_path: Optional[str] = None,
):
    """
    Encode the categorical columns of the items table and persist the result.

    Args:
        items_path: Parquet file with the raw items table.
        save_items_path: Destination parquet file for the encoded table.
        save_enc_path: Optional destination for the fitted encoder
            (serialized with dill); nothing is saved when it is empty.
    """
    encoded_items, fitted_encoder = encode_cat_features(
        pl.scan_parquet(items_path), items_cat_cols
    )
    encoded_items.write_parquet(save_items_path)

    # The encoder is only persisted on request
    if not save_enc_path:
        return

    with open(save_enc_path, "wb") as enc_file:
        dill.dump(fitted_encoder, enc_file)

In [8]:
# Encode the raw items table; store the encoded table and the fitted encoder.
encode_items_data(
    items_path=f"{data_path}df_items.parquet",
    save_items_path=f"{data_path}df_items_mod.parquet",
    save_enc_path=f"{models_path}df_items_mod_encoder.dill",
)

In [9]:
# Check the schema of the encoded items table that was just written.
pl.scan_parquet(data_path + "df_items_mod.parquet").schema

Schema([('item_id', Int64),
        ('title_len', UInt32),
        ('descr_len', UInt32),
        ('title_word_len', UInt32),
        ('descr_word_len', UInt32),
        ('txt_emb_pca_0', Float64),
        ('txt_emb_pca_1', Float64),
        ('txt_emb_pca_2', Float64),
        ('txt_emb_pca_3', Float64),
        ('txt_emb_pca_4', Float64),
        ('txt_emb_pca_5', Float64),
        ('txt_emb_pca_6', Float64),
        ('txt_emb_pca_7', Float64),
        ('txt_emb_pca_8', Float64),
        ('txt_emb_pca_9', Float64),
        ('img_emb_pca_0', Float64),
        ('img_emb_pca_1', Float64),
        ('img_emb_pca_2', Float64),
        ('img_emb_pca_3', Float64),
        ('img_emb_pca_4', Float64),
        ('img_emb_pca_5', Float64),
        ('img_emb_pca_6', Float64),
        ('img_emb_pca_7', Float64),
        ('img_emb_pca_8', Float64),
        ('img_emb_pca_9', Float64),
        ('brand', Int64),
        ('color', Int64),
        ('closure', Int64),
        ('country', Int64),
        ('

# Feature Engineering

In [10]:
def func_feature_engineering_with_interactions(
    df: pl.LazyFrame | pl.DataFrame,
) -> pl.LazyFrame | pl.DataFrame:
    """
    Derive popularity and history features from an interactions table.

    Reads the columns ``user_id``, ``item_id``, ``dt`` and ``u_total_inter``.
    ``u_total_inter`` is renamed to ``user_hist`` and four columns are added:

    * ``item_pop``      - count of ``user_id`` values per item;
    * ``user_avg_pop``  - mean ``item_pop`` over the rows of each user;
    * ``item_avg_hist`` - mean ``user_hist`` over the rows of each item;
    * ``user_last_pop`` - ``item_pop`` of the user's most recent row by ``dt``.

    The result is sorted by ``user_id`` ascending and ``dt`` descending.
    """
    # Shorter name for convenience
    renamed = df.rename({"u_total_inter": "user_hist"})

    # Item popularity
    with_item_pop = renamed.with_columns(
        pl.col("user_id").count().over("item_id").alias("item_pop")
    )

    # Average popularity of the items each user interacted with, and
    # average history length of the users interacting with each item
    with_averages = with_item_pop.with_columns(
        pl.col("item_pop").mean().over("user_id").alias("user_avg_pop"),
        pl.col("user_hist").mean().over("item_id").alias("item_avg_hist"),
    )

    # Newest interaction first within each user, so that `first()` below
    # picks the popularity of the most recently seen item
    newest_first = with_averages.sort(
        by=["user_id", "dt"],
        descending=[False, True],
    )

    return newest_first.with_columns(
        pl.col("item_pop").first().over("user_id").alias("user_last_pop")
    )

In [11]:
def get_tables_with_users_and_items_features(
    interactions_path: str,
    users_path: Optional[str] = None,
    items_path: Optional[str] = None,
    save_users_path: Optional[str] = None,
    save_items_path: Optional[str] = None,
):
    """
    Attach interaction-based features to the users and items tables.

    For each of the two tables, either an existing parquet file is extended
    (``users_path`` / ``items_path``) or, when only a save path is given,
    a fresh table is built from the unique ids found in the interactions.
    Features missing after the left join are filled with 0.

    Args:
        interactions_path: Parquet file with the interactions.
        users_path: Optional existing users table to extend.
        items_path: Optional existing items table to extend.
        save_users_path: Destination parquet file for the users table.
        save_items_path: Destination parquet file for the items table.

    Raises:
        ValueError: If neither a source nor a save path is given for the
            users table or for the items table.

    NOTE(review): both save paths are handed to ``write_parquet``
    unconditionally, so in practice both must be provided.
    """
    # Validate the arguments before touching any file
    if not (users_path or save_users_path):
        raise ValueError(
            "users_path or save_users_path should be passed to function"
        )
    if not (items_path or save_items_path):
        raise ValueError(
            "items_path or save_items_path should be passed to function"
        )

    interactions = pl.scan_parquet(interactions_path)

    if users_path:
        df_users = pl.scan_parquet(users_path)
    else:
        df_users = interactions.select("user_id").unique()

    if items_path:
        df_items = pl.scan_parquet(items_path)
    else:
        df_items = interactions.select("item_id").unique()

    # Build the lazy feature plan once and reuse it for both tables
    features = func_feature_engineering_with_interactions(interactions)

    # Add the new features to the items table
    df_items.join(
        other=features.select(["item_id", "item_pop", "item_avg_hist"]).unique(),
        how="left",
        on="item_id",
    ).fill_null(0).collect().write_parquet(save_items_path)

    # Build the table with user features
    df_users.join(
        other=features.select(
            ["user_id", "user_hist", "user_avg_pop", "user_last_pop"]
        ).unique(),
        how="left",
        on="user_id",
    ).fill_null(0).collect().write_parquet(save_users_path)

In [12]:
# No users table exists yet, so it is built from the interactions; the
# encoded items table is read and overwritten in place with the new features.
get_tables_with_users_and_items_features(
    interactions_path=f"{data_path}base_models_data.parquet",
    users_path=None,
    items_path=f"{data_path}df_items_mod.parquet",
    save_users_path=f"{data_path}df_users.parquet",
    save_items_path=f"{data_path}df_items_mod.parquet",
)

## Load data

In [14]:
# Lazily open the ranker interactions and the first-stage candidates.
ranker_data = pl.scan_parquet(f"{data_path}ranker_data.parquet")
candidates_full = pl.scan_parquet(f"{candidates_data_path}candidates_full.parquet")

In [15]:
# Show the schemas of both input tables side by side.
ranker_data.schema, candidates_full.schema

(Schema([('user_id', Int64),
         ('item_id', Int64),
         ('dt', Datetime(time_unit='ns', time_zone=None)),
         ('ui_inter', UInt32),
         ('u_total_inter', UInt32),
         ('weight', Float64),
         ('ui_entry', Int64),
         ('cum_weight', Float64)]),
 Schema([('user_id', Int64),
         ('item_id', Int64),
         ('cos_score', Float32),
         ('cos_rank', Int64),
         ('bm25_score', Float32),
         ('bm25_rank', Int64),
         ('tfidf_score', Float32),
         ('tfidf_rank', Int64),
         ('lfm_score', Float32),
         ('lfm_rank', Int64)]))

In [16]:
# Users that need predictions for training the ranker, i.e. users present
# both in base_models_data and in ranker_data (base to ranker users)
with open(f"{data_path}b2r_users.dill", "rb") as b2r_file:
    b2r_users = dill.load(b2r_file)


# Users from test_df that will receive targeted recommendations
with open(f"{data_path}bNr2t_users.dill", "rb") as bnr2t_file:
    bNr2t_users = dill.load(bnr2t_file)


In [17]:
candidates_list = ["cos", "bm25", "tfidf", "lfm"]

# Fill values for candidate columns: the minimum observed score and the
# maximum observed rank of every first-stage model. All aggregates are
# requested in one query so the candidates file is scanned only once.
_default_exprs = []
for cand in candidates_list:
    _default_exprs.append(pl.col(f"{cand}_score").min())
    _default_exprs.append(pl.col(f"{cand}_rank").max())

# One-row result -> {"cos_score": ..., "cos_rank": ..., ...}
default_values_candidates = (
    candidates_full.select(_default_exprs).collect().row(0, named=True)
)

In [18]:
# Show the computed fill values.
default_values_candidates

{'cos_score': -0.009999113157391548,
 'cos_rank': 15,
 'bm25_score': -0.009771198034286499,
 'bm25_rank': 15,
 'tfidf_score': -0.009981262497603893,
 'tfidf_rank': 15,
 'lfm_score': -5.932222843170166,
 'lfm_rank': 15}

# Модель второго уровня (ранкер)

## Ranker Data

### Remove unnecessary

In [19]:
# Keep only the columns the ranker needs.

# The ranker is trained on users with a long interaction history
_long_history = pl.col("u_total_inter") > 75
ranker_data = ranker_data.filter(_long_history).select(
    [
        "user_id",
        "item_id",
        # The train/val split is not time based,
        # so the "dt" column is not needed
        # --------------------------
        # Used later to define the ranker target
        # (number of interactions with the item)
        "ui_inter",
        # --------------------------
        # Weight columns ("weight", "cum_weight", "rel_weight") are dropped:
        # they were derived from similar considerations and depend on
        # roughly the same features, and "rel_weight" was the target of the
        # first-stage model, so it is not needed any further
        # --------------------------
        # The remaining columns are dropped as well: they were either used
        # to derive the weight columns or are already present among the
        # user / item features
    ]
)

In [20]:
# Display the filtered lazy frame.
ranker_data

### Train \ Val \ Test Split

In [21]:
# Split ranker_data by user into train and val sets
# for training and validating the ranker
train_size = 0.8
val_size = 0.2

# ranker_data holds user-item rows, so a user_id can occur many times.
# Deduplicate before splitting, otherwise rows (not users) are split and
# the same user ends up in both train and val. Sorting makes the input
# order, and therefore the split, reproducible.
_ranker_users = (
    ranker_data.select("user_id")
    .filter(pl.col("user_id").is_in(b2r_users))
    .unique()
    .sort("user_id")
    .collect()
    .to_numpy()
    .flatten()
)

ranker_train_users, ranker_val_users = train_test_split(
    _ranker_users,
    random_state=RANDOM_STATE,
    test_size=val_size,
)

# The test sample already exists: users present in base & ranker & test.
# They are used for the first end-to-end test of the system.
ranker_test_users = bNr2t_users

In [22]:
# Keep only the users that have both recommendations and targets
def users_filter(
    user_list: np.ndarray,
    candidates_df: pl.LazyFrame,
    df: pl.LazyFrame,
) -> pl.DataFrame:
    """
    Merge interactions with candidate recommendations for the given users.

    Both tables are restricted to ``user_list`` and outer-joined on
    ``(user_id, item_id)``. Missing ``ui_inter`` values become 0 and missing
    candidate scores / ranks are filled from ``default_values_candidates``.

    Args:
        user_list (np.ndarray): User IDs to include.
        candidates_df (pl.LazyFrame): Candidate recommendations with the
            rank columns 'cos_rank', 'bm25_rank', 'lfm_rank', 'tfidf_rank'.
        df (pl.LazyFrame): User-item interactions ('user_id', 'item_id',
            'ui_inter').

    Returns:
        pl.DataFrame: Eager frame sorted by ('user_id', 'item_id') that keeps
            only the rows where at least one of the four ranks is below 15.
    """
    # Values used to fill the gaps produced by the outer join
    fill_values = {"ui_inter": 0}
    fill_values.update(default_values_candidates)

    # Restrict both sides to the requested users
    wanted_user = pl.col("user_id").is_in(user_list)
    interactions = df.filter(wanted_user)
    candidates = candidates_df.filter(wanted_user)

    # Join the interactions onto the candidates; rows that exist only on the
    # candidates side take their keys from the *_right columns
    joined = interactions.join(
        other=candidates,
        how="outer",
        on=["user_id", "item_id"],
    )
    joined = joined.with_columns(
        pl.col("user_id").fill_null(pl.col("user_id_right")),
        pl.col("item_id").fill_null(pl.col("item_id_right")),
    )
    merged = joined.drop(["user_id_right", "item_id_right"]).collect()

    merged = merged.with_columns(
        [pl.col(name).fill_null(value) for name, value in fill_values.items()]
    )

    # Sort by user, then by item
    merged = merged.sort(by=["user_id", "item_id"])

    # Keep a row when any of the four models ranked the item below 15
    rank_columns = ("cos_rank", "bm25_rank", "lfm_rank", "tfidf_rank")
    keep_row = pl.col(rank_columns[0]) < 15
    for rank_column in rank_columns[1:]:
        keep_row = keep_row | (pl.col(rank_column) < 15)

    return merged.filter(keep_row)

In [23]:
# Build and store the train / val / test tables, in that order.
for _split_users, _split_file in (
    (ranker_train_users, "ranker_train.parquet"),
    (ranker_val_users, "ranker_val.parquet"),
    (ranker_test_users, "ranker_test.parquet"),
):
    users_filter(_split_users, candidates_full, ranker_data).write_parquet(
        data_path + _split_file
    )

In [24]:
# Check the schema of the stored train table.
pl.scan_parquet(data_path + "ranker_train.parquet").schema

Schema([('user_id', Int64),
        ('item_id', Int64),
        ('ui_inter', UInt32),
        ('cos_score', Float32),
        ('cos_rank', Int64),
        ('bm25_score', Float32),
        ('bm25_rank', Int64),
        ('tfidf_score', Float32),
        ('tfidf_rank', Int64),
        ('lfm_score', Float32),
        ('lfm_rank', Int64)])

## Добавим фичи предметов и пользователей

### Пользователей

In [25]:
# Lazily open the user features table
df_users = pl.scan_parquet(f"{data_path}df_users.parquet")


# Fill values for the engineered user features: 0 for the history length,
# the column median for the two popularity features
default_values_users = {
    "user_hist": 0,
    **{
        _user_feature: df_users.select(_user_feature).median().collect().item()
        for _user_feature in ("user_avg_pop", "user_last_pop")
    },
}

In [26]:
# Add the user features
def add_users_features(
    df: pl.LazyFrame,
    users: pl.LazyFrame,
) -> pl.DataFrame:
    """
    Left-join user features onto a ranker table and fill the gaps.

    Args:
        df (pl.LazyFrame): Ranker table with a 'user_id' column.
        users (pl.LazyFrame): User features keyed by 'user_id'.

    Returns:
        pl.DataFrame: ``df`` with the user feature columns appended; the
            columns listed in ``default_values_users`` have their nulls
            replaced by the corresponding default.
    """
    # A left join keeps every row of `df` and only the matching rows of
    # `users`, so the users table needs no pre-filtering (which would
    # require an extra eager collect of `df`).
    joined = df.join(
        other=users,
        how="left",
        on=["user_id"],
    )

    # Users without a row in the features table produce nulls after the
    # join; fill them with the defaults.
    fill_exprs = [
        pl.col(col_name).fill_null(default_value)
        for col_name, default_value in default_values_users.items()
    ]
    return joined.with_columns(fill_exprs).collect()

In [27]:
# Enrich every split with the user features and overwrite it in place.
for _split_name in ("train", "val", "test"):
    _split_path = f"{data_path}ranker_{_split_name}.parquet"
    add_users_features(pl.scan_parquet(_split_path), df_users).write_parquet(
        _split_path
    )

In [28]:
# Check the schema of the train table after the user features were added.
pl.scan_parquet(data_path + "ranker_train.parquet").schema


Schema([('user_id', Int64),
        ('item_id', Int64),
        ('ui_inter', UInt32),
        ('cos_score', Float32),
        ('cos_rank', Int64),
        ('bm25_score', Float32),
        ('bm25_rank', Int64),
        ('tfidf_score', Float32),
        ('tfidf_rank', Int64),
        ('lfm_score', Float32),
        ('lfm_rank', Int64),
        ('user_hist', UInt32),
        ('user_avg_pop', Float64),
        ('user_last_pop', Float64)])

### Предметов

In [29]:
# Lazily open the encoded items table
df_items = pl.scan_parquet(f"{data_path}df_items_mod.parquet")

# Fill values for the engineered item features: the column medians
default_values_items = {
    _item_feature: df_items.select(_item_feature).median().collect().item()
    for _item_feature in ("item_pop", "item_avg_hist")
}

In [30]:
# Add the item features
def add_items_features(
    df: pl.LazyFrame,
    items: pl.LazyFrame,
) -> pl.DataFrame:
    """
    Left-join item features onto a ranker table and fill the gaps.

    Args:
        df (pl.LazyFrame): Ranker table with an 'item_id' column.
        items (pl.LazyFrame): Item features keyed by 'item_id'.

    Returns:
        pl.DataFrame: ``df`` with the item feature columns appended; the
            columns listed in ``default_values_items`` have their nulls
            replaced by the corresponding default.
    """
    # A left join keeps every row of `df` and only the matching rows of
    # `items`, so the items table needs no pre-filtering (which would
    # require an extra eager collect of `df`).
    joined = df.join(
        other=items,
        how="left",
        on=["item_id"],
    )

    # Items without a row in the features table produce nulls after the
    # join; fill them with the defaults.
    fill_exprs = [
        pl.col(col_name).fill_null(default_value)
        for col_name, default_value in default_values_items.items()
    ]
    return joined.with_columns(fill_exprs).collect()

In [31]:
# Enrich every split with the item features and overwrite it in place.
for _split_name in ("train", "val", "test"):
    _split_path = f"{data_path}ranker_{_split_name}.parquet"
    add_items_features(pl.scan_parquet(_split_path), df_items).write_parquet(
        _split_path
    )

## Добавим таргет

In [32]:
def add_target(df: pl.LazyFrame) -> pl.DataFrame:
    """
    Append a graded ``target`` column derived from ``ui_inter``.

    Grades: ``ui_inter`` > 6 -> 10, > 4 -> 8, > 2 -> 4, > 1 -> 2,
    everything else -> 1. The frame is collected before it is returned.

    NOTE(review): rows with ``ui_inter`` == 0 receive the same grade (1)
    as rows with a single interaction - confirm this is intended.
    """
    inter_count = pl.col("ui_inter")

    # (exclusive lower bound, grade), checked from the highest bound down
    grade_levels = ((6, 10), (4, 8), (2, 4), (1, 2))
    (top_bound, top_grade), *lower_levels = grade_levels

    grading = pl.when(inter_count > top_bound).then(top_grade)
    for bound, grade in lower_levels:
        grading = grading.when(inter_count > bound).then(grade)

    target = grading.otherwise(1).alias("target")
    return df.with_columns(target).collect()

In [33]:
# Add the target column to every split and overwrite it in place.
for _split_name in ("train", "val", "test"):
    _split_path = f"{data_path}ranker_{_split_name}.parquet"
    add_target(pl.scan_parquet(_split_path)).write_parquet(_split_path)


|Stage|Description|Time (s)|
|---|---|---|
|Item Data Encoding|Encoding categorical features in the items data|4|
|Feature Engineering|Creating new features|17.4|
|Train/Validation Split|Splitting the data into training and validation sets|1|
|Users Filter (Merge Interactions)|Applying a filter on users (merging with candidates table)|13.6|
|Merging with Users’ Features|Merging with a table containing user features|2.8|
|Merging with Items’ Features|Merging with a table containing item features|17.9|
|Adding Target Variable|Adding the target variable (graded relevance derived from `ui_inter`)|17.2|
|**Total Preprocessing Time**|**Total time for all preprocessing steps**|**74.0**|
