In [30]:
%autosave 60
%reload_ext autoreload
%autoreload 2

import sys
import gc

sys.path.insert(0, "..")  # noqa: E702

Autosaving every 60 seconds


In [3]:
from catboost.utils import get_gpu_device_count

get_gpu_device_count()

1

In [None]:
import implicit.gpu
import polars as pl
import numpy as np
import implicit
from scipy.sparse import csr_matrix
from pathlib import Path
from catboost import CatBoostRanker, Pool
from datetime import timedelta
from hydra import compose, initialize
from tqdm import tqdm


implicit.gpu.HAS_CUDA
from tools.retrievers import PopularItemsRetriever, CFRetriever
from tools.experiment_tracker import ExperimentTracker

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_configs(config_path, config_name):
    with initialize(config_path=config_path, version_base=None):
        cfg = compose(config_name=config_name)

        return cfg

In [6]:
configs = load_configs("../configs", "config")

In [7]:
# experiment_tracker = ExperimentTracker(**configs.clearml, config=configs)

In [8]:
# experiment_tracker.close()

In [9]:
from pathlib import Path

DATA_DIR = Path.cwd().parent / "data" / "avito_ml_cup"
# Разделение на три части: тренировочную, валидационную и тестовую
VALID_DAYS_RETRIEVER = configs.validation.days_retriever
VALID_DAYS_RERANKER = configs.validation.days_reranker

In [10]:
df_test_users = pl.read_parquet(f"{DATA_DIR}/test_users.pq")
df_clickstream = pl.read_parquet(f"{DATA_DIR}/clickstream.pq")

df_cat_features = pl.read_parquet(f"{DATA_DIR}/cat_features.pq")
# df_text_features = pl.read_parquet(f"{DATA_DIR}/text_features.pq")
df_event = pl.read_parquet(f"{DATA_DIR}/events.pq")

In [11]:
# Определяем границы для разделения данных
max_date = df_clickstream["event_date"].max()
retrivial_threshold = max_date - timedelta(days=VALID_DAYS_RETRIEVER + VALID_DAYS_RERANKER)
reranker_threshold = max_date - timedelta(days=VALID_DAYS_RERANKER)

# Разделяем данные
df_train_retrivial = df_clickstream.filter(pl.col("event_date") < retrivial_threshold)
df_train_reranker = df_clickstream.filter(
    pl.col("event_date").is_between(retrivial_threshold, reranker_threshold)
)
df_valid = df_clickstream.filter(pl.col("event_date") > reranker_threshold)


# Создаем оценочные наборы для валидации и тестирования
def prepare_eval_set(df_eval_raw, df_train_data):
    # Выбираем только нужные колонки
    df_eval = df_eval_raw[["cookie", "node", "event"]]

    # Исключаем пары cookie-node, которые уже встречались в тренировочных данных
    df_eval = df_eval.join(df_train_data, on=["cookie", "node"], how="anti")

    # Оставляем только контактные события
    df_eval = df_eval.filter(
        pl.col("event").is_in(
            df_event.filter(pl.col("is_contact") == 1)["event"].unique()
        )
    )

    # Оставляем только существующие в тренировочных данных cookies и nodes
    df_eval = df_eval.filter(
        pl.col("cookie").is_in(df_train_data["cookie"].unique())
    ).filter(pl.col("node").is_in(df_train_data["node"].unique()))

    # Оставляем только уникальные пары cookie-node
    df_eval = df_eval.unique(["cookie", "node"])

    return df_eval


# Подготавливаем валидационный и тестовый наборы
df_valid_eval_retrivial = prepare_eval_set(df_train_reranker, df_train_retrivial)
df_valid_eval_reranker = prepare_eval_set(
    df_valid, pl.concat([df_train_retrivial, df_train_reranker])
)

df_train_retrivial = df_train_retrivial.join(df_event, on="event")

df_train_reranker = df_train_reranker.join(df_event, on="event")

df_valid = df_valid.join(df_event, on="event")

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  df_eval = df_eval.filter(
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  df_eval = df_eval.filter(
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  ).filter(pl.col("node").is_in(df_train_data["node"].unique()))


In [12]:
# Выводим информацию о размерах наборов данных
print(f"Размер тренировочного набора retrivial: {df_train_retrivial.shape}")
print(f"Размер валидационного набора reranker: {df_train_reranker.shape}")
print(f"Размер валидационного набора: {df_valid.shape}")
print(
    f"Размер валидационного набора для оценки retrivial: {df_valid_eval_retrivial.shape}"
)
print(
    f"Размер валидационного набора для оценки reranker: {df_valid_eval_reranker.shape}"
)

Размер тренировочного набора retrivial: (45631757, 8)
Размер валидационного набора reranker: (11400304, 8)
Размер валидационного набора: (11774091, 8)
Размер валидационного набора для оценки retrivial: (72271, 3)
Размер валидационного набора для оценки reranker: (74771, 3)


In [13]:
eval_users = df_valid_eval_retrivial["cookie"].unique().to_list()

In [14]:
def recall_at(df_true, df_pred, k=40):
    """Вычисляет метрику Recall@K для рекомендаций"""
    return (
        df_true[["node", "cookie"]]
        .join(
            df_pred.group_by("cookie")
            .head(k)
            .with_columns(value=1)[["node", "cookie", "value"]],
            how="left",
            on=["cookie", "node"],
        )
        .select([pl.col("value").fill_null(0), "cookie"])
        .group_by("cookie")
        .agg([pl.col("value").sum() / pl.col("value").count()])["value"]
        .mean()
    )


def evaluate_models(
    df_true, df_candidates_pred, df_reranker_pred=None, k_values=[10, 20, 30, 40]
):
    """Сравнивает модели по метрике Recall@K для разных значений K"""
    results = {}

    for k in k_values:
        results[f"CF_recall@{k}"] = recall_at(df_true, df_candidates_pred, k=k)
        if df_reranker_pred is not None:
            results[f"Reranker_recall@{k}"] = recall_at(df_true, df_reranker_pred, k=k)

    return results

# ALS

In [15]:
als_retriever = CFRetriever(implicit.als.AlternatingLeastSquares, iterations=30, factors=100, regularization=0.7, use_gpu=True, random_state=123)

In [16]:
def clean_interactions_for_als(df_clickstream):
    """
    Комплексная очистка для ALS
    """
    # 1. Агрегируем по cookie-node с приоритетом контактных событий
    cleaned = df_clickstream.group_by(["cookie", "node", "item"]).agg([
        pl.col("is_contact").max().alias("is_contact"),  # Если хоть раз был контакт
        pl.col("event_date").max().alias("last_interaction"),
        pl.count().alias("total_interactions"),
        pl.col("is_contact").sum().alias("contact_interactions")
    ]).with_columns([
        # Создаем финальный вес для ALS
        pl.when(pl.col("is_contact") == 1)
        .then(5.0 + pl.col("contact_interactions").log1p())  # Больше контактов = больше уверенности
        .otherwise(0.5 + pl.col("total_interactions").log1p() * 0.1)  # Неконтактные получают малый вес
        .alias("als_weight")
    ])
    
    return cleaned.select(["cookie", "node", "is_contact", "als_weight"])


df_clean_retrivial = clean_interactions_for_als(df_train_retrivial)

  pl.count().alias("total_interactions"),


In [17]:
users = df_clean_retrivial["cookie"]
nodes = df_clean_retrivial["node"]
weights_for_als = df_clean_retrivial["als_weight"]

del df_clean_retrivial

In [18]:
als_retriever.fit(users, nodes, is_contact_col=weights_for_als)
df_pred_alt = als_retriever.recommend(eval_users, 300)

del users, nodes, weights_for_als
# als_retriever

100%|██████████| 30/30 [00:52<00:00,  1.74s/it]


In [19]:
als_results = evaluate_models(
    df_valid_eval_retrivial, df_pred_alt, k_values=[10, 20, 30, 40]
)
print("\nРезультаты ALS модели на валидационном наборе:")
for metric, value in als_results.items():
    print(f"{metric}: {value:.4f}")


Результаты ALS модели на валидационном наборе:
CF_recall@10: 0.0666
CF_recall@20: 0.1029
CF_recall@30: 0.1312
CF_recall@40: 0.1540


# BPR

In [20]:
bpr_retriever = CFRetriever(implicit.bpr.BayesianPersonalizedRanking, iterations=50, factors=64, regularization=0.001, learning_rate=0.01, use_gpu=True, random_state=123)

In [21]:
def prepare_temporal_bpr_data(df_clickstream, train_days_back=30):
    """
    Подготавливает данные с учетом временной динамики для BPR
    """
    # Сортируем по времени
    df_sorted = df_clickstream.sort("event_date")
    
    # Берем только последние train_days_back дней для обучения
    max_date = df_sorted["event_date"].max()
    train_start = max_date - timedelta(days=train_days_back)
    
    df_recent = df_sorted.filter(pl.col("event_date") >= train_start)
    
    # Для BPR лучше убрать дубликаты пользователь-товар, оставив последнее взаимодействие
    df_deduplicated = df_recent.sort("event_date").group_by(
        ["cookie", "node", "item"]
    ).agg(
        pl.col("is_contact").max()
    )

    df_deduplicated = df_deduplicated.group_by(
        ["cookie", "node"]
    ).last()
    
    return df_deduplicated

df_temporal_bpr = prepare_temporal_bpr_data(df_train_retrivial, train_days_back=21)

# def prepare_improved_bpr_data(df_clickstream, min_interactions=3):
#     """
#     Улучшенная подготовка данных для BPR
#     """
#     # Фильтруем активных пользователей и популярные товары
#     active_users = df_clickstream.group_by("cookie").len().filter(
#         pl.col("len") >= min_interactions
#     )["cookie"]
    
#     popular_items = df_clickstream.group_by("node").len().filter(
#         pl.col("len") >= 2  # минимум 5 взаимодействий
#     )["node"]
    
#     df_filtered = df_clickstream.filter(
#         pl.col("cookie").is_in(active_users) &
#         pl.col("node").is_in(popular_items)
#     )
    
#     # Добавляем временные веса (более свежие взаимодействия важнее)
#     max_date = df_filtered["event_date"].max()
#     df_filtered = df_filtered.with_columns([
#         ((max_date - pl.col("event_date")).dt.total_days() / 30.0).alias("days_ago"),
#     ]).with_columns([
#         # Временной вес: более свежие = больше вес
#         pl.when(pl.col("is_contact") == 1)
#         .then(2.0 * (-pl.col("days_ago") * 0.1).exp())  # Контакты с временным весом
#         .otherwise(0.5 * (-pl.col("days_ago") * 0.05).exp())  # Просмотры с меньшим весом
#         .alias("temporal_weight")
#     ])
    
#     return df_filtered

# df_temporal_bpr = prepare_improved_bpr_data(df_train_retrivial)

In [22]:
users = df_temporal_bpr["cookie"]
nodes = df_temporal_bpr["node"]
contact_cols = df_temporal_bpr["is_contact"]

del df_temporal_bpr

In [23]:
bpr_retriever.fit(users, nodes, is_contact_col=contact_cols)

100%|██████████| 50/50 [00:23<00:00,  2.15it/s, train_auc=94.72%, skipped=4.23%]


In [24]:
df_pred_bpr = bpr_retriever.recommend(eval_users, 300)

del users, nodes, contact_cols
# bpr_retriever

In [25]:
bpr_results = evaluate_models(
    df_valid_eval_retrivial, df_pred_bpr, k_values=[10, 20, 30, 40]
)
print("\nРезультаты BPR модели на валидационном наборе:")
for metric, value in bpr_results.items():
    print(f"{metric}: {value:.4f}")


Результаты BPR модели на валидационном наборе:
CF_recall@10: 0.0387
CF_recall@20: 0.0582
CF_recall@30: 0.0737
CF_recall@40: 0.0858


In [26]:
# df_pred_alt = df_pred_alt.sort(["cookie", "scores"], descending=[False, True]).with_columns(
#     pl.int_range(pl.len()).over("cookie").alias("rank_als")
# )

In [27]:
# df_pred_bpr = df_pred_bpr.sort(["cookie", "scores"], descending=[False, True]).with_columns(
#     pl.int_range(pl.len()).over("cookie").alias("rank_bpr")
# )

In [28]:
cadidates_df = df_pred_alt.rename({"scores": "als_score"}).join(
    df_pred_bpr.rename({"scores": "bpr_score"}),
    on=["node", "cookie"],
    how="full",
    coalesce=True)
# ).with_columns([
#     # Добавляем флаги наличия рекомендаций
#     pl.col("als_score").is_not_null().alias("has_als_rec"),
#     pl.col("bpr_score").is_not_null().alias("has_bpr_rec"),
    
#     pl.col("als_score").fill_null(pl.col("als_score").min()),
#     pl.col("bpr_score").fill_null(pl.col("bpr_score").min())
# ]).with_columns([
#     # Считаем количество моделей, рекомендовавших товар
#     (pl.col("has_als_rec").cast(pl.Int32) + 
#      pl.col("has_bpr_rec").cast(pl.Int32)).alias("recommendation_count")
# ])

print(cadidates_df.shape)
cadidates_df.head()

(17628856, 4)


node,cookie,als_score,bpr_score
i64,i64,f64,f64
115834,0,0.844753,4.974972
214235,0,0.75322,
1923,0,0.719228,4.060912
214234,0,0.688234,
115713,0,0.676143,3.813639


In [29]:
def get_missing_scores(df_candidates, model, model_name="als"):
    """Получает скоры для кандидатов, которых нет в рекомендациях модели"""
    
    # Находим пары (user, item) без скоров
    missing_scores = df_candidates.filter(
        pl.col(f"{model_name}_score").is_null()
    )
    
    if len(missing_scores) == 0:
        return df_candidates
    
    # Получаем скоры напрямую от модели
    users = missing_scores["cookie"].unique().to_list()

    df_pred = pl.DataFrame(schema={"cookie": pl.Int64, "node": pl.Int64, f"{model_name}_score": pl.Float64})
    for user_id in tqdm(users):
        if user_id in model._user_id_to_index:
            user_idx = model._user_id_to_index[user_id]
            user_items = missing_scores.filter(pl.col("cookie") == user_id)["node"].to_list()

            item_idx = [model._item_id_to_index[i] for i in user_items]
            reccommends, scores = model.model.recommend(user_idx, model._sparse_matrix[user_idx], N=len(item_idx), filter_already_liked_items=False, items=item_idx)
            node_list = [model._index_to_item_id[i] for i in reccommends]

            df_pred = pl.concat([df_pred, 
                                pl.DataFrame({"cookie": user_id, "node": node_list, f"{model_name}_score": scores.tolist()},
                                             schema={"cookie": pl.Int64, "node": pl.Int64, f"{model_name}_score": pl.Float64})])
            
    df_candidates = df_candidates.join(
        df_pred,
        on=["cookie", "node"],
        how="left",
        suffix="_new"
    ).with_columns(
        pl.coalesce([
            pl.col(f"{model_name}_score"),
            pl.col(f"{model_name}_score_new")
        ]).alias(f"{model_name}_score")
    ).drop(f"{model_name}_score_new")
    
    return df_candidates

In [None]:
# Использование
df_candidates_with_scores = get_missing_scores(cadidates_df, als_retriever, "als")
df_candidates_with_scores = get_missing_scores(df_candidates_with_scores, bpr_retriever, "bpr")

del als_retriever, bpr_retriever, cadidates_df
gc.collect()

100%|██████████| 33463/33463 [07:29<00:00, 74.48it/s]
100%|██████████| 33758/33758 [07:34<00:00, 74.35it/s]


7

# Catboost

In [None]:
df_candidates_train = df_candidates_with_scores.join(
    df_train_reranker["node", "cookie", "is_contact"]
    .sort("is_contact", descending=True)
    .unique(["cookie", "node"]),
    on=["cookie", "node"],
    how="left",
).fill_null(0)

del df_candidates_with_scores

In [None]:
def improved_frequency_based_negative_sampling(df_candidates, df_reranker_data, negative_ratio=3):
    """
    Улучшенная версия с корректным определением позитивных примеров
    """
    # 1. ТОЛЬКО контактные события как позитивные примеры
    df_positive = df_candidates.join(
        df_reranker_data.filter(pl.col("is_contact") == 1)  # ТОЛЬКО контакты!
        [["cookie", "node"]]
        .unique(["cookie", "node"]),
        on=["cookie", "node"],
        how="inner"
    ).with_columns(pl.lit(1).alias("is_contact"))  # Явно помечаем как позитивные
    
    print(f"Позитивных примеров (только контакты): {df_positive.shape[0]}")
    print(df_positive.columns)
    
    # 2. Все остальные кандидаты - негативные
    df_negative_pool = df_candidates.join(
        df_positive[["cookie", "node"]],
        on=["cookie", "node"],
        how="anti"
    ).with_columns(pl.lit(0).alias("is_contact"))
    
    # 3. Добавляем частоты для взвешенного сэмплирования
    node_frequencies = df_reranker_data.group_by("node").len().rename({"len": "frequency"})
    df_negative_pool = df_negative_pool.join(node_frequencies, on="node", how="left").with_columns(
        pl.col("frequency").fill_null(1)
    )
    
    negative_samples = []
    
    for user in tqdm(df_positive["cookie"].unique()):
        user_positives_count = len(df_positive.filter(pl.col("cookie") == user))
        needed_negatives = user_positives_count * negative_ratio
        
        user_negative_pool = df_negative_pool.filter(pl.col("cookie") == user)
        
        if len(user_negative_pool) == 0:
            continue
            
        # Взвешенное сэмплирование
        user_negative_pool = user_negative_pool.with_columns([
            (pl.col("frequency").log1p() * 0.3 +
             pl.col("als_score").rank(descending=True) * 0.5 +
             pl.col("bpr_score").rank(descending=True) * 0.2
            ).alias("sampling_weight")
        ])
        
        if len(user_negative_pool) <= needed_negatives:
            user_negatives = user_negative_pool
        else:
            total_weight = user_negative_pool["sampling_weight"].sum()
            sampling_probs = (user_negative_pool["sampling_weight"] / total_weight).to_list()
            
            selected_indices = np.random.choice(
                len(user_negative_pool),
                size=needed_negatives,
                replace=False,
                p=sampling_probs
            )
            
            user_negatives = user_negative_pool[selected_indices]
        
        negative_samples.append(user_negatives.drop(["frequency", "sampling_weight"]))
    
    if negative_samples:
        df_negatives = pl.concat(negative_samples)
        print(df_negatives.columns)
        df_result = pl.concat([df_positive, df_negatives])
    else:
        df_result = df_positive
    
    print(f"Негативных примеров: {len(df_negatives) if negative_samples else 0}")
    print(f"Итоговый размер: {df_result.shape[0]}")
    print(f"Доля позитивных: {df_positive.shape[0] / df_result.shape[0]:.3f}")
    
    return df_result

# Используем улучшенную версию
df_candidates_train_improved = improved_frequency_based_negative_sampling(
    df_candidates_train,
    df_train_reranker,
    negative_ratio=5
)

del df_candidates_train

Позитивных примеров (только контакты): 30494
['node', 'cookie', 'als_score', 'bpr_score', 'is_contact']


100%|██████████| 19151/19151 [03:47<00:00, 84.03it/s]


['node', 'cookie', 'als_score', 'bpr_score', 'is_contact']
Негативных примеров: 152470
Итоговый размер: 182964
Доля позитивных: 0.167


In [35]:
df_candidates_train_improved

node,cookie,als_score,bpr_score,is_contact
i64,i64,f64,f64,i32
214261,1,0.312046,1.444052,1
151577,2,0.851258,3.826634,1
214377,3,0.501293,4.378954,1
194770,7,0.039922,2.252387,1
214239,16,0.01188,0.486548,1
…,…,…,…,…
261641,149998,0.013189,3.145392,0
170555,149998,0.068307,0.169966,0
152950,149998,0.018514,3.475289,0
115826,149998,0.065343,-0.71774,0


In [None]:
def prepare_features_for_boosting(
    df_retrievals,
    df_clickstream,
    df_cat_features,
    **kwargs
):
    """Подготавливает признаки для обучения бустинга на основе результатов ALS"""
    # Добавляем информацию о рейтинге ALS
    features = df_retrievals.join(
        df_cat_features.select(["node", "category"]).unique(),
        on="node",
        how="left",
    ).fill_null(-1)


    max_date = df_clickstream["event_date"].max()
    # === ПОЛЬЗОВАТЕЛЬСКИЕ ПРИЗНАКИ ===
    user_features = df_clickstream.group_by("cookie").agg([
        pl.len().alias("user_total_interactions"),
        pl.n_unique("node").alias("user_unique_nodes"),
        pl.n_unique("event").alias("user_unique_events"),
        pl.col("is_contact").sum().alias("user_total_contacts"),
        (pl.col("is_contact").sum() / pl.len()).alias("user_contact_ratio"),
        pl.col("event_date").min().alias("user_first_interaction"),
        pl.col("event_date").max().alias("user_last_interaction"),
    ]).with_columns([
        # ОТНОСИТЕЛЬНЫЕ временные признаки
        (pl.col("user_last_interaction") - pl.col("user_first_interaction")).dt.total_days().alias("user_activity_span_days"),
        (max_date - pl.col("user_last_interaction")).dt.total_days().alias("days_since_user_last_activity"),
        (max_date - pl.col("user_first_interaction")).dt.total_days().alias("user_account_age_days"),
        # Интенсивность активности
        # (pl.col("user_total_interactions") / 
        #  (pl.col("user_last_interaction") - pl.col("user_first_interaction")).dt.total_days().clip(1, None)
        # ).alias("user_activity_intensity")
    ]).drop(["user_first_interaction", "user_last_interaction"])

    features = features.join(user_features, on="cookie", how="left").fill_null(0)
    del user_features

    for days in [3, 7, 14, 30]:
        recent_threshold = max_date - timedelta(days=days)
        recent_user_activity = df_clickstream.filter(
            pl.col("event_date") > recent_threshold
        ).group_by("cookie").agg([
            pl.len().alias(f"user_interactions_last_{days}d"),
            pl.n_unique("node").alias(f"user_unique_nodes_last_{days}d"),
            pl.col("is_contact").sum().alias(f"user_contacts_last_{days}d"),
        ])
        features = features.join(recent_user_activity, on="cookie", how="left").fill_null(0)
        del recent_user_activity

    item_features = df_clickstream.group_by("node").agg([
        pl.len().alias("item_total_interactions"),
        pl.n_unique("cookie").alias("item_unique_users"),
        pl.col("is_contact").sum().alias("item_total_contacts"),
        (pl.col("is_contact").sum() / pl.len()).alias("item_contact_ratio"),
        pl.col("event_date").min().alias("item_first_seen"),
        pl.col("event_date").max().alias("item_last_seen"),
    ]).with_columns([
        # ОТНОСИТЕЛЬНЫЕ временные признаки
        (max_date - pl.col("item_first_seen")).dt.total_days().alias("item_age_days"),
        (max_date - pl.col("item_last_seen")).dt.total_days().alias("days_since_item_last_interaction"),
        # Активный период товара
        # (pl.col("item_last_seen") - pl.col("item_first_seen")).dt.total_days().alias("item_active_period_days")
    ]).drop(["item_first_seen", "item_last_seen"])  # Убираем абсолютные даты

    features = features.join(item_features, on="node", how="left").fill_null(0)
    del item_features

    category_features = df_clickstream.join(
        df_cat_features.select(["node", "category"]).unique(), on="node", how="left"
    ).group_by("category").agg([
        pl.len().alias("category_total_interactions"),
        pl.n_unique("node").alias("category_unique_items"),
        pl.n_unique("cookie").alias("category_unique_users"),
        pl.col("is_contact").sum().alias("category_total_contacts"),
        (pl.col("is_contact").sum() / pl.len()).alias("category_contact_ratio"),
    ])

    features = features.join(category_features, on=["category"], how="left").fill_null(-1)
    del category_features

    # === КРОСС-ПРИЗНАКИ ===
    features = features.with_columns([
        # Взаимодействие популярности товара и активности пользователя
        (pl.col("item_total_interactions").log1p() * pl.col("user_total_interactions").log1p()).alias("item_user_interaction_product"),
        # Соответствие предпочтений пользователя и товара
        (pl.col("user_contact_ratio") * pl.col("item_contact_ratio")).alias("user_item_contact_affinity"),
        # Новизна товара (бинарный признак)
        pl.when(pl.col("item_age_days") < 7).then(1).otherwise(0).alias("is_new_item"),
        # Активный ли пользователь в последнее время
        pl.when(pl.col("days_since_user_last_activity") < 3).then(1).otherwise(0).alias("is_recently_active_user"),
        # Популярный ли товар в категории
        (pl.col("item_total_interactions") / pl.col("category_total_interactions")).alias("item_category_popularity_ratio"),
        # Специализированность пользователя в категории
        # (pl.col("user_cat_interactions") / pl.col("user_total_interactions")).alias("user_category_specialization")
    ])

    return features

# Готовим данные для CatBoost
train_data_for_boosting = prepare_features_for_boosting(
    df_candidates_train_improved,
    df_train_retrivial,
    df_cat_features,
    # popular_count_week=popular_count_week,
    # popular_count_month=popular_count_month,
    # trending_count_week=trending_count_week,
)


del df_candidates_train_improved
print(train_data_for_boosting.shape)
train_data_for_boosting

(182964, 42)


node,cookie,als_score,bpr_score,is_contact,category,user_total_interactions,user_unique_nodes,user_unique_events,user_total_contacts,user_contact_ratio,user_activity_span_days,days_since_user_last_activity,user_account_age_days,user_interactions_last_3d,user_unique_nodes_last_3d,user_contacts_last_3d,user_interactions_last_7d,user_unique_nodes_last_7d,user_contacts_last_7d,user_interactions_last_14d,user_unique_nodes_last_14d,user_contacts_last_14d,user_interactions_last_30d,user_unique_nodes_last_30d,user_contacts_last_30d,item_total_interactions,item_unique_users,item_total_contacts,item_contact_ratio,item_age_days,days_since_item_last_interaction,category_total_interactions,category_unique_items,category_unique_users,category_total_contacts,category_contact_ratio,item_user_interaction_product,user_item_contact_affinity,is_new_item,is_recently_active_user,item_category_popularity_ratio
i64,i64,f64,f64,i32,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i32,i32,f64
214261,1,0.312046,1.444052,1,37,3266,911,10,53,0.016228,29,0,29,353,206,11,691,331,16,1243,513,24,3266,911,53,15495,4567,1734,0.111907,29,0,1873464,258,81384,120806,0.064483,78.07075,0.001816,0,1,0.008271
151577,2,0.851258,3.826634,1,40,903,159,8,26,0.028793,28,0,29,162,32,13,324,54,13,512,82,22,903,159,26,44427,7737,329,0.007405,29,0,5897889,2061,58039,70364,0.01193,72.844136,0.000213,0,1,0.007533
214377,3,0.501293,4.378954,1,37,537,65,2,23,0.042831,27,1,28,17,11,0,31,14,0,430,35,19,537,65,23,29913,3722,2199,0.073513,29,0,1873464,258,81384,120806,0.064483,64.803185,0.003149,0,1,0.015967
194770,7,0.039922,2.252387,1,57,96,4,4,2,0.020833,20,0,20,75,4,2,94,4,2,95,4,2,96,4,2,4652,2047,243,0.052236,29,0,2329176,8186,41469,152207,0.065348,38.634658,0.001088,0,1,0.001997
214239,16,0.01188,0.486548,1,37,3,2,1,0,0.0,2,2,5,2,1,0,3,2,0,3,2,0,3,2,0,110281,20389,4147,0.037604,29,0,1873464,258,81384,120806,0.064483,16.095981,0.0,0,1,0.058865
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
261641,149998,0.013189,3.145392,0,51,308,62,5,4,0.012987,27,1,29,5,5,0,20,9,1,110,25,2,308,62,4,435,264,11,0.025287,29,0,3946889,113371,52231,126682,0.032097,34.845197,0.000328,0,1,0.00011
170555,149998,0.068307,0.169966,0,52,308,62,5,4,0.012987,27,1,29,5,5,0,20,9,1,110,25,2,308,62,4,5751,2520,52,0.009042,29,0,1267216,2853,38770,30082,0.023739,49.635272,0.000117,0,1,0.004538
152950,149998,0.018514,3.475289,0,40,308,62,5,4,0.012987,27,1,29,5,5,0,20,9,1,110,25,2,308,62,4,269,159,3,0.011152,29,0,5897889,2061,58039,70364,0.01193,32.097664,0.000145,0,1,0.000046
115826,149998,0.065343,-0.71774,0,19,308,62,5,4,0.012987,27,1,29,5,5,0,20,9,1,110,25,2,308,62,4,25734,4581,448,0.017409,29,0,2173922,4150,54731,50632,0.023291,58.225562,0.000226,0,1,0.011838


In [37]:
train_data_for_boosting.columns

['node',
 'cookie',
 'als_score',
 'bpr_score',
 'is_contact',
 'category',
 'user_total_interactions',
 'user_unique_nodes',
 'user_unique_events',
 'user_total_contacts',
 'user_contact_ratio',
 'user_activity_span_days',
 'days_since_user_last_activity',
 'user_account_age_days',
 'user_interactions_last_3d',
 'user_unique_nodes_last_3d',
 'user_contacts_last_3d',
 'user_interactions_last_7d',
 'user_unique_nodes_last_7d',
 'user_contacts_last_7d',
 'user_interactions_last_14d',
 'user_unique_nodes_last_14d',
 'user_contacts_last_14d',
 'user_interactions_last_30d',
 'user_unique_nodes_last_30d',
 'user_contacts_last_30d',
 'item_total_interactions',
 'item_unique_users',
 'item_total_contacts',
 'item_contact_ratio',
 'item_age_days',
 'days_since_item_last_interaction',
 'category_total_interactions',
 'category_unique_items',
 'category_unique_users',
 'category_total_contacts',
 'category_contact_ratio',
 'item_user_interaction_product',
 'user_item_contact_affinity',
 'is_new_i

In [38]:
feature_cols = [
    'node',
    'als_score',
    'bpr_score',
    'category',
    'user_total_interactions',
    'user_unique_nodes',
    'user_unique_events',
    'user_total_contacts',
    'user_contact_ratio',
    'user_activity_span_days',
    'days_since_user_last_activity',
    'user_account_age_days',
    'user_interactions_last_3d',
    'user_unique_nodes_last_3d',
    'user_contacts_last_3d',
    'user_interactions_last_7d',
    'user_unique_nodes_last_7d',
    'user_contacts_last_7d',
    'user_interactions_last_14d',
    'user_unique_nodes_last_14d',
    'user_contacts_last_14d',
    'user_interactions_last_30d',
    'user_unique_nodes_last_30d',
    'user_contacts_last_30d',
    'item_total_interactions',
    'item_unique_users',
    'item_total_contacts',
    'item_contact_ratio',
    'item_age_days',
    'days_since_item_last_interaction',
    'category_total_interactions',
    'category_unique_items',
    'category_unique_users',
    'category_total_contacts',
    'category_contact_ratio',
    'item_user_interaction_product',
    'user_item_contact_affinity',
    'is_new_item',
    'is_recently_active_user',
    'item_category_popularity_ratio'
]

cat_cols = ["category"]

target_col = "is_contact"

In [39]:
model = CatBoostRanker(
    boosting_type="Plain",
    iterations=300,
    learning_rate=0.05,
    depth=8,
    objective="YetiRankPairwise:top=40,permutations=10",
    bagging_temperature=0.8,
    bootstrap_type="Bayesian",
    random_seed=42,
    eval_metric="RecallAt:top=40",
    verbose=10,
    nan_mode="Min",
    task_type="GPU"
)

In [40]:
train_data_boosting = train_data_for_boosting.sample(fraction=0.8, seed=123).sort("cookie")

valid_data_boosting = train_data_for_boosting.join(train_data_boosting, on=train_data_boosting.columns, how="anti").sort("cookie")

In [41]:
train_data_boosting = Pool(train_data_boosting.select(feature_cols).to_pandas(), train_data_boosting.select(target_col).to_pandas(), cat_features=cat_cols, group_id=train_data_boosting["cookie"].to_list())

valid_data_boosting = Pool(valid_data_boosting.select(feature_cols).to_pandas(), valid_data_boosting.select(target_col).to_pandas(), cat_features=cat_cols, group_id=valid_data_boosting["cookie"].to_list())

In [42]:
model.fit(
    train_data_boosting,
    eval_set=valid_data_boosting
)

Groupwise loss function. OneHotMaxSize set to 10


Default metric period is 5 because PFound, RecallAt is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=40 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.9995886	test: 1.0000000	best: 1.0000000 (0)	total: 1.51s	remaining: 7m 32s
10:	learn: 0.9997752	test: 1.0000000	best: 1.0000000 (0)	total: 16.1s	remaining: 7m 2s
20:	learn: 0.9998176	test: 1.0000000	best: 1.0000000 (0)	total: 30.2s	remaining: 6m 41s
30:	learn: 0.9998388	test: 1.0000000	best: 1.0000000 (0)	total: 44.4s	remaining: 6m 24s
40:	learn: 0.9998388	test: 1.0000000	best: 1.0000000 (0)	total: 58.5s	remaining: 6m 9s
50:	learn: 0.9998486	test: 1.0000000	best: 1.0000000 (0)	total: 1m 12s	remaining: 5m 54s
60:	learn: 0.9998511	test: 1.0000000	best: 1.0000000 (0)	total: 1m 26s	remaining: 5m 40s
70:	learn: 0.9998511	test: 1.0000000	best: 1.0000000 (0)	total: 1m 40s	remaining: 5m 25s
80:	learn: 0.9998536	test: 1.0000000	best: 1.0000000 (0)	total: 1m 55s	remaining: 5m 10s
90:	learn: 0.9998594	test: 1.0000000	best: 1.0000000 (0)	total: 2m 9s	remaining: 4m 56s
100:	learn: 0.9998626	test: 1.0000000	best: 1.0000000 (0)	total: 2m 23s	remaining: 4m 42s
110:	learn: 0.9998626	test: 1

<catboost.core.CatBoostRanker at 0x7f5a52f78590>

In [43]:
model = CatBoostRanker(
    boosting_type="Plain",
    iterations=250,
    learning_rate=0.05,
    depth=8,
    objective="YetiRankPairwise:top=40,permutations=10",
    bagging_temperature=0.8,
    bootstrap_type="Bayesian",
    random_seed=42,
    eval_metric="RecallAt:top=40",
    verbose=10,
    nan_mode="Min",
    task_type="GPU"
)

In [44]:
train_data_for_boosting = train_data_for_boosting.sort("cookie")
train_data_boosting = Pool(train_data_for_boosting.select(feature_cols).to_pandas(), train_data_for_boosting.select(target_col).to_pandas(), cat_features=cat_cols, group_id=train_data_for_boosting["cookie"].to_list())

In [45]:
model.fit(
    train_data_boosting
)

Groupwise loss function. OneHotMaxSize set to 10


Default metric period is 5 because PFound, RecallAt is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=40 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.9990336	total: 1.49s	remaining: 6m 10s
10:	learn: 0.9994179	total: 16.4s	remaining: 5m 56s
20:	learn: 0.9995694	total: 31.5s	remaining: 5m 43s
30:	learn: 0.9995912	total: 46.7s	remaining: 5m 29s
40:	learn: 0.9996288	total: 1m 1s	remaining: 5m 12s
50:	learn: 0.9996258	total: 1m 16s	remaining: 4m 57s
60:	learn: 0.9996617	total: 1m 31s	remaining: 4m 42s
70:	learn: 0.9996617	total: 1m 45s	remaining: 4m 26s
80:	learn: 0.9996670	total: 1m 59s	remaining: 4m 10s
90:	learn: 0.9996674	total: 2m 14s	remaining: 3m 55s
100:	learn: 0.9996730	total: 2m 29s	remaining: 3m 40s
110:	learn: 0.9996946	total: 2m 44s	remaining: 3m 26s
120:	learn: 0.9996943	total: 2m 59s	remaining: 3m 11s
130:	learn: 0.9996966	total: 3m 14s	remaining: 2m 56s
140:	learn: 0.9997149	total: 3m 28s	remaining: 2m 40s
150:	learn: 0.9997206	total: 3m 42s	remaining: 2m 25s
160:	learn: 0.9997192	total: 3m 56s	remaining: 2m 10s
170:	learn: 0.9997354	total: 4m 10s	remaining: 1m 55s
180:	learn: 0.9997302	total: 4m 25s	remainin

<catboost.core.CatBoostRanker at 0x7f5a84f5a3d0>

In [49]:
del df_clickstream

# Оценка на тесте

In [46]:
df_train_retriever_and_reranker = pl.concat([df_train_retrivial, df_train_reranker])

if not df_valid_eval_reranker.is_empty():
    test_users = df_valid_eval_reranker["cookie"].unique().to_list()
else:
    test_users = df_test_users["cookie"].unique().to_list()

In [50]:
df_clean_retrivial_and_reranker = clean_interactions_for_als(df_train_retriever_and_reranker)
full_train_users = df_clean_retrivial_and_reranker["cookie"]
full_train_nodes = df_clean_retrivial_and_reranker["node"]
full_weight_for_als = df_clean_retrivial_and_reranker["als_weight"]

del df_clean_retrivial_and_reranker

  pl.count().alias("total_interactions"),


In [51]:
als_retriever = CFRetriever(implicit.als.AlternatingLeastSquares, iterations=30, factors=100, regularization=0.7, use_gpu=True, random_state=123)

als_retriever.fit(full_train_users, full_train_nodes, is_contact_col=full_weight_for_als)

test_als_pred = als_retriever.recommend(test_users, 300)

del full_train_users, full_train_nodes, full_weight_for_als

100%|██████████| 30/30 [01:03<00:00,  2.11s/it]


In [52]:
df_temporal_bpr_retriever_and_reranker = prepare_temporal_bpr_data(df_train_retriever_and_reranker, train_days_back=21)

full_train_users = df_temporal_bpr_retriever_and_reranker["cookie"]
full_train_nodes = df_temporal_bpr_retriever_and_reranker["node"]
full_contact_cols = df_temporal_bpr_retriever_and_reranker["is_contact"]

del df_temporal_bpr_retriever_and_reranker

In [53]:
bpr_retriever = CFRetriever(implicit.bpr.BayesianPersonalizedRanking, iterations=50, factors=64, regularization=0.001, learning_rate=0.01, use_gpu=True, random_state=123)

bpr_retriever.fit(full_train_users, full_train_nodes, is_contact_col=full_contact_cols)

test_bpr_pred = bpr_retriever.recommend(test_users, 300)

del full_train_users, full_train_nodes, full_contact_cols

100%|██████████| 50/50 [00:24<00:00,  2.06it/s, train_auc=94.76%, skipped=4.17%]


In [54]:
cadidates_df_full = test_als_pred.rename({"scores": "als_score"}).join(
    test_bpr_pred.rename({"scores": "bpr_score"}),
    on=["node", "cookie"],
    how="full",
    coalesce=True
)
# ).with_columns([
#     # Добавляем флаги наличия рекомендаций
#     pl.col("als_score").is_not_null().alias("has_als_rec"),
#     pl.col("bpr_score").is_not_null().alias("has_bpr_rec"),
    
#     # Заполляем отсутствующие значения
#     pl.col("als_score").fill_null(pl.col("als_score").min()),
#     pl.col("bpr_score").fill_null(pl.col("bpr_score").min())
# ]).with_columns([
#     # Считаем количество моделей, рекомендовавших товар
#     (pl.col("has_als_rec").cast(pl.Int32) + 
#      pl.col("has_bpr_rec").cast(pl.Int32)).alias("recommendation_count")
# ])

In [55]:
# Использование
df_candidates_with_scores_full = get_missing_scores(cadidates_df_full, als_retriever, "als")
df_candidates_with_scores_full = get_missing_scores(df_candidates_with_scores_full, bpr_retriever, "bpr")

del als_retriever, bpr_retriever
gc.collect()

100%|██████████| 34644/34644 [08:28<00:00, 68.13it/s]
100%|██████████| 35137/35137 [08:20<00:00, 70.20it/s]


0

In [56]:
test_data_for_boosting = prepare_features_for_boosting(
    df_candidates_with_scores_full,
    df_train_retriever_and_reranker,
    df_cat_features,
    # popular_count_week=popular_count_week,
    # popular_count_month=popular_count_month,
    # trending_count_week=trending_count_week,
).sort("cookie")

test_data_for_boosting.head()

node,cookie,als_score,bpr_score,category,user_total_interactions,user_unique_nodes,user_unique_events,user_total_contacts,user_contact_ratio,user_activity_span_days,days_since_user_last_activity,user_account_age_days,user_interactions_last_3d,user_unique_nodes_last_3d,user_contacts_last_3d,user_interactions_last_7d,user_unique_nodes_last_7d,user_contacts_last_7d,user_interactions_last_14d,user_unique_nodes_last_14d,user_contacts_last_14d,user_interactions_last_30d,user_unique_nodes_last_30d,user_contacts_last_30d,item_total_interactions,item_unique_users,item_total_contacts,item_contact_ratio,item_age_days,days_since_item_last_interaction,category_total_interactions,category_unique_items,category_unique_users,category_total_contacts,category_contact_ratio,item_user_interaction_product,user_item_contact_affinity,is_new_item,is_recently_active_user,item_category_popularity_ratio
i64,i64,f64,f64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i32,i32,f64
115834,0,0.860947,5.187847,19,258,83,10,16,0.062016,33,2,35,14,8,1,33,16,1,140,55,11,244,80,16,45120,7435,994,0.02203,36,0,2715706,4319,59853,64080,0.023596,59.553099,0.001366,0,1,0.016614
214235,0,0.748261,1.527505,37,258,83,10,16,0.062016,33,2,35,14,8,1,33,16,1,140,55,11,244,80,16,31742,10194,887,0.027944,36,0,2337661,258,87615,151546,0.064828,57.598898,0.001733,0,1,0.013579
115713,0,0.733352,3.348838,19,258,83,10,16,0.062016,33,2,35,14,8,1,33,16,1,140,55,11,244,80,16,42843,6326,1761,0.041104,36,0,2715706,4319,59853,64080,0.023596,59.265354,0.002549,0,1,0.015776
214234,0,0.72971,1.973447,37,258,83,10,16,0.062016,33,2,35,14,8,1,33,16,1,140,55,11,244,80,16,51690,17314,974,0.018843,36,0,2337661,258,87615,151546,0.064828,60.308471,0.001169,0,1,0.022112
1923,0,0.665244,3.484149,61,258,83,10,16,0.062016,33,2,35,14,8,1,33,16,1,140,55,11,244,80,16,108941,10782,4094,0.03758,36,0,316015,3,16557,13146,0.041599,64.451264,0.002331,0,1,0.344734


In [57]:
test_data = Pool(test_data_for_boosting.select(feature_cols).to_pandas(), cat_features=cat_cols, group_id=test_data_for_boosting["cookie"].to_list())

In [58]:
df_pred_catboost = (
    pl.DataFrame(
        {
            "node": test_data_for_boosting["node"],
            "cookie": test_data_for_boosting["cookie"],
            "catboost_score": model.predict(test_data),
        }
    )
    .sort(by=["cookie", "catboost_score"], descending=[False, True])
    .group_by("cookie")
    .head(200)
)

In [None]:
# df_pred_catboost[["cookie", "node"]].write_csv("../data/predict/baseline_als_bpr_catboost_v1.1.csv")

In [59]:
test_als_results = evaluate_models(
    df_valid_eval_reranker, test_als_pred, df_pred_catboost, k_values=[10, 20, 30, 40, 50, 60, 100, 150, 200]
)
print("\nРезультаты модели на тестовом наборе:")
for metric, value in test_als_results.items():
    print(f"{metric}: {value:.4f}")


Результаты модели на тестовом наборе:
CF_recall@10: 0.0683
Reranker_recall@10: 0.0689
CF_recall@20: 0.1036
Reranker_recall@20: 0.1100
CF_recall@30: 0.1306
Reranker_recall@30: 0.1420
CF_recall@40: 0.1522
Reranker_recall@40: 0.1677
CF_recall@50: 0.1711
Reranker_recall@50: 0.1906
CF_recall@60: 0.1882
Reranker_recall@60: 0.2093
CF_recall@100: 0.2417
Reranker_recall@100: 0.2697
CF_recall@150: 0.2872
Reranker_recall@150: 0.3193
CF_recall@200: 0.3243
Reranker_recall@200: 0.3541


In [None]:
test_als_results = evaluate_models(
    df_valid_eval_reranker, test_als_pred, df_pred_catboost, k_values=[10, 20, 30, 40, 50, 60, 100, 150, 200]
)
print("\nРезультаты модели на тестовом наборе:")
for metric, value in test_als_results.items():
    print(f"{metric}: {value:.4f}")


Результаты модели на тестовом наборе:
CF_recall@10: 0.0372
Reranker_recall@10: 0.0646
CF_recall@20: 0.0562
Reranker_recall@20: 0.1033
CF_recall@30: 0.0709
Reranker_recall@30: 0.1325
CF_recall@40: 0.0831
Reranker_recall@40: 0.1579
CF_recall@50: 0.0940
Reranker_recall@50: 0.1781
CF_recall@60: 0.1031
Reranker_recall@60: 0.1970
CF_recall@100: 0.1324
Reranker_recall@100: 0.2529
CF_recall@150: 0.1597
Reranker_recall@150: 0.2988
CF_recall@200: 0.1805
Reranker_recall@200: 0.3284
