# 04c Adhoc ranker training


In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and code edits do not need a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging

# Configure logging for this session: DEBUG level, records written to the
# file named below.
file_log = "train_ranker.log"
root_logger = setup_logging(
    log_file=file_log,
    level=logging.DEBUG,
    remove_existing=True,
)

In [None]:
from src.ranker import RankerTrainValidPipeline, RankerConfig
from src.config import generate_config_evaluator_pipeline

# from src.eval import RankerEvaluatorPipeline

# Model training


In [None]:
# Start from the project's default ranker configuration; its `subsample` and
# `seed` attributes are reused further down when loading data.
default_ranker_pipeline_config = RankerConfig.get_default_config()

In [None]:
# Instantiate the train/valid pipeline with the default configuration.
ranker_train_valid_pipeline = RankerTrainValidPipeline(config=default_ranker_pipeline_config)

In [None]:
# Run the pipeline's setup step before training.
# NOTE(review): presumably this loads data and builds the ranker — confirm in
# RankerTrainValidPipeline.setup.
ranker_train_valid_pipeline.setup()

In [None]:
metics, feature_importance, run_id = ranker_train_valid_pipeline.run()

In [None]:
print(run_id)
print(metics)
display(feature_importance)
# path_dir_model = ranker_train_valid_pipeline._get_path_to_dir()

# Baseline and ideal case


In [None]:
# Baseline: recommend the previous week's most popular items to every customer
from src.feature_extraction import load_optimized_raw_data
from src.metrics import (
    mean_average_precision_at_k,
    get_mapping_from_labels,
    mean_average_precision_at_k_hierarchical,
    ideal_mean_average_precision_at_k,
)
from src.utils.popularity import calculate_weekly_popular_items, calculate_rolling_popular_items
from src.input_preprocessing import get_path_to_lightgbm_data, LightGBMDataResult

In [None]:
# Load the train, valid and test transaction splits with the pipeline's
# subsample/seed settings and stack them (in that order) into one frame with a
# fresh index. No per-split frames are kept around afterwards.
transactions = pd.concat(
    [
        load_optimized_raw_data(
            data_type="transactions",
            sample=split_name,
            subsample=default_ranker_pipeline_config.subsample,
            seed=default_ranker_pipeline_config.seed,
        )
        for split_name in ("train", "valid", "test")
    ],
    ignore_index=True,
)

In [None]:
# Load the validation mapping that is passed as the first (ground-truth)
# argument to the MAP@K functions below.
# NOTE(review): presumably maps each customer to the articles they bought in
# the validation week — confirm against load_optimized_raw_data.
valid_mapping = load_optimized_raw_data(
    data_type="candidates_to_articles_mapping",
    sample="valid",
    subsample=default_ranker_pipeline_config.subsample,
    seed=default_ranker_pipeline_config.seed,
)

In [None]:
# Resolve where the validation inference dataset is stored for the pipeline's
# subsample/seed, then load it.
valid_inference_path = get_path_to_lightgbm_data(
    sample="valid",
    use_type="inference",
    subsample=default_ranker_pipeline_config.subsample,
    seed=default_ranker_pipeline_config.seed,
)
valid_inference_data = LightGBMDataResult.load(valid_inference_path)

In [None]:
# Popularity baseline for the validation week: the same list of articles is
# predicted for every key of valid_mapping.
week_num_valid = 103

# Restrict the transactions to the single week right before the validation week.
previous_week_transactions = transactions.query(
    "week_num >= @week_num_valid - 1 and week_num < @week_num_valid"
)
# Earlier variant, kept for reference:
# heuristic_pred = calculate_weekly_popular_items(
#     transactions.query("week_num == @week_num_valid - 1"), 12, "week_num", "article_id"
# ).article_id.to_list()
rolling_popular_items = calculate_rolling_popular_items(
    previous_week_transactions,
    1,
    12,
    "week_num",
    "article_id",
)
heuristic_pred = rolling_popular_items.query("week_num == @week_num_valid - 1")["article_id"].to_list()

# All keys share the very same list object as their prediction.
heuristic_pred_mapping = dict.fromkeys(valid_mapping.keys(), heuristic_pred)

# MAP@12 of the baseline (displayed as the cell result).
mean_average_precision_at_k(valid_mapping, heuristic_pred_mapping, k=12)

In [None]:
# Ideal case
# Out of all candidates, if we could rank the correct ones first, what is the MAP@K?

# Preview the inference frame. display() is required here: a bare expression in
# the middle of a cell is evaluated but its value is never shown.
display(valid_inference_data.data.head())

# Evaluate valid: rank the validation candidates with the trained ranker
valid_inference_pred = ranker_train_valid_pipeline.ranker.predict_ranks(valid_inference_data)

# Double check the results above: achieved MAP@12 next to the value returned by
# ideal_mean_average_precision_at_k for the same predictions
print("MAP@K", mean_average_precision_at_k(valid_mapping, valid_inference_pred, k=12))
print("MAP@K ideal", ideal_mean_average_precision_at_k(valid_mapping, valid_inference_pred, k=12))

# === Scratch work below: still to be understood and cleaned up ===


# Debug - MAPK


In [None]:
from src.input_preprocessing import get_path_to_lightgbm_data, LightGBMDataResult
from src.feature_extraction import load_optimized_raw_data
from src.metrics import mean_average_precision_at_k, get_mapping_from_labels, mean_average_precision_at_k_hierarchical

In [None]:
# Resolve the LightGBM dataset paths using the pipeline configuration's
# subsample/seed (instead of hard-coded values), so the data inspected here is
# the same data selected by the configuration the ranker was built with.
path_train = get_path_to_lightgbm_data(
    sample="train",
    use_type="train",
    subsample=default_ranker_pipeline_config.subsample,
    seed=default_ranker_pipeline_config.seed,
)
path_valid_train = get_path_to_lightgbm_data(
    sample="valid",
    use_type="train",
    subsample=default_ranker_pipeline_config.subsample,
    seed=default_ranker_pipeline_config.seed,
)
path_valid_inference = get_path_to_lightgbm_data(
    sample="valid",
    use_type="inference",
    subsample=default_ranker_pipeline_config.subsample,
    seed=default_ranker_pipeline_config.seed,
)

In [None]:
# Load the three stored datasets from the paths resolved above.
# Note: this rebinds `valid_inference_data`, which was already loaded earlier
# in the notebook.
train_data = LightGBMDataResult.load(path_train)
valid_train_data = LightGBMDataResult.load(path_valid_train)
valid_inference_data = LightGBMDataResult.load(path_valid_inference)

In [None]:
# Evaluate train: score the training dataset with the trained ranker.
# The result is attached to the training frame as a "score" column below.
train_scores = ranker_train_valid_pipeline.ranker.predict_scores(train_data)

In [None]:
# Sanity check: shape of the returned scores.
train_scores.shape

In [None]:
# Work on a copy of the training frame with the ranker scores attached as a
# "score" column, and preview the first rows.
train_df = train_data.data.assign(score=train_scores)
train_df.head()

In [None]:
# Order all rows from highest to lowest score, so that collecting articles per
# customer afterwards yields them in ranking order.
# Get pred ranking for the last week
# train_df = train_df.query("week_num == 102")
train_df = train_df.sort_values("score", ascending=False)
# train_preds = train_df.groupby("customer_id")["article_id"].apply(list)

In [None]:
# Build a nested mapping week_num -> {customer_id -> [article_id, ...]}.
# Weeks are visited in order of first appearance in train_df; within each week
# the article lists keep the current row order of train_df.
train_preds = {
    week: (
        train_df[train_df["week_num"] == week]
        .groupby("customer_id")["article_id"]
        .apply(list)
        .to_dict()
    )
    for week in train_df["week_num"].unique()
}

In [None]:
# Number of weeks with predictions, and which weeks they are.
len(train_preds), list(train_preds.keys())

In [None]:
# Build the ground-truth mapping from the "label" column of the scored frame.
# It is indexed by week number in the cells below (e.g. train_mapping[101]).
train_mapping = get_mapping_from_labels(train_df, "label")

In [None]:
# Number of weeks in the ground-truth mapping, and which weeks they are; this
# should line up with the same summary of train_preds shown earlier.
len(train_mapping), list(train_mapping.keys())

In [None]:
# Compare the number of entries in the labels and in the predictions for
# week 101 (train_preds entries are keyed by customer_id).
len(train_mapping[101]), len(train_preds[101])

In [None]:
# MAP@12 computed from the week-keyed label and prediction mappings.
tmp = mean_average_precision_at_k_hierarchical(train_mapping, train_preds, k=12)

In [None]:
# Show the value currently bound to `tmp`.
print(tmp)

In [None]:
# MAP@12 restricted to week 102. This rebinds `tmp` and, being an assignment,
# does not display the value.
tmp = mean_average_precision_at_k(train_mapping[102], train_preds[102], k=12)

In [None]:
# Print, for the first ten keys of the week-102 labels, the key itself, its
# ground truth and its predicted articles, followed by a separator line.
week_102_mapping = train_mapping[102]
for customer_id in list(week_102_mapping.keys())[:10]:
    print(customer_id)
    print(week_102_mapping[customer_id])
    print(train_preds[102][customer_id])
    print("-" * 100)

In [None]:
# Show the first 20 rows ordered by customer, then week, then descending score,
# to eyeball how labels are distributed along each ranking.
train_df.loc[:, ["customer_id", "week_num", "score", "label"]].sort_values(
    by=["customer_id", "week_num", "score"],
    ascending=[True, True, False],
).head(n=20)

## Valid train


In [None]:
# Score the valid dataset stored with use_type="train" and keep a copy of its
# frame with the scores attached as a "score" column; preview the first rows.
valid_train_scores = ranker_train_valid_pipeline.ranker.predict_scores(valid_train_data)
valid_train_df = valid_train_data.data.assign(score=valid_train_scores)
valid_train_df.head()

In [None]:
# Order candidates from best to worst score, then collect each customer's
# articles in that order. The result is converted to a plain dict so that it
# has the same {customer_id: [article_id, ...]} shape as the per-week entries
# of train_preds, rather than being left as a pandas Series.
valid_train_df.sort_values("score", ascending=False, inplace=True)
valid_train_preds = valid_train_df.groupby("customer_id")["article_id"].apply(list).to_dict()

In [None]:
# Build the ground-truth mapping from the "label" column of the scored
# validation frame.
valid_train_mapping = get_mapping_from_labels(valid_train_df, "label")

In [None]:
# MAP@12 on the validation data.
# NOTE(review): for the training data the result of get_mapping_from_labels is
# indexed by week first (train_mapping[102]), whereas here it is passed
# directly next to per-customer predictions — confirm both have the same shape.
tmp1 = mean_average_precision_at_k(valid_train_mapping, valid_train_preds, k=12)

In [None]:
# Show the validation MAP@12 computed above.
print(tmp1)

In [None]:
# Make sure we are looking at the right data: compare the shapes of the two
# validation frames (use_type="train" vs. use_type="inference").
valid_train_data.data.shape, valid_inference_data.data.shape

In [None]:
# Sum of the labels in the validation training data.
# NOTE(review): equals the number of positives if labels are 0/1 — confirm.
valid_train_data.label.sum()

In [None]:
# Outer-join the two validation frames on (customer_id, article_id); the
# indicator adds a `_merge` column telling which side(s) each pair came from.
comp = pd.merge(
    valid_train_data.data,
    valid_inference_data.data,
    how="outer",
    on=["customer_id", "article_id"],
    indicator=True,
)

In [None]:
# Count the pairs by origin: left_only, right_only or both.
comp._merge.value_counts()

In [None]:
# Pairs that exist only on the left side (the use_type="train" frame) are
# expected to all be positives, i.e. their mean label should be 1.
# NOTE(review): if both frames carry a `label` column, the merge suffixes them
# (label_x / label_y) and `.label` will not resolve — confirm.
comp.query("_merge == 'left_only'").label.mean()

In [None]:
# Show the first 20 validation rows ordered by customer, then week, then
# descending score, to eyeball where the positive labels land in each ranking.
valid_train_df.loc[:, ["customer_id", "week_num", "score", "label"]].sort_values(
    by=["customer_id", "week_num", "score"],
    ascending=[True, True, False],
).head(n=20)

# [Archived] Evaluation


In [None]:
# config_evaluator_pipeline = generate_config_evaluator_pipeline(
#     ranker_path=path_dir_model,
#     sample="valid",
#     subsample=0.05,
#     seed=42,
# )

In [None]:
# ranker_evaluator_pipeline = RankerEvaluatorPipeline(config_evaluator_pipeline)

In [None]:
# ranker_evaluator_pipeline.run()