# 03 Ranker data preprocessing

Combine the generated candidates with article and feature data to build the input for the ranker.


In [None]:
# IPython autoreload in mode 2: re-import edited modules before each cell
# executes, so changes under `src` are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import logging
import pandas as pd

# Add the parent directory to the import path so the `src` package imported
# below can be resolved (assumes the notebook runs from a subdirectory of the
# project root — TODO confirm).
sys.path.append("..")

from src.utils.core_utils import setup_logging

In [None]:
# Configure logging through the project helper: DEBUG level, with a log file
# dedicated to this notebook.
file_log = "ranker_data_preprocessing.log"
root_logger = setup_logging(
    log_file=file_log,
    level=logging.DEBUG,
    remove_existing=True,
)

In [None]:
from src.input_preprocessing import LightGBMDataProcessorConfig, LightGBMDataPipeline

# Preprocess


In [None]:
# One default processor config per data split, created in train/valid/test order.
(
    data_processor_config_train,
    data_processor_config_valid,
    data_processor_config_test,
) = (
    LightGBMDataProcessorConfig.create_default(split_name)
    for split_name in ("train", "valid", "test")
)

In [None]:
# Wrap each split's config in its own pipeline instance.
(
    data_pipeline_train,
    data_pipeline_valid,
    data_pipeline_test,
) = (
    LightGBMDataPipeline(split_config)
    for split_config in (
        data_processor_config_train,
        data_processor_config_valid,
        data_processor_config_test,
    )
)

In [None]:
# setup() returns the object that is used as the pipeline from here on, so the
# name is rebound to its return value.
data_pipeline_train = data_pipeline_train.setup()

In [None]:
# run() returns a pair; only the first element is kept for the train split.
# NOTE(review): judging by the names used for the valid split, the pair is
# presumably (train-use result, inference-use result) — confirm.
results_train, _ = data_pipeline_train.run()

In [None]:
# Rebind to the return value of setup() before running the valid pipeline.
data_pipeline_valid = data_pipeline_valid.setup()

In [None]:
# Both elements of the pair returned by run() are kept for the valid split.
results_valid_train, results_valid_inference = data_pipeline_valid.run()

In [None]:
# Rebind to the return value of setup() before running the test pipeline.
data_pipeline_test = data_pipeline_test.setup()

In [None]:
# Only the second element of the pair returned by run() is kept for the test
# split; the first is discarded.
_, results_test_inference = data_pipeline_test.run()

# Debug


## Check output shapes and meta


In [None]:
from src.candidate_generator import CandidateGeneratorResult, get_path_to_candidates
from src.utils.data_checks import test_lightgbm_data_pipeline_metadata

In [None]:
# Reuse the train config's sampling settings when locating the saved artefacts below.
subsample, seed = (
    data_processor_config_train.subsample,
    data_processor_config_train.seed,
)

In [None]:
# Resolve the candidate path for every split first, then load each result.
(
    path_to_candidates_train,
    path_to_candidates_valid,
    path_to_candidates_test,
) = (
    get_path_to_candidates(sample=split_name, subsample=subsample, seed=seed)
    for split_name in ("train", "valid", "test")
)
(
    candidates_train,
    candidates_valid,
    candidates_test,
) = (
    CandidateGeneratorResult.load(candidates_path)
    for candidates_path in (
        path_to_candidates_train,
        path_to_candidates_valid,
        path_to_candidates_test,
    )
)

In [None]:
# Run the metadata check for each pipeline result against the candidates of
# the same split (both valid results are checked against the valid candidates).
for _result, _candidates in (
    (results_train, candidates_train),
    (results_valid_train, candidates_valid),
    (results_valid_inference, candidates_valid),
    (results_test_inference, candidates_test),
):
    test_lightgbm_data_pipeline_metadata(_result, _candidates)

In [None]:
# Shapes of the train and both valid outputs, one per line.
for _result in (results_train, results_valid_train, results_valid_inference):
    print(_result.data.shape)

In [None]:
# Column indexes of the train and both valid outputs, one per print call.
for _result in (results_train, results_valid_train, results_valid_inference):
    print(_result.data.columns)

In [None]:
# Shape, then columns, of the test inference output (each on its own line).
test_frame = results_test_inference.data
print(test_frame.shape, test_frame.columns, sep="\n")

In [None]:
# Bare expression: the notebook displays the returned feature-name list as the cell output.
results_test_inference.get_feature_names_list()

In [None]:
# Display the use_type of every pipeline result as a single tuple.
tuple(
    pipeline_result.use_type
    for pipeline_result in (
        results_train,
        results_valid_train,
        results_valid_inference,
        results_test_inference,
    )
)

## Check feature distribution


In [None]:
from src.utils.data_checks import test_input_articles_dynamic_features
from src.features_articles import ArticleDynamicFeatureResult, ArticleEmbeddingResult, get_path_to_article_features
from src.utils.data_checks import test_lightgbm_data_features
from src.feature_customers import CustomerDynamicFeatureResult, get_path_to_customers_features

# Customer dynamic features, located with the same subsample/seed as the train
# config. The loaded object is passed to the spot checks of every split below.
path_to_customers_features_train = get_path_to_customers_features(
    feature_type="dynamic", subsample=subsample, seed=seed
)
customer_dynamic_features = CustomerDynamicFeatureResult.load(path_to_customers_features_train)


# NOTE(review): subsample=1 and seed=42 are hard-coded here instead of using
# the `subsample`/`seed` variables — presumably the embeddings are only stored
# for the full dataset; confirm this is intentional.
# NOTE(review): despite its name, this variable holds the *embedding* path; it
# is reassigned to the dynamic-feature path in the "Articles dynamic features"
# section.
path_to_article_features_train = get_path_to_article_features(feature_type="embedding", subsample=1, seed=42)
article_embeddings = ArticleEmbeddingResult.load(path_to_article_features_train)

In [None]:
# Feature check on the train result; this is the only call that passes verbose=True.
test_lightgbm_data_features(results_train, verbose=True)

In [None]:
# Same feature check for the remaining results, with the helper's default verbosity.
for _result in (results_valid_train, results_valid_inference, results_test_inference):
    test_lightgbm_data_features(_result)

### Embedding similarity


In [None]:
from src.utils.data_checks import test_input_embedding_similarity, test_input_customer_avg_price

In [None]:
# Run test_input_embedding_similarity for the first three unique customer ids
# of the train result, printing a separator after each customer.
customer_ids = results_train.data.customer_id.unique()[:3]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_embedding_similarity(
        results_train,
        customer_dynamic_features,
        article_embeddings,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

In [None]:
# Run test_input_customer_avg_price for the first five unique customer ids of
# the train result, printing a separator after each customer.
customer_ids = results_train.data.customer_id.unique()[:5]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_customer_avg_price(
        results_train,
        customer_dynamic_features,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

In [None]:
# Embedding-similarity spot check for the valid *train* result.
# This cell previously targeted results_valid_inference, making it an exact
# duplicate of the valid-inference embedding check further down and leaving
# results_valid_train without any embedding-similarity check. It now follows
# the same pattern as the other datasets: an embedding-similarity cell
# followed by an avg-price cell on the same result.
customer_ids = results_valid_train.data.customer_id.unique()[:3]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_embedding_similarity(results_valid_train, customer_dynamic_features, article_embeddings, customer_id)
    print("=" * 100)
    print("")

In [None]:
# Run test_input_customer_avg_price for the first five unique customer ids of
# the valid train result, printing a separator after each customer.
customer_ids = results_valid_train.data.customer_id.unique()[:5]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_customer_avg_price(
        results_valid_train,
        customer_dynamic_features,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

In [None]:
# Run test_input_embedding_similarity for the first three unique customer ids
# of the valid inference result, printing a separator after each customer.
customer_ids = results_valid_inference.data.customer_id.unique()[:3]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_embedding_similarity(
        results_valid_inference,
        customer_dynamic_features,
        article_embeddings,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

In [None]:
# Run test_input_customer_avg_price for the first five unique customer ids of
# the valid inference result, printing a separator after each customer.
customer_ids = results_valid_inference.data.customer_id.unique()[:5]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_customer_avg_price(
        results_valid_inference,
        customer_dynamic_features,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

In [None]:
# Run test_input_embedding_similarity for the first three unique customer ids
# of the test inference result, printing a separator after each customer.
customer_ids = results_test_inference.data.customer_id.unique()[:3]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_embedding_similarity(
        results_test_inference,
        customer_dynamic_features,
        article_embeddings,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

In [None]:
# Run test_input_customer_avg_price for the first five unique customer ids of
# the test inference result, printing a separator after each customer.
customer_ids = results_test_inference.data.customer_id.unique()[:5]
for customer_id in customer_ids:
    print(f"Testing customer {customer_id}")
    test_input_customer_avg_price(
        results_test_inference,
        customer_dynamic_features,
        customer_id,
    )
    print("=" * 100, "", sep="\n")

## Articles dynamic features

- Check that the article dynamic features are joined on the correct weeks.


In [None]:
from src.utils.data_checks import test_input_articles_dynamic_features
from src.features_articles import ArticleDynamicFeatureResult, get_path_to_article_features

In [None]:
# Locate the article dynamic features with the train config's subsample/seed,
# then load them.
path_to_article_features_train = get_path_to_article_features(
    feature_type="dynamic",
    subsample=subsample,
    seed=seed,
)
article_dynamic_features = ArticleDynamicFeatureResult.load(
    path_to_article_features_train
)

In [None]:
# Check the article dynamic features against the train result only; the
# valid and test results are not checked here.
test_input_articles_dynamic_features(article_dynamic_features, results_train)