# 02 Candidate generation


In [None]:
# IPython autoreload extension in mode 2: imported modules are re-imported
# before each cell execution, so edits under ../src are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging, set_seed

In [None]:
# Logging: DEBUG-level records, written to the log file named below.
# remove_existing=True is forwarded to setup_logging as-is (presumably it
# clears earlier handlers / log output -- confirm in src.utils.core_utils).
file_log = "candidate_generation.log"
root_logger = setup_logging(
    level=logging.DEBUG,
    log_file=file_log,
    remove_existing=True,
)

# Seed the run with a fixed value through the project helper.
set_seed(123)

In [None]:
from src.candidate_generator import CandidateGeneratorPipeline, CandidateGeneratorPipelineConfig

# Config


In [None]:
# Pipeline configuration obtained from the config class's create_default()
# factory; later cells read its `subsample` and `seed` attributes.
config_candidate_generation = CandidateGeneratorPipelineConfig.create_default()

## Candidate generation


In [None]:
# Instantiate the candidate generation pipeline with the configuration above.
candidate_generator_pipeline = CandidateGeneratorPipeline(config=config_candidate_generation)

In [None]:
# The return value of setup() is rebound to the same name, so setup() is
# expected to return the pipeline object to keep using -- TODO confirm.
candidate_generator_pipeline = candidate_generator_pipeline.setup()

In [None]:
# run() yields three results, unpacked here as the train, valid and test
# candidate objects (in that order).
candidates_train, candidates_valid, candidates_test = candidate_generator_pipeline.run()

## Debugging


In [None]:
# Attribute names stored on each candidate object (train, valid, test).
for _split_candidates in (candidates_train, candidates_valid, candidates_test):
    print(list(vars(_split_candidates)))

In [None]:
# Feature list reported by each candidate object (train, valid, test).
for _split_candidates in (candidates_train, candidates_valid, candidates_test):
    print(_split_candidates.get_feature_list())

In [None]:
# Shapes of the candidate frames. Train only has `data`; valid and test also
# carry `data_inference`. Attributes are resolved lazily inside the loop so
# each frame is looked up right before its shape is printed.
_shape_targets = (
    (candidates_train, "data"),
    (candidates_valid, "data"),
    (candidates_valid, "data_inference"),
    (candidates_test, "data"),
    (candidates_test, "data_inference"),
)
for _split_candidates, _frame_attr in _shape_targets:
    print(getattr(_split_candidates, _frame_attr).shape)

In [None]:
# Shape of the default prediction attached to the valid and test candidates.
for _split_candidates in (candidates_valid, candidates_test):
    print(_split_candidates.default_prediction.shape)

In [None]:
# Number of candidates per `source` value, for every split / frame pair.
# Each entry is (printed title, candidate object, frame attribute name).
_source_views = (
    ("Train", candidates_train, "data"),
    ("Valid: train", candidates_valid, "data"),
    ("Valid: inference", candidates_valid, "data_inference"),
    ("Test: train", candidates_test, "data"),
    ("Test: inference", candidates_test, "data_inference"),
)
for _title, _split_candidates, _frame_attr in _source_views:
    print(_title)
    display(getattr(_split_candidates, _frame_attr).source.value_counts())

In [None]:
# Summary of the train candidate frame via its info() method (for a pandas
# DataFrame this lists columns, dtypes, non-null counts and memory usage).
candidates_train.data.info()

# Checks


In [None]:
from src.feature_extraction import load_optimized_raw_data
from src.utils.data_checks import test_candidate_generator, test_candidate_generator_pipeline

In [None]:
# Load the "transactions" raw data for each split, in the order train, valid,
# test, using the same subsample and seed settings as the pipeline config.
transactions_train, transactions_valid, transactions_test = (
    load_optimized_raw_data(
        "transactions",
        _split_name,
        config_candidate_generation.subsample,
        config_candidate_generation.seed,
    )
    for _split_name in ("train", "valid", "test")
)

In [None]:
# Stack the three splits into one frame for the checks below. ignore_index is
# not set, so each split keeps its original index labels (which may repeat
# across splits).
transactions = pd.concat([transactions_train, transactions_valid, transactions_test])

In [None]:
# Show every attribute currently set on the pipeline configuration.
vars(config_candidate_generation)

In [None]:
# Pipeline-level (meta data) checks, run once per split against the combined
# transactions and the pipeline configuration.
for _split_candidates in (candidates_train, candidates_valid, candidates_test):
    test_candidate_generator_pipeline(_split_candidates, transactions, config_candidate_generation)

In [None]:
# Spot check on train: take the first 10 distinct customer ids (in the order
# unique() returns them) and run the per-customer generator check on them.
customer_ids = candidates_train.data.customer_id.unique()[:10]
test_candidate_generator(candidates_train, transactions, config_candidate_generation, customer_ids)

In [None]:
# Spot check on valid: take the first 10 distinct customer ids (in the order
# unique() returns them) and run the per-customer generator check on them.
customer_ids = candidates_valid.data.customer_id.unique()[:10]
test_candidate_generator(candidates_valid, transactions, config_candidate_generation, customer_ids)

In [None]:
# Spot check on test: take the first 10 distinct customer ids (in the order
# unique() returns them) and run the per-customer generator check on them.
customer_ids = candidates_test.data.customer_id.unique()[:10]
test_candidate_generator(candidates_test, transactions, config_candidate_generation, customer_ids)

# Ad hoc


In [None]:
# Row count for every (source, label) combination in the train candidates.
candidates_train.data.groupby(["source", "label"]).size()

In [None]:
# Row count for every (source, label) combination in the valid candidates.
candidates_valid.data.groupby(["source", "label"]).size()

In [None]:
# Row count for every (source, label) combination in the test candidates.
candidates_test.data.groupby(["source", "label"]).size()