# 02 Candidate generation


In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging

In [3]:
# Initialize logging
file_log = "candidate_generation.log"
root_logger = setup_logging(level=logging.DEBUG, log_file=file_log, remove_existing=True)

2025-06-03 14:56:28,735 - src.utils.core_utils - INFO - Creating file handler with level: 10
2025-06-03 14:56:28,735 - src.utils.core_utils - DEBUG - Logging setup complete to candidate_generation.log


In [4]:
from src.candidate_generator import CandidateGeneratorPipeline, CandidateGeneratorPipelineConfig

## Config


In [5]:
# Start from the project's default pipeline configuration; the resolved values
# are echoed by the DEBUG log emitted when the pipeline is set up.
config_candidate_generation = CandidateGeneratorPipelineConfig.create_default()

## Candidate generation


In [6]:
# Instantiate the candidate-generation pipeline with the configuration object.
candidate_generator_pipeline = CandidateGeneratorPipeline(config=config_candidate_generation)

In [7]:
# setup() logs the resolved configuration. Its return value is rebound to the same name.
# NOTE(review): this presumably means setup() returns the pipeline itself — confirm in
# src.candidate_generator, otherwise re-running this cell would operate on a different object.
candidate_generator_pipeline = candidate_generator_pipeline.setup()

2025-06-03 14:56:28,781 - src.candidate_generator - INFO - Setting up CandidateGeneratorPipeline
2025-06-03 14:56:28,781 - src.candidate_generator - DEBUG - Pipeline config: {
  "train_start_date": "2020-07-29T00:00:00",
  "train_end_date": "2020-09-08T00:00:00",
  "n_sample_week_threshold_history": -1,
  "negative_sample_strategies": {
    "popularity": {
      "top_k_items": 30
    },
    "repurchase": {}
  },
  "inference_sample_strategies": {
    "popularity": {
      "top_k_items": 30
    },
    "repurchase": {}
  },
  "subsample": 0.05,
  "seed": 42
}


In [8]:
# Run the pipeline; it returns one candidates object per sample, unpacked here
# in train / valid / test order.
candidates_train, candidates_valid, candidates_test = candidate_generator_pipeline.run()

2025-06-03 14:56:28,789 - src.candidate_generator - INFO - Loading data for CandidateGeneratorPipeline
2025-06-03 14:56:28,789 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.05 42
2025-06-03 14:56:28,790 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.05_42.parquet
2025-06-03 14:56:28,878 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.05 42
2025-06-03 14:56:28,879 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.05_42.parquet
2025-06-03 14:56:28,881 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.05 42
2025-06-03 14:56:28,882 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.05_42.parquet
2025-06-03 14:56:28,884 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.05 42
2025-06-03 14:56:28,884 - src.feature_extraction - DEB

## Debug: inspect the generated candidates


In [9]:
# Attribute names stored on each candidates object (train, valid, test).
print(list(vars(candidates_train)))
print(list(vars(candidates_valid)))
print(list(vars(candidates_test)))

['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']
['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']
['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']


In [10]:
# Feature names reported by each candidates object, one list per line.
print(
    candidates_train.get_feature_list(),
    candidates_valid.get_feature_list(),
    candidates_test.get_feature_list(),
    sep="\n",
)

['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']
['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']
['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']


In [11]:
# (rows, columns) of every candidate frame, one per line:
# train data, then valid data / inference, then test data / inference.
print(
    candidates_train.data.shape,
    candidates_valid.data.shape,
    candidates_valid.data_inference.shape,
    candidates_test.data.shape,
    candidates_test.data_inference.shape,
    sep="\n",
)

(794914, 11)
(127726, 11)
(116834, 10)
(121636, 11)
(111602, 10)


In [12]:
# Shape of the default prediction attached to the valid and test candidates.
print(
    candidates_valid.default_prediction.shape,
    candidates_test.default_prediction.shape,
    sep="\n",
)

(30,)
(30,)


In [13]:
# Look at candidate sources
print("Train")
display(candidates_train.data.source.value_counts())
print("Valid: train")
display(candidates_valid.data.source.value_counts())
print("Valid: inference")
display(candidates_valid.data_inference.source.value_counts())
print("Test: train")
display(candidates_test.data.source.value_counts())
print("Test: inference")
display(candidates_test.data_inference.source.value_counts())

Train


source
popularity    688747
positive       75444
repurchase     30723
Name: count, dtype: int64

Valid: train


source
popularity    109272
positive       11558
repurchase      6896
Name: count, dtype: int64

Valid: inference


source
popularity    109710
repurchase      7124
Name: count, dtype: int64

Test: train


source
popularity    104093
positive       10794
repurchase      6749
Name: count, dtype: int64

Test: inference


source
popularity    104640
repurchase      6962
Name: count, dtype: int64

In [14]:
# Column dtypes, non-null counts and memory footprint of the train candidates.
candidates_train.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794914 entries, 0 to 794913
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_id       794914 non-null  uint64 
 1   article_id        794914 non-null  int32  
 2   price             794914 non-null  float32
 3   sales_channel_id  794914 non-null  int8   
 4   week_num          794914 non-null  int64  
 5   source            794914 non-null  object 
 6   label             794914 non-null  int64  
 7   week              794914 non-null  object 
 8   year              794914 non-null  int32  
 9   month             794914 non-null  int32  
 10  bestseller_rank   794914 non-null  float64
dtypes: float32(1), float64(1), int32(3), int64(2), int8(1), object(2), uint64(1)
memory usage: 49.3+ MB


## Checks


In [15]:
from src.feature_extraction import load_optimized_raw_data
from src.utils.data_checks import test_candidate_generator, test_candidate_generator_pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Load the raw transactions of each sample, using the same subsample fraction
# and seed as the pipeline configuration. The generator is consumed left to
# right, so the loads happen in train, valid, test order.
transactions_train, transactions_valid, transactions_test = (
    load_optimized_raw_data(
        "transactions", sample, config_candidate_generation.subsample, config_candidate_generation.seed
    )
    for sample in ("train", "valid", "test")
)

2025-06-03 14:56:36,604 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.05 42
2025-06-03 14:56:36,605 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.05_42.parquet
2025-06-03 14:56:36,625 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.05 42
2025-06-03 14:56:36,626 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.05_42.parquet
2025-06-03 14:56:36,629 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.05 42
2025-06-03 14:56:36,629 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.05_42.parquet


In [17]:
# Stack the three samples into one transaction frame. The original row labels
# are kept (no ignore_index), so index values may repeat across samples.
transactions = pd.concat(
    objs=[transactions_train, transactions_valid, transactions_test],
)

In [18]:
# Show the configuration attributes as a plain dict.
vars(config_candidate_generation)

{'train_start_date': Timestamp('2020-07-29 00:00:00'),
 'train_end_date': Timestamp('2020-09-08 00:00:00'),
 'n_sample_week_threshold_history': -1,
 'negative_sample_strategies': {'popularity': {'top_k_items': 30},
  'repurchase': {}},
 'inference_sample_strategies': {'popularity': {'top_k_items': 30},
  'repurchase': {}},
 'subsample': 0.05,
 'seed': 42}

In [19]:
# Test for meta data
test_candidate_generator_pipeline(candidates_train, transactions, config_candidate_generation)
test_candidate_generator_pipeline(candidates_valid, transactions, config_candidate_generation)
test_candidate_generator_pipeline(candidates_test, transactions, config_candidate_generation)

Has data: True, sample: train
Checking week range
Checking default prediction
Checking number of customers
Checks passed
Has data: True, sample: valid
Checking week range
Checking default prediction
Checking number of customers
Checks passed
Has data: True, sample: test
Checking week range
Checking default prediction
Checking number of customers
Checks passed


In [20]:
# Spot-check the train candidates for the first 10 distinct customer ids
# (in order of appearance in the frame).
customer_ids = candidates_train.data["customer_id"].unique()[:10]
test_candidate_generator(candidates_train, transactions, config_candidate_generation, customer_ids)

Sample: train
Customer 10019067371664522
Customer 10019067371664522 has 2 weeks: [97 99]
Week 97
Checking 4 article_ids in week 99
Checks for customer 10019067371664522 and week 97 passed
--------------------------------------------------------------------------------
Week 99
No next purchase week for customer 10019067371664522 and week 99 because it's the last week in train dataset

Customer 10698768556209946
Customer 10698768556209946 has 5 weeks: [ 97  98  99 100 101]
Week 97
Checking 7 article_ids in week 98
Checks for customer 10698768556209946 and week 97 passed
--------------------------------------------------------------------------------
Week 98
Checking 3 article_ids in week 99
Checks for customer 10698768556209946 and week 98 passed
--------------------------------------------------------------------------------
Week 99
Checking 3 article_ids in week 100
Checks for customer 10698768556209946 and week 99 passed
----------------------------------------------------------------

In [21]:
# Spot-check the valid candidates for the first 10 distinct customer ids
# (in order of appearance in the frame).
customer_ids = candidates_valid.data["customer_id"].unique()[:10]
test_candidate_generator(candidates_valid, transactions, config_candidate_generation, customer_ids)

Sample: valid
Customer 8383252499052781
Customer 8383252499052781 has 1 weeks: [101]
Week 101
Checking 1 article_ids in week 103
Checks for customer 8383252499052781 and week 101 passed
--------------------------------------------------------------------------------

Customer 14401014487501724
Customer 14401014487501724 has 1 weeks: [102]
Week 102
Checking 2 article_ids in week 103
Checks for customer 14401014487501724 and week 102 passed
--------------------------------------------------------------------------------

Customer 72822669774031389
Customer 72822669774031389 has 0 weeks: []

Customer 82992461741828036
Customer 82992461741828036 has 1 weeks: [101]
Week 101
Checking 15 article_ids in week 103
Checks for customer 82992461741828036 and week 101 passed
--------------------------------------------------------------------------------

Customer 117698131703306313
Customer 117698131703306313 has 1 weeks: [101]
Week 101
Checking 6 article_ids in week 103
Checks for customer 1176981

In [22]:
# Spot-check the test candidates for the first 10 distinct customer ids
# (in order of appearance in the frame).
customer_ids = candidates_test.data["customer_id"].unique()[:10]
test_candidate_generator(candidates_test, transactions, config_candidate_generation, customer_ids)

Sample: test
Customer 81250068469314753
Customer 81250068469314753 has 1 weeks: [103]
Week 103
Checking 5 article_ids in week 104
Checks for customer 81250068469314753 and week 103 passed
--------------------------------------------------------------------------------

Customer 117698131703306313
Customer 117698131703306313 has 1 weeks: [103]
Week 103
Checking 1 article_ids in week 104
Checks for customer 117698131703306313 and week 103 passed
--------------------------------------------------------------------------------

Customer 132612170747534254
Customer 132612170747534254 has 0 weeks: []

Customer 202827240789788864
Customer 202827240789788864 has 0 weeks: []

Customer 290365578982007187
Customer 290365578982007187 has 1 weeks: [97]
Week 97
Checking 1 article_ids in week 104
Checks for customer 290365578982007187 and week 97 passed
--------------------------------------------------------------------------------

Customer 315841156395478830
Customer 315841156395478830 has 1 weeks