# 02 Candidate generation


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging, set_seed

In [3]:
# Configure logging for this notebook: DEBUG level, written to a dedicated log file.
# NOTE(review): `remove_existing=True` presumably drops previously attached
# handlers -- confirm in src.utils.core_utils.
file_log = "candidate_generation.log"
root_logger = setup_logging(
    log_file=file_log,
    level=logging.DEBUG,
    remove_existing=True,
)

# Seed the random number generators used by the project helpers.
# This is separate from the `seed` field carried by the pipeline config.
set_seed(123)

2025-06-05 12:43:13,299 - src.utils.core_utils - INFO - Creating file handler with level: 10
2025-06-05 12:43:13,300 - src.utils.core_utils - DEBUG - Logging setup complete to candidate_generation.log


In [4]:
from src.candidate_generator import CandidateGeneratorPipeline, CandidateGeneratorPipelineConfig

# Config


In [5]:
# Build the pipeline configuration from the project's defaults; later cells
# read `subsample` and `seed` from it when loading the raw transactions.
config_candidate_generation = CandidateGeneratorPipelineConfig.create_default()

## Candidate generation


In [6]:
# Instantiate the candidate-generation pipeline with the default configuration.
candidate_generator_pipeline = CandidateGeneratorPipeline(config=config_candidate_generation)

In [7]:
# Prepare the pipeline; `setup()` logs the resolved configuration at DEBUG level.
# The name is rebound to the return value -- presumably the pipeline itself
# (fluent style); confirm in src.candidate_generator.
candidate_generator_pipeline = candidate_generator_pipeline.setup()

2025-06-05 12:43:13,358 - src.candidate_generator - INFO - Setting up CandidateGeneratorPipeline
2025-06-05 12:43:13,359 - src.candidate_generator - DEBUG - Pipeline config: {
  "train_start_date": "2020-07-01T00:00:00",
  "train_end_date": "2020-09-08T00:00:00",
  "history_start_date": "2020-01-01T00:00:00",
  "n_sample_week_threshold": -1,
  "negative_sample_strategies": {
    "popularity": {
      "top_k_items": 30
    },
    "repurchase": {
      "strategy": "last_k_items",
      "k": 12
    }
  },
  "inference_sample_strategies": {
    "popularity": {
      "top_k_items": 30
    },
    "repurchase": {
      "strategy": "last_k_items",
      "k": 12
    }
  },
  "subsample": 0.25,
  "seed": 42,
  "restrict_positive_samples": true,
  "neg_to_pos_ratio": 30.0
}


In [8]:
# Run the pipeline. It returns three candidate containers, unpacked here as the
# train, valid and test splits; each exposes `data`, `data_inference`,
# `default_prediction` and related attributes inspected in the cells below.
candidates_train, candidates_valid, candidates_test = candidate_generator_pipeline.run()

2025-06-05 12:43:13,368 - src.candidate_generator - INFO - Loading data for CandidateGeneratorPipeline
2025-06-05 12:43:13,368 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.25 42
2025-06-05 12:43:13,369 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.25_42.parquet
2025-06-05 12:43:14,263 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.25 42
2025-06-05 12:43:14,264 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.25_42.parquet
2025-06-05 12:43:14,275 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.25 42
2025-06-05 12:43:14,276 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.25_42.parquet
2025-06-05 12:43:14,286 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.25 42
2025-06-05 12:43:14,287 - src.feature_extraction - DEB

## Debugging: inspect the generated candidates


In [9]:
# List the instance attributes stored on each split's candidate container.
for split_candidates in (candidates_train, candidates_valid, candidates_test):
    print(list(vars(split_candidates)))

['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']
['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']
['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']


In [10]:
# Show the feature list reported by each split (train, valid, test).
for split_candidates in (candidates_train, candidates_valid, candidates_test):
    print(split_candidates.get_feature_list())

['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']
['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']
['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']


In [11]:
# Print (rows, columns) for the labelled frame of every split, plus the
# inference frame of the valid and test splits, in that order.
frames_to_size = (
    candidates_train.data,
    candidates_valid.data,
    candidates_valid.data_inference,
    candidates_test.data,
    candidates_test.data_inference,
)
for frame in frames_to_size:
    print(frame.shape)

(1062302, 11)
(113942, 11)
(678339, 10)
(127099, 11)
(656391, 10)


In [12]:
# Shape of the fallback prediction attached to the valid and test splits.
for split_candidates in (candidates_valid, candidates_test):
    print(split_candidates.default_prediction.shape)

(30,)
(30,)


In [13]:
# Count candidates per generating source for every frame of interest.
# Each entry pairs the heading printed above the table with the frame it describes.
source_views = (
    ("Train", candidates_train.data),
    ("Valid: train", candidates_valid.data),
    ("Valid: inference", candidates_valid.data_inference),
    ("Test: train", candidates_test.data),
    ("Test: inference", candidates_test.data_inference),
)
for heading, frame in source_views:
    print(heading)
    display(frame["source"].value_counts())

Train


source
repurchase    611826
popularity    450476
Name: count, dtype: int64

Valid: train


source
popularity    89759
repurchase    24183
Name: count, dtype: int64

Valid: inference


source
popularity    537210
repurchase    141129
Name: count, dtype: int64

Test: train


source
popularity    99915
repurchase    27184
Name: count, dtype: int64

Test: inference


source
popularity    518700
repurchase    137691
Name: count, dtype: int64

In [14]:
# Summarize the training candidates: column dtypes, non-null counts and memory usage.
candidates_train.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062302 entries, 0 to 1062301
Data columns (total 11 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   customer_id       1062302 non-null  uint64 
 1   week_num          1062302 non-null  int64  
 2   article_id        1062302 non-null  int32  
 3   price             1062302 non-null  float32
 4   sales_channel_id  1062302 non-null  int8   
 5   source            1062302 non-null  object 
 6   label             1062302 non-null  int64  
 7   week              1062302 non-null  object 
 8   year              1062302 non-null  int32  
 9   month             1062302 non-null  int32  
 10  bestseller_rank   1062302 non-null  float64
dtypes: float32(1), float64(1), int32(3), int64(2), int8(1), object(2), uint64(1)
memory usage: 65.9+ MB


# Checks


In [15]:
from src.feature_extraction import load_optimized_raw_data
from src.utils.data_checks import test_candidate_generator, test_candidate_generator_pipeline

2025-06-05 12:44:03,160 - matplotlib - DEBUG - matplotlib data path: /Users/sharonshiao/Dropbox/Machine_learning/projects/fahsion_recommendation/venv/lib/python3.12/site-packages/matplotlib/mpl-data
2025-06-05 12:44:03,165 - matplotlib - DEBUG - CONFIGDIR=/Users/sharonshiao/.matplotlib
2025-06-05 12:44:03,206 - matplotlib - DEBUG - interactive is False
2025-06-05 12:44:03,206 - matplotlib - DEBUG - platform is darwin
2025-06-05 12:44:03,289 - matplotlib - DEBUG - CACHEDIR=/Users/sharonshiao/.matplotlib
2025-06-05 12:44:03,294 - matplotlib.font_manager - DEBUG - Using fontManager instance from /Users/sharonshiao/.matplotlib/fontlist-v390.json
  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Reload the raw transactions for each split, using the same subsample fraction
# and seed as the pipeline so the checks below compare like with like.
# The generator is consumed in order, so the splits load as train, valid, test.
transactions_train, transactions_valid, transactions_test = (
    load_optimized_raw_data(
        "transactions",
        split_name,
        config_candidate_generation.subsample,
        config_candidate_generation.seed,
    )
    for split_name in ("train", "valid", "test")
)

2025-06-05 12:44:18,022 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.25 42
2025-06-05 12:44:18,023 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.25_42.parquet
2025-06-05 12:44:18,075 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.25 42
2025-06-05 12:44:18,076 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.25_42.parquet
2025-06-05 12:44:18,084 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.25 42
2025-06-05 12:44:18,084 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.25_42.parquet


In [17]:
# Stack the three splits into one frame for the checks below. `ignore_index` is
# left at its default, so each split keeps its original row index labels.
transactions = pd.concat([transactions_train, transactions_valid, transactions_test])

In [18]:
# Display the configuration fields that the checks below are validated against.
vars(config_candidate_generation)

{'train_start_date': Timestamp('2020-07-01 00:00:00'),
 'train_end_date': Timestamp('2020-09-08 00:00:00'),
 'history_start_date': Timestamp('2020-01-01 00:00:00'),
 'n_sample_week_threshold': -1,
 'negative_sample_strategies': {'popularity': {'top_k_items': 30},
  'repurchase': {'strategy': 'last_k_items', 'k': 12}},
 'inference_sample_strategies': {'popularity': {'top_k_items': 30},
  'repurchase': {'strategy': 'last_k_items', 'k': 12}},
 'subsample': 0.25,
 'seed': 42,
 'restrict_positive_samples': True,
 'neg_to_pos_ratio': 30.0}

In [19]:
# Validate split-level metadata (the helper reports week range, default
# prediction and number of customers) for train, valid and test in turn.
for split_candidates in (candidates_train, candidates_valid, candidates_test):
    test_candidate_generator_pipeline(split_candidates, transactions, config_candidate_generation)

Has data: True, sample: train
Checking week range
Checking default prediction
Checking number of customers
Checks passed
Has data: True, sample: valid
Checking week range
Checking default prediction
Checking number of customers
Checks passed
Has data: True, sample: test
Checking week range
Checking default prediction
Checking number of customers
Checks passed


In [20]:
# Spot-check candidate generation on the first 10 distinct customers of the train split.
train_customer_column = candidates_train.data["customer_id"]
customer_ids = train_customer_column.unique()[:10]
test_candidate_generator(
    candidates_train,
    transactions,
    config_candidate_generation,
    customer_ids,
)

Testing candidate generator
Sample: train
Testing positive examples
Testing negative examples
Skipping repurchases test because neg_to_pos_ratio is not -1
Checks passed


In [21]:
# Spot-check candidate generation on the first 10 distinct customers of the valid split.
valid_customer_column = candidates_valid.data["customer_id"]
customer_ids = valid_customer_column.unique()[:10]
test_candidate_generator(
    candidates_valid,
    transactions,
    config_candidate_generation,
    customer_ids,
)

Testing candidate generator
Sample: valid
Testing positive examples
Testing negative examples
Skipping repurchases test because neg_to_pos_ratio is not -1
Checks passed


In [22]:
# Spot-check candidate generation on the first 10 distinct customers of the test split.
test_customer_column = candidates_test.data["customer_id"]
customer_ids = test_customer_column.unique()[:10]
test_candidate_generator(
    candidates_test,
    transactions,
    config_candidate_generation,
    customer_ids,
)

Testing candidate generator
Sample: test
Testing positive examples
Testing negative examples
Skipping repurchases test because neg_to_pos_ratio is not -1
Checks passed


# Ad hoc analysis


In [23]:
# Row counts per (source, label) pair in the training candidates, i.e. how many
# positives and negatives each candidate source contributes.
candidates_train.data.groupby(["source", "label"]).size()

source      label
popularity  0        431794
            1         18682
repurchase  0        596226
            1         15600
dtype: int64

In [24]:
# Row counts per (source, label) pair in the validation candidates.
candidates_valid.data.groupby(["source", "label"]).size()

source      label
popularity  0        87581
            1         2178
repurchase  0        22779
            1         1404
dtype: int64

In [25]:
# Row counts per (source, label) pair in the test candidates.
candidates_test.data.groupby(["source", "label"]).size()

source      label
popularity  0        97369
            1         2546
repurchase  0        25734
            1         1450
dtype: int64