# 02 Candidate generation


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging

In [3]:
# Initialize logging
file_log = "candidate_generation.log"
root_logger = setup_logging(level=logging.DEBUG, log_file=file_log, remove_existing=True)

<RootLogger root (DEBUG)>

In [4]:
from src.candidate_generator import CandidateGeneratorPipeline, CandidatePipelineConfig

# Config


In [5]:
config_candidate_generation = CandidatePipelineConfig.create_default()

# Candidate generation


In [6]:
candidate_generator_pipeline = CandidateGeneratorPipeline(config=config_candidate_generation)

In [7]:
candidate_generator_pipeline = candidate_generator_pipeline.setup()

2025-05-29 21:41:45 - src.candidate_generator - INFO - Setting up CandidateGeneratorPipeline
2025-05-29 21:41:45 - src.candidate_generator - DEBUG - Pipeline config: {
  "train_start_date": "2020-07-29T00:00:00",
  "train_end_date": "2020-09-08T00:00:00",
  "n_sample_week_threshold_history": -1,
  "negative_sample_strategies": {
    "popularity": {
      "top_k_items": 12
    },
    "repurchase": {}
  },
  "inference_sample_strategies": {
    "popularity": {
      "top_k_items": 30
    },
    "repurchase": {}
  },
  "subsample": 0.05,
  "seed": 42
}


In [8]:
candidates_train, candidates_valid, candidates_test = candidate_generator_pipeline.run()

2025-05-29 21:41:45 - src.candidate_generator - INFO - Loading data for CandidateGeneratorPipeline
2025-05-29 21:41:45 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.05 42
2025-05-29 21:41:45 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.05_42.parquet
2025-05-29 21:41:45 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.05 42
2025-05-29 21:41:45 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.05_42.parquet
2025-05-29 21:41:45 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.05 42
2025-05-29 21:41:45 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.05_42.parquet
2025-05-29 21:41:45 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.05 42
2025-05-29 21:41:45 - src.feature_extraction - DEBUG - Loading data from ../data/artic

## Debug checks


In [9]:
# List the attribute names stored on each split's candidate object
# (train, then valid, then test); iterating an instance __dict__ yields its keys.
for split_candidates in (candidates_train, candidates_valid, candidates_test):
    print(list(split_candidates.__dict__))

['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']
['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']
['data', 'data_inference', 'label', 'feature_names', 'sample', 'default_prediction']


In [10]:
# Print the feature list reported by each split's candidate object
# (train, then valid, then test) so the three can be compared line by line.
for split_candidates in (candidates_train, candidates_valid, candidates_test):
    print(split_candidates.get_feature_list())

['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']
['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']
['month', 'price', 'week_num', 'bestseller_rank', 'customer_id', 'article_id', 'week', 'source', 'label']


In [11]:
# Shapes of the labelled frames (`data`) and, for valid/test, the inference
# frames (`data_inference`), printed in the order train, valid, test.
frames_to_inspect = (
    candidates_train.data,
    candidates_valid.data,
    candidates_valid.data_inference,
    candidates_test.data,
    candidates_test.data_inference,
)
for frame in frames_to_inspect:
    print(frame.shape)

(381965, 11)
(62233, 11)
(116834, 10)
(59253, 11)
(111602, 10)


In [12]:
candidates_train.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381965 entries, 0 to 381964
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_id       381965 non-null  uint64 
 1   article_id        381965 non-null  int32  
 2   price             381965 non-null  float32
 3   sales_channel_id  381965 non-null  int8   
 4   week_num          381965 non-null  int64  
 5   source            381965 non-null  object 
 6   label             381965 non-null  int64  
 7   week              381965 non-null  object 
 8   year              381965 non-null  int32  
 9   month             381965 non-null  int32  
 10  bestseller_rank   381965 non-null  float64
dtypes: float32(1), float64(1), int32(3), int64(2), int8(1), object(2), uint64(1)
memory usage: 23.7+ MB


In [13]:
candidates_train.data.source.value_counts()

source
popularity    275323
positive       75444
repurchase     31198
Name: count, dtype: int64

In [14]:
from src.feature_extraction import load_optimized_raw_data

In [15]:
transactions_train = load_optimized_raw_data("transactions", "train", 0.05, 42)

2025-05-29 21:41:52 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.05 42
2025-05-29 21:41:52 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.05_42.parquet


In [16]:
transactions_valid = load_optimized_raw_data("transactions", "valid", 0.05, 42)

2025-05-29 21:41:52 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.05 42
2025-05-29 21:41:52 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.05_42.parquet


In [17]:
train_start_date = pd.to_datetime("2020-07-29")

In [18]:
# Restrict the raw training transactions to the training window (both bounds
# inclusive) so they can be compared with candidates_train. These are the same
# dates the pipeline config logs as train_start_date / train_end_date.
# The bounds are passed to query() as variables instead of being repeated as
# date literals inside the expression; `train_start_date` is defined in the
# previous cell and was otherwise unused.
train_end_date = pd.to_datetime("2020-09-08")
transactions_train = transactions_train.query("t_dat >= @train_start_date and t_dat <= @train_end_date")
transactions_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84256 entries, 1489202 to 1573457
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   t_dat             84256 non-null  datetime64[ns]
 1   customer_id       84256 non-null  uint64        
 2   article_id        84256 non-null  int32         
 3   price             84256 non-null  float32       
 4   sales_channel_id  84256 non-null  int8          
 5   week_num          84256 non-null  int8          
dtypes: datetime64[ns](1), float32(1), int32(1), int8(2), uint64(1)
memory usage: 2.7 MB


In [19]:
candidates_train.data.query("customer_id == 1773900972412728")

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week_num,source,label,week,year,month,bestseller_rank
2184,1773900972412728,464297007,0.016932,2,97,positive,1,2020-07-29,2020,7,999.0
2185,1773900972412728,554450001,0.033881,2,97,positive,1,2020-07-29,2020,7,999.0
25387,1773900972412728,590928023,0.030492,2,98,positive,1,2020-08-05,2020,8,999.0
25388,1773900972412728,684209019,0.025407,2,98,positive,1,2020-08-05,2020,8,999.0
25389,1773900972412728,776237011,0.025407,2,98,positive,1,2020-08-05,2020,8,999.0
25390,1773900972412728,832114002,0.016932,2,98,positive,1,2020-08-05,2020,8,999.0
25391,1773900972412728,834258001,0.030492,2,98,positive,1,2020-08-05,2020,8,999.0
25392,1773900972412728,834258003,0.030492,2,98,positive,1,2020-08-05,2020,8,999.0
25393,1773900972412728,843872003,0.030492,2,98,positive,1,2020-08-05,2020,8,999.0
30594,1773900972412728,599580072,0.016932,2,99,positive,1,2020-08-12,2020,8,999.0


In [20]:
# Keep one row per (customer_id, article_id, week_num) for this customer, i.e. collapse
# repeat purchases of the same article within a week, so the result can be compared
# row by row with the customer's rows in candidates_train.data.
transactions_train.query("customer_id == 1773900972412728").drop_duplicates(["customer_id", "article_id", "week_num"])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num
1491575,2020-07-30,1773900972412728,464297007,0.016932,2,97
1491576,2020-07-30,1773900972412728,554450001,0.033881,2,97
1517392,2020-08-11,1773900972412728,590928023,0.030492,2,98
1517393,2020-08-11,1773900972412728,684209019,0.025407,2,98
1517394,2020-08-11,1773900972412728,776237011,0.025407,2,98
1517395,2020-08-11,1773900972412728,832114002,0.016932,2,98
1517398,2020-08-11,1773900972412728,834258001,0.030492,2,98
1517400,2020-08-11,1773900972412728,834258003,0.030492,2,98
1517401,2020-08-11,1773900972412728,843872003,0.030492,2,98
1523379,2020-08-13,1773900972412728,599580072,0.016932,2,99


In [21]:
transactions_valid.query("customer_id == 1773900972412728")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num


In [22]:
transactions_valid.customer_id.nunique()

3657

In [23]:
# The validation candidates are expected to cover the customers that have
# validation transactions, so the two distinct-customer counts should agree.
# Assert it here instead of comparing two cell outputs by eye; the count is
# still displayed as the cell result.
n_candidate_customers = candidates_valid.data.customer_id.nunique()
assert n_candidate_customers == transactions_valid.customer_id.nunique(), (
    "candidates_valid.data and transactions_valid cover a different number of customers"
)
n_candidate_customers

3657

In [24]:
candidates_valid.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62233 entries, 0 to 62232
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       62233 non-null  uint64 
 1   article_id        62233 non-null  int32  
 2   price             62233 non-null  float32
 3   sales_channel_id  62233 non-null  int8   
 4   week_num          62233 non-null  int64  
 5   source            62233 non-null  object 
 6   label             62233 non-null  int64  
 7   week              62233 non-null  object 
 8   year              62233 non-null  int32  
 9   month             62233 non-null  int32  
 10  bestseller_rank   62233 non-null  float64
dtypes: float32(1), float64(1), int32(3), int64(2), int8(1), object(2), uint64(1)
memory usage: 3.9+ MB


In [25]:
transactions_valid[["customer_id", "article_id", "week_num"]].drop_duplicates().shape

(11558, 3)

In [26]:
candidates_valid.data.source.value_counts()

source
popularity    43667
positive      11558
repurchase     7008
Name: count, dtype: int64

In [27]:
candidates_valid.data_inference.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116834 entries, 0 to 116833
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_id       116834 non-null  uint64 
 1   week_num          116834 non-null  int64  
 2   article_id        116834 non-null  int32  
 3   price             116834 non-null  float32
 4   sales_channel_id  116834 non-null  int8   
 5   source            116834 non-null  object 
 6   week              116834 non-null  object 
 7   year              116834 non-null  int32  
 8   month             116834 non-null  int32  
 9   bestseller_rank   116834 non-null  float64
dtypes: float32(1), float64(1), int32(3), int64(1), int8(1), object(2), uint64(1)
memory usage: 6.4+ MB


In [28]:
# Double-check that all the negative examples in candidates_valid.data are a subset of
# candidates_valid.data_inference: left-join the labelled candidates onto the inference
# candidates on (customer_id, article_id, week_num) and record in the `_merge` indicator
# column whether each labelled row found a match ("both") or not ("left_only").
tmp = candidates_valid.data.merge(
    candidates_valid.data_inference, on=["customer_id", "article_id", "week_num"], how="left", indicator=True
)

In [29]:
# `tmp` is built with a left merge, so the indicator can never be "right_only";
# that bucket is 0 by construction and says nothing about the data.
# The informative bucket is "left_only": labelled rows with no matching inference
# candidate. Only positives are expected there. A positive that also happens to be
# an inference candidate lands in "both", so left_only may be smaller than the
# number of positive examples.
unmatched_negatives = tmp[(tmp["_merge"] == "left_only") & (tmp["label"] == 0)]
assert unmatched_negatives.empty, (
    f"{len(unmatched_negatives)} negative validation candidates are missing from data_inference"
)
tmp._merge.value_counts()

_merge
both          51341
left_only     10892
right_only        0
Name: count, dtype: int64

In [30]:
candidates_valid.data_inference.query("customer_id == 8383252499052781")

Unnamed: 0,customer_id,week_num,article_id,price,sales_channel_id,source,week,year,month,bestseller_rank
0,8383252499052781,103,751471043,0.033157,2,popularity,2020-09-09,2020,9,1.0
1,8383252499052781,103,685813001,0.016932,2,popularity,2020-09-09,2020,9,2.0
2,8383252499052781,103,919365008,0.041627,2,popularity,2020-09-09,2020,9,3.0
3,8383252499052781,103,918292001,0.040811,2,popularity,2020-09-09,2020,9,4.0
4,8383252499052781,103,751471001,0.033245,2,popularity,2020-09-09,2020,9,5.0
5,8383252499052781,103,783346001,0.02488,2,popularity,2020-09-09,2020,9,6.0
6,8383252499052781,103,759814009,0.03301,2,popularity,2020-09-09,2020,9,7.0
7,8383252499052781,103,915526001,0.032994,2,popularity,2020-09-09,2020,9,7.0
8,8383252499052781,103,915529003,0.033311,2,popularity,2020-09-09,2020,9,7.0
9,8383252499052781,103,863595006,0.0328,2,popularity,2020-09-09,2020,9,8.0


In [31]:
transactions_train.query("customer_id == 8383252499052781")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num
1532667,2020-08-19,8383252499052781,554640001,0.029644,1,100
1532668,2020-08-19,8383252499052781,735751005,0.050831,1,100
1532669,2020-08-19,8383252499052781,870434001,0.025407,1,100
1548560,2020-08-27,8383252499052781,772565018,0.010153,1,101


In [32]:
transactions_valid.query("customer_id == 8383252499052781")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num
0,2020-09-09,8383252499052781,865929007,0.016932,1,103
1,2020-09-09,8383252499052781,884319001,0.025407,1,103
2,2020-09-09,8383252499052781,884319003,0.025407,1,103


In [33]:
candidates_valid.data.query("customer_id == 8383252499052781")

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week_num,source,label,week,year,month,bestseller_rank
0,8383252499052781,865929007,0.016932,1,103,positive,1,2020-09-09,2020,9,999.0
1,8383252499052781,884319001,0.025407,1,103,positive,1,2020-09-09,2020,9,999.0
2,8383252499052781,884319003,0.025407,1,103,positive,1,2020-09-09,2020,9,999.0
11558,8383252499052781,751471043,0.033157,2,103,popularity,0,2020-09-09,2020,9,1.0
11559,8383252499052781,685813001,0.016932,2,103,popularity,0,2020-09-09,2020,9,2.0
11560,8383252499052781,919365008,0.041627,2,103,popularity,0,2020-09-09,2020,9,3.0
11561,8383252499052781,918292001,0.040811,2,103,popularity,0,2020-09-09,2020,9,4.0
11562,8383252499052781,751471001,0.033245,2,103,popularity,0,2020-09-09,2020,9,5.0
11563,8383252499052781,783346001,0.02488,2,103,popularity,0,2020-09-09,2020,9,6.0
11564,8383252499052781,759814009,0.03301,2,103,popularity,0,2020-09-09,2020,9,7.0


## Test split


In [34]:
transactions_test = load_optimized_raw_data("transactions", "test", 0.05, 42)

2025-05-29 21:41:53 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.05 42
2025-05-29 21:41:53 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.05_42.parquet


In [35]:
transactions_test.info()
print(transactions_test.shape)
print(transactions_test.customer_id.nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12032 entries, 0 to 12031
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   t_dat             12032 non-null  datetime64[ns]
 1   customer_id       12032 non-null  uint64        
 2   article_id        12032 non-null  int32         
 3   price             12032 non-null  float32       
 4   sales_channel_id  12032 non-null  int8          
 5   week_num          12032 non-null  int8          
dtypes: datetime64[ns](1), float32(1), int32(1), int8(2), uint64(1)
memory usage: 305.6 KB
(12032, 6)
3488


In [36]:
candidates_test.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59253 entries, 0 to 59252
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       59253 non-null  uint64 
 1   article_id        59253 non-null  int32  
 2   price             59253 non-null  float32
 3   sales_channel_id  59253 non-null  int8   
 4   week_num          59253 non-null  int64  
 5   source            59253 non-null  object 
 6   label             59253 non-null  int64  
 7   week              59253 non-null  object 
 8   year              59253 non-null  int32  
 9   month             59253 non-null  int32  
 10  bestseller_rank   59253 non-null  float64
dtypes: float32(1), float64(1), int32(3), int64(2), int8(1), object(2), uint64(1)
memory usage: 3.7+ MB


In [37]:
# Number of distinct customers in the labelled test candidates, then in the
# test inference candidates.
for frame in (candidates_test.data, candidates_test.data_inference):
    print(frame.customer_id.nunique())

3488
3488


In [38]:
# Rows per candidate source in the labelled test candidates, then in the
# test inference candidates.
for frame in (candidates_test.data, candidates_test.data_inference):
    print(frame.source.value_counts())

source
popularity    41589
positive      10794
repurchase     6870
Name: count, dtype: int64
source
popularity    104640
repurchase      6962
Name: count, dtype: int64


In [39]:
# Left-join the labelled test candidates onto the test inference candidates on
# (customer_id, article_id, week_num). The `_merge` indicator column marks labelled
# rows with a matching inference candidate as "both" and the rest as "left_only".
tmp_test = candidates_test.data.merge(
    candidates_test.data_inference, on=["customer_id", "article_id", "week_num"], how="left", indicator=True
)

In [40]:
# `tmp_test` is built with a left merge, so "right_only" is always 0 by construction.
# The bucket that matters is "left_only": labelled test rows with no matching
# inference candidate. Only positives are expected there, i.e. every negative
# example should also be present in candidates_test.data_inference.
unmatched_test_negatives = tmp_test[(tmp_test["_merge"] == "left_only") & (tmp_test["label"] == 0)]
assert unmatched_test_negatives.empty, (
    f"{len(unmatched_test_negatives)} negative test candidates are missing from data_inference"
)
tmp_test._merge.value_counts()

_merge
both          49219
left_only     10034
right_only        0
Name: count, dtype: int64

In [41]:
candidates_test.data_inference.head()

Unnamed: 0,customer_id,week_num,article_id,price,sales_channel_id,source,week,year,month,bestseller_rank
0,81250068469314753,104,909370001,0.032219,1,popularity,2020-09-16,2020,9,1.0
1,81250068469314753,104,865799006,0.033024,2,popularity,2020-09-16,2020,9,2.0
2,81250068469314753,104,915529003,0.033419,2,popularity,2020-09-16,2020,9,3.0
3,81250068469314753,104,918522001,0.041087,2,popularity,2020-09-16,2020,9,3.0
4,81250068469314753,104,924243001,0.040897,2,popularity,2020-09-16,2020,9,4.0


In [42]:
# Raw test transactions of the first customer listed in
# candidates_test.data_inference.head(). The expression is written like the
# other customer look-ups (space after `==`, no stray trailing tab).
transactions_test.query("customer_id == 81250068469314753")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num
0,2020-09-16,81250068469314753,564311002,0.025407,2,104
1,2020-09-16,81250068469314753,845790001,0.033881,2,104
2,2020-09-16,81250068469314753,881919002,0.025407,2,104


In [43]:
candidates_test.data.query("customer_id == 81250068469314753")

Unnamed: 0,customer_id,article_id,price,sales_channel_id,week_num,source,label,week,year,month,bestseller_rank
0,81250068469314753,564311002,0.025407,2,104,positive,1,2020-09-16,2020,9,999.0
1,81250068469314753,845790001,0.033881,2,104,positive,1,2020-09-16,2020,9,999.0
2,81250068469314753,881919002,0.025407,2,104,positive,1,2020-09-16,2020,9,999.0
10794,81250068469314753,909370001,0.032219,1,104,popularity,0,2020-09-16,2020,9,1.0
10795,81250068469314753,865799006,0.033024,2,104,popularity,0,2020-09-16,2020,9,2.0
10796,81250068469314753,915529003,0.033419,2,104,popularity,0,2020-09-16,2020,9,3.0
10797,81250068469314753,918522001,0.041087,2,104,popularity,0,2020-09-16,2020,9,3.0
10798,81250068469314753,924243001,0.040897,2,104,popularity,0,2020-09-16,2020,9,4.0
10799,81250068469314753,751471001,0.03339,2,104,popularity,0,2020-09-16,2020,9,5.0
10800,81250068469314753,762846027,0.024691,2,104,popularity,0,2020-09-16,2020,9,6.0


In [44]:
transactions_valid.query("customer_id == 81250068469314753")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num
2126,2020-09-10,81250068469314753,564311002,0.025407,2,103
2127,2020-09-10,81250068469314753,570003078,0.013542,2,103
2128,2020-09-10,81250068469314753,713380030,0.025407,2,103
2129,2020-09-10,81250068469314753,845790001,0.033881,2,103
2130,2020-09-10,81250068469314753,874167002,0.033881,2,103


: 