# 1a Articles dynamic features


In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

# Put the parent directory on the import path so the `src` package imported below resolves.
# NOTE(review): assumes the notebook is run from a directory one level below the project root.
sys.path.append("..")

from src.core_utils import setup_logging

In [3]:
# Initialize logging
# Uses the project helper at DEBUG level and passes the file name below as `log_file`.
# NOTE(review): `remove_existing=True` presumably drops handlers left over from a previous
# run of this cell so log lines are not duplicated — confirm in src.core_utils.
file_log = "articles_dynamic_features.log"
root_logger = setup_logging(level=logging.DEBUG, log_file=file_log, remove_existing=True)

<RootLogger root (DEBUG)>

In [4]:
from src.features_articles import ArticleDynamicFeaturePipeline, ArticleDynamicFeaturePipelineConfig
from src.features_articles import load_optimized_raw_data

# Run pipeline


In [5]:
# Build the pipeline configuration from the project defaults. The resolved values
# (feature lists, week window, subsample, seed) appear in the DEBUG "Config:" log of setup().
articles_dynamic_features_config = ArticleDynamicFeaturePipelineConfig.create_default()

In [6]:
# Instantiate the article dynamic-feature pipeline with the default configuration created above.
articles_dynamic_feature_pipeline = ArticleDynamicFeaturePipeline(articles_dynamic_features_config)

In [7]:
# Prepare the pipeline; this logs the resolved config at DEBUG level.
# The cell output shows a pipeline repr, i.e. setup() returns a pipeline object.
articles_dynamic_feature_pipeline.setup()

2025-05-30 11:22:51 - src.features_articles - INFO - Setting up ArticleDynamicFeaturePipeline
2025-05-30 11:22:51 - src.features_articles - DEBUG - Config: {
  "config_processor": {
    "encoding_strategy": "ordinal",
    "categorical_features": [],
    "numerical_features": [
      "weekly_sales_count",
      "weekly_avg_price",
      "cumulative_mean_age",
      "cumulative_sales_count"
    ],
    "one_hot_features": [],
    "start_week_num": 52,
    "end_week_num": 104
  },
  "subsample": 0.05,
  "seed": 42
}


<src.features_articles.ArticleDynamicFeaturePipeline at 0x13e8265d0>

In [8]:
# Execute the pipeline. Per the log it loads the sampled transactions, articles and customers
# parquet files and then builds the article x week feature table.
# The feature DataFrame is read from the result's `.data` attribute in the cells below.
results_articles_dynamic = articles_dynamic_feature_pipeline.run()

2025-05-30 11:22:51 - src.features_articles - INFO - Loading data for ArticleDynamicFeaturePipeline
2025-05-30 11:22:51 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.05 42
2025-05-30 11:22:51 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.05_42.parquet
2025-05-30 11:22:52 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.05 42
2025-05-30 11:22:52 - src.feature_extraction - DEBUG - Loading data from ../data/articles_sample_0.05_42.parquet
2025-05-30 11:22:52 - src.feature_extraction - INFO - Loading optimized raw data from customers train 0.05 42
2025-05-30 11:22:52 - src.feature_extraction - DEBUG - Loading data from ../data/customers_sample_0.05_42.parquet
2025-05-30 11:22:52 - src.features_articles - INFO - Processing article dynamic features
2025-05-30 11:22:52 - src.features_articles - INFO - Generating a cross join of articles and weeks from 52 to 104
2025-05-30 11:2

# Debug


In [9]:
# Sanity-check the pipeline output: show the basics, then fail loudly if an invariant breaks
# instead of relying on someone eyeballing the printed numbers.
articles_dynamic_df = results_articles_dynamic.data

print(articles_dynamic_df.shape)
print(articles_dynamic_df.columns)
null_counts = articles_dynamic_df.isnull().sum()
print(null_counts)
display(articles_dynamic_df.head())

# No feature column may contain missing values.
assert null_counts.sum() == 0, f"unexpected nulls in dynamic features:\n{null_counts[null_counts > 0]}"

# The pipeline logs that it builds a cross join of articles and weeks, so the table must
# contain exactly one row for every (article_id, week_num) combination.
n_articles = articles_dynamic_df["article_id"].nunique()
n_weeks = articles_dynamic_df["week_num"].nunique()
assert len(articles_dynamic_df) == n_articles * n_weeks, (
    f"expected {n_articles} articles x {n_weeks} weeks = {n_articles * n_weeks} rows, "
    f"got {len(articles_dynamic_df)}"
)
assert not articles_dynamic_df.duplicated(["article_id", "week_num"]).any(), (
    "duplicate (article_id, week_num) rows in dynamic features"
)

(4101246, 6)
Index(['article_id', 'week_num', 'weekly_sales_count', 'weekly_avg_price',
       'cumulative_mean_age', 'cumulative_sales_count'],
      dtype='object')
article_id                0
week_num                  0
weekly_sales_count        0
weekly_avg_price          0
cumulative_mean_age       0
cumulative_sales_count    0
dtype: int64


Unnamed: 0,article_id,week_num,weekly_sales_count,weekly_avg_price,cumulative_mean_age,cumulative_sales_count
0,108775015,52,0.0,0.0,32.0,0.0
1,108775015,53,0.0,0.0,32.0,0.0
2,108775015,54,0.0,0.0,32.0,0.0
3,108775015,55,0.0,0.0,32.0,0.0
4,108775015,56,0.0,0.0,32.0,0.0


In [10]:
# Load raw data
# Sampling parameters are defined once so the five loads below cannot drift apart.
# They mirror the "subsample" / "seed" values shown in the pipeline's logged config;
# keep them in sync if that config changes.
RAW_SAMPLE_KWARGS = {"subsample": 0.05, "seed": 42}

articles = load_optimized_raw_data(data_type="articles", **RAW_SAMPLE_KWARGS)
transactions_train = load_optimized_raw_data(data_type="transactions", sample="train", **RAW_SAMPLE_KWARGS)
transactions_valid = load_optimized_raw_data(data_type="transactions", sample="valid", **RAW_SAMPLE_KWARGS)
transactions_test = load_optimized_raw_data(data_type="transactions", sample="test", **RAW_SAMPLE_KWARGS)
customers = load_optimized_raw_data(data_type="customers", **RAW_SAMPLE_KWARGS)

# Stack the three splits into one frame with a fresh 0..n-1 index for the checks below.
transactions = pd.concat([transactions_train, transactions_valid, transactions_test], axis=0, ignore_index=True)

2025-05-30 11:22:53 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.05 42
2025-05-30 11:22:53 - src.feature_extraction - DEBUG - Loading data from ../data/articles_sample_0.05_42.parquet
2025-05-30 11:22:53 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.05 42
2025-05-30 11:22:53 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.05_42.parquet
2025-05-30 11:22:53 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.05 42
2025-05-30 11:22:53 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.05_42.parquet
2025-05-30 11:22:53 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.05 42
2025-05-30 11:22:53 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.05_42.parquet
2025-05-30 11:22:53 - src.feature_extraction - INFO - Loading optimized 

In [11]:
# Cumulative stats are computed over a fixed week window, and the pipeline cross-joins every
# article with every week in that window. An article therefore gets a row even for weeks in
# which it has no transactions; in the output below those rows show weekly_sales_count == 0.
# 29 rows are requested so the view extends well past the first weeks of the window
# (weeks 52-80, assuming the rows of the first article are ordered by week_num as shown).
# NOTE(review): cumulative_sales_count is 0 at week 52 although the transactions queried in
# the next cell include sales of this article in earlier weeks — the cumulative stats appear
# to start at the window start rather than at week 0; confirm this is intended.
results_articles_dynamic.data.head(29)

Unnamed: 0,article_id,week_num,weekly_sales_count,weekly_avg_price,cumulative_mean_age,cumulative_sales_count
0,108775015,52,0.0,0.0,32.0,0.0
1,108775015,53,0.0,0.0,32.0,0.0
2,108775015,54,0.0,0.0,32.0,0.0
3,108775015,55,0.0,0.0,32.0,0.0
4,108775015,56,0.0,0.0,32.0,0.0
5,108775015,57,0.0,0.0,32.0,0.0
6,108775015,58,0.0,0.0,32.0,0.0
7,108775015,59,0.0,0.0,32.0,0.0
8,108775015,60,0.0,0.0,32.0,0.0
9,108775015,61,0.0,0.0,32.0,0.0


In [12]:
# Raw transactions (all splits) for article 108775015, the article shown in the feature-table
# preview above, to cross-check its weekly features against the underlying sales.
transactions.query("article_id == 108775015")

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week_num
1105,2018-09-20,9697567767184863887,108775015,0.008458,1,0
1106,2018-09-20,9697567767184863887,108775015,0.008458,1,0
4515,2018-09-21,16660116983016697586,108775015,0.007712,1,0
5272,2018-09-22,10033972236183063637,108775015,0.008458,1,0
9642,2018-09-24,11035824322183416948,108775015,0.007610,1,0
...,...,...,...,...,...,...
734830,2019-07-26,13363514939111444301,108775015,0.006797,1,44
993662,2019-12-02,1118797876156724048,108775015,0.008458,2,62
1005495,2019-12-10,2601062256681639780,108775015,0.008458,1,63
1042695,2019-12-30,2762924158672000374,108775015,0.004390,1,66


In [13]:
# Number of distinct articles in the feature table; compare with the article catalogue
# row count displayed in the next cell.
results_articles_dynamic.data.article_id.nunique()

77382

In [14]:
# (rows, columns) of the sampled article catalogue; the row count is the reference value for
# the distinct-article counts in the neighbouring cells.
articles.shape

(77382, 25)

In [15]:
# Week range covered by the combined train/valid/test transactions; the feature window logged
# by the pipeline (weeks 52 to 104) should lie inside this range.
transactions.week_num.min(), transactions.week_num.max()

(0, 104)

In [16]:
# Distinct articles that occur in the combined transactions; compare with the catalogue row
# count above to see whether any sampled article has no transaction at all.
transactions.article_id.nunique()

77382

: 