# 1a Articles dynamic features


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging

In [3]:
# Initialize logging
file_log = "articles_dynamic_features.log"
root_logger = setup_logging(level=logging.DEBUG, log_file=file_log, remove_existing=True)

2025-06-05 12:21:11,682 - src.utils.core_utils - INFO - Creating file handler with level: 10
2025-06-05 12:21:11,682 - src.utils.core_utils - DEBUG - Logging setup complete to articles_dynamic_features.log


In [4]:
from src.features_articles import ArticleDynamicFeaturePipeline, ArticleDynamicFeaturePipelineConfig
from src.features_articles import load_optimized_raw_data

  from .autonotebook import tqdm as notebook_tqdm


# Run pipeline


In [5]:
articles_dynamic_features_config = ArticleDynamicFeaturePipelineConfig.create_default()

In [6]:
articles_dynamic_feature_pipeline = ArticleDynamicFeaturePipeline(articles_dynamic_features_config)

In [7]:
articles_dynamic_feature_pipeline.setup()

2025-06-05 12:21:14,310 - src.features_articles - INFO - Setting up ArticleDynamicFeaturePipeline
2025-06-05 12:21:14,311 - src.features_articles - DEBUG - Config: {
  "config_processor": {
    "encoding_strategy": "ordinal",
    "categorical_features": [],
    "numerical_features": [
      "weekly_sales_count",
      "weekly_avg_price",
      "cumulative_mean_age",
      "cumulative_sales_count"
    ],
    "one_hot_features": [],
    "start_week_num": 92,
    "end_week_num": 104,
    "history_start_week_num": 52,
    "history_end_week_num": 104
  },
  "subsample": 0.25,
  "seed": 42
}


<src.features_articles.ArticleDynamicFeaturePipeline at 0x15b302720>

In [8]:
results_articles_dynamic = articles_dynamic_feature_pipeline.run()

2025-06-05 12:21:14,333 - src.features_articles - INFO - Loading data for ArticleDynamicFeaturePipeline
2025-06-05 12:21:14,334 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.25 42
2025-06-05 12:21:14,334 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.25_42.parquet
2025-06-05 12:21:14,535 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.25 42
2025-06-05 12:21:14,535 - src.feature_extraction - DEBUG - Loading data from ../data/articles_sample_0.25_42.parquet
2025-06-05 12:21:14,582 - src.feature_extraction - INFO - Loading optimized raw data from customers train 0.25 42
2025-06-05 12:21:14,582 - src.feature_extraction - DEBUG - Loading data from ../data/customers_sample_0.25_42.parquet
2025-06-05 12:21:14,658 - src.features_articles - INFO - Processing article dynamic features
2025-06-05 12:21:14,821 - src.features_articles - INFO - Generating a cross join of articles and 

# Checks


In [9]:
from src.utils.data_checks import test_feature_article_dynamic_feature_pipeline, test_feature_article_dynamic_feature

2025-06-05 12:21:17,536 - matplotlib - DEBUG - matplotlib data path: /Users/sharonshiao/Dropbox/Machine_learning/projects/fahsion_recommendation/venv/lib/python3.12/site-packages/matplotlib/mpl-data
2025-06-05 12:21:17,541 - matplotlib - DEBUG - CONFIGDIR=/Users/sharonshiao/.matplotlib
2025-06-05 12:21:17,580 - matplotlib - DEBUG - interactive is False
2025-06-05 12:21:17,581 - matplotlib - DEBUG - platform is darwin
2025-06-05 12:21:17,708 - matplotlib - DEBUG - CACHEDIR=/Users/sharonshiao/.matplotlib
2025-06-05 12:21:17,715 - matplotlib.font_manager - DEBUG - Using fontManager instance from /Users/sharonshiao/.matplotlib/fontlist-v390.json


In [10]:
print(results_articles_dynamic.data.shape)
print(results_articles_dynamic.data.columns)
print(results_articles_dynamic.data.isnull().sum())
display(results_articles_dynamic.data.head())

(1238835, 6)
Index(['article_id', 'week_num', 'weekly_sales_count', 'weekly_avg_price',
       'cumulative_mean_age', 'cumulative_sales_count'],
      dtype='object')
article_id                0
week_num                  0
weekly_sales_count        0
weekly_avg_price          0
cumulative_mean_age       0
cumulative_sales_count    0
dtype: int64


Unnamed: 0,article_id,week_num,weekly_sales_count,weekly_avg_price,cumulative_mean_age,cumulative_sales_count
0,108775015,92,0.0,0.024174,32.0,0.0
1,108775015,93,0.0,0.024904,32.0,0.0
2,108775015,94,0.0,0.024781,32.0,0.0
3,108775015,95,0.0,0.024747,32.0,0.0
4,108775015,96,0.0,0.024882,32.0,0.0


In [11]:
# Load raw data
subsample = articles_dynamic_features_config.subsample
seed = articles_dynamic_features_config.seed
articles = load_optimized_raw_data(data_type="articles", subsample=subsample, seed=seed)
transactions_train = load_optimized_raw_data(data_type="transactions", sample="train", subsample=subsample, seed=seed)
transactions_valid = load_optimized_raw_data(data_type="transactions", sample="valid", subsample=subsample, seed=seed)
transactions_test = load_optimized_raw_data(data_type="transactions", sample="test", subsample=subsample, seed=seed)
customers = load_optimized_raw_data(data_type="customers", subsample=subsample, seed=seed)
transactions = pd.concat([transactions_train, transactions_valid, transactions_test], axis=0, ignore_index=True)
del transactions_train, transactions_valid, transactions_test

2025-06-05 12:21:18,336 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.25 42
2025-06-05 12:21:18,337 - src.feature_extraction - DEBUG - Loading data from ../data/articles_sample_0.25_42.parquet
2025-06-05 12:21:18,382 - src.feature_extraction - INFO - Loading optimized raw data from transactions train 0.25 42
2025-06-05 12:21:18,383 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_train_sample_0.25_42.parquet
2025-06-05 12:21:18,460 - src.feature_extraction - INFO - Loading optimized raw data from transactions valid 0.25 42
2025-06-05 12:21:18,461 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_valid_sample_0.25_42.parquet
2025-06-05 12:21:18,470 - src.feature_extraction - INFO - Loading optimized raw data from transactions test 0.25 42
2025-06-05 12:21:18,470 - src.feature_extraction - DEBUG - Loading data from ../data/transactions_test_sample_0.25_42.parquet
2025-06-05 12:21:18,486 - src.feature_ex

In [12]:
test_feature_article_dynamic_feature_pipeline(
    results_articles_dynamic=results_articles_dynamic,
    articles=articles,
    pipeline_config=articles_dynamic_features_config,
)

Metadata checks passed


In [13]:
test_feature_article_dynamic_feature(
    results_articles_dynamic=results_articles_dynamic,
    transactions=transactions,
    customers=customers,
    pipeline_config=articles_dynamic_features_config,
    article_ids=results_articles_dynamic.data.article_id.unique()[:5],
)

Article ID 108775015
--------------------------------------------------------------------------------
Article ID 108775044
Week number 92
Numbers matched for week 92
Week number 93
Numbers matched for week 93
Week number 94
Numbers matched for week 94
Week number 98
Numbers matched for week 98
Week number 99
Numbers matched for week 99
Week number 100
Numbers matched for week 100
Week number 101
Numbers matched for week 101
--------------------------------------------------------------------------------
Article ID 108775051
--------------------------------------------------------------------------------
Article ID 110065001
Week number 92
Numbers matched for week 92
Week number 93
Numbers matched for week 93
--------------------------------------------------------------------------------
Article ID 110065002
Week number 94
Numbers matched for week 94
Week number 95
Numbers matched for week 95
--------------------------------------------------------------------------------
