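# 01 Feature preprocessing

Runs the static feature pipelines for customers and articles with their default configurations, then inspects the resulting feature tables.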


In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

# Make the parent directory importable so the `src` package imports below
# resolve (assumes the notebook is run from a directory one level below the
# project root — confirm). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate ".." entry each time.
if ".." not in sys.path:
    sys.path.append("..")

from src.utils.core_utils import setup_logging

In [3]:
# Configure project logging at DEBUG level; records are also written to the
# log file named below (the recorded output shows a file handler being created).
file_log = "feature_preprocessing.log"
root_logger = setup_logging(
    level=logging.DEBUG,
    log_file=file_log,
    # NOTE(review): presumably drops previously attached handlers/log state so
    # re-runs start clean — confirm in src.utils.core_utils.
    remove_existing=True,
)

2025-06-05 12:20:40,333 - src.utils.core_utils - INFO - Creating file handler with level: 10
2025-06-05 12:20:40,334 - src.utils.core_utils - DEBUG - Logging setup complete to feature_preprocessing.log


In [4]:
from src.feature_customers import (
    CustomerStaticFeaturePipeline,
    CustomerStaticFeaturePipelineConfig,
)
from src.features_articles import (
    ArticleStaticFeaturePipeline,
    ArticleStaticFeaturePipelineConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


# Customer static features


In [5]:
# Start from the project's default customer configuration; its contents are
# logged at DEBUG level when the pipeline's setup() runs.
customer_feature_static_config = CustomerStaticFeaturePipelineConfig.create_default()

In [6]:
# Instantiate the customer static-feature pipeline with the default config.
customer_feature_pipeline = CustomerStaticFeaturePipeline(config=customer_feature_static_config)

In [7]:
# setup() logs the effective configuration (see the recorded output).
# NOTE(review): rebinding the name assumes setup() returns the pipeline itself;
# the later run() call on this name succeeds in the recorded run, which is
# consistent with that — confirm against the class definition.
customer_feature_pipeline = customer_feature_pipeline.setup()

2025-06-05 12:20:50,072 - src.feature_customers - INFO - Setting up CustomerFeaturePipeline with config:
2025-06-05 12:20:50,073 - src.feature_customers - DEBUG - {
  "config_processor": {
    "age_bins": [
      -Infinity,
      18,
      25,
      35,
      45,
      55,
      65,
      Infinity
    ],
    "keep_numeric_age": true,
    "missing_value_strategy": "fill_unknown",
    "missing_values_map": {
      "fn": 0,
      "active": 0,
      "club_member_status": "unknown",
      "fashion_news_frequency": "unknown",
      "postal_code": "unknown"
    },
    "encoding_strategy": "ordinal",
    "categorical_features": [
      "club_member_status",
      "fashion_news_frequency",
      "postal_code",
      "age_bin"
    ],
    "numerical_features": [
      "fn",
      "active",
      "age"
    ],
    "one_hot_features": []
  },
  "subsample": 0.25,
  "seed": 42
}


In [8]:
# Execute the customer pipeline and keep its result object for inspection.
# Per the recorded log, the run loads the customer sample parquet, standardizes
# column names, handles missing values, creates demographic/age features and
# applies ordinal encoding.
results_customers = customer_feature_pipeline.run()

2025-06-05 12:20:50,093 - src.feature_customers - INFO - Loading raw customer data for CustomerFeaturePipeline
2025-06-05 12:20:50,094 - src.feature_extraction - INFO - Loading optimized raw data from customers train 0.25 42
2025-06-05 12:20:50,094 - src.feature_extraction - DEBUG - Loading data from ../data/customers_sample_0.25_42.parquet
2025-06-05 12:20:50,434 - src.feature_customers - DEBUG - Loaded raw customer data with shape: (342995, 7)
2025-06-05 12:20:50,435 - src.feature_customers - INFO - Processing customer data
2025-06-05 12:20:50,441 - src.feature_customers - DEBUG - Standardizing column names
2025-06-05 12:20:50,442 - src.feature_customers - DEBUG - Handling missing values
2025-06-05 12:20:50,489 - src.feature_customers - DEBUG - Creating demographic features
2025-06-05 12:20:50,489 - src.feature_customers - INFO - Processing age feature
2025-06-05 12:20:50,619 - src.feature_customers - DEBUG - Applying ordinal encoding
2025-06-05 12:20:51,343 - src.feature_customers -

In [9]:
# Check column dtypes, non-null counts and memory usage of the processed
# customer frame. In the recorded run the ordinal-encoded categorical columns
# come back as float64.
results_customers.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342995 entries, 0 to 342994
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   customer_id             342995 non-null  uint64 
 1   fn                      342995 non-null  int8   
 2   active                  342995 non-null  int8   
 3   club_member_status      342995 non-null  float64
 4   fashion_news_frequency  342995 non-null  float64
 5   age                     342995 non-null  float64
 6   postal_code             342995 non-null  float64
 7   age_bin                 342995 non-null  float64
dtypes: float64(5), int8(2), uint64(1)
memory usage: 16.4 MB


In [10]:
# Show the column names the results object reports as features (the recorded
# output lists customer_id alongside the model features).
results_customers.get_feature_list()

['fn',
 'active',
 'age',
 'club_member_status',
 'fashion_news_frequency',
 'postal_code',
 'age_bin',
 'customer_id']

In [11]:
# For each non-identifier column, report its cardinality and its ten most
# frequent values.
customers_frame = results_customers.data
for column_name in customers_frame.columns:
    # Columns prefixed with "customer_" are skipped.
    if not column_name.startswith("customer_"):
        column_values = customers_frame[column_name]
        print(f"{column_name}: {column_values.nunique()} unique values")
        display(column_values.value_counts().head(10))

fn: 2 unique values


fn
0    223537
1    119458
Name: count, dtype: int64

active: 2 unique values


active
0    226704
1    116291
Name: count, dtype: int64

club_member_status: 4 unique values


club_member_status
0.0    318245
2.0     23168
3.0      1467
1.0       115
Name: count, dtype: int64

fashion_news_frequency: 4 unique values


fashion_news_frequency
1.0    219191
2.0    119587
3.0      4002
0.0       215
Name: count, dtype: int64

age: 82 unique values


age
21.0    16986
24.0    14062
20.0    13830
23.0    13715
25.0    13710
26.0    13413
22.0    13015
27.0    12137
28.0    11168
32.0    10955
Name: count, dtype: int64

postal_code: 192045 unique values


postal_code
33158.0     30104
153488.0       71
118329.0       46
57861.0        45
85019.0        41
23533.0        38
68623.0        38
121515.0       36
173957.0       35
93331.0        35
Name: count, dtype: int64

age_bin: 8 unique values


age_bin
1.0    94708
2.0    89587
4.0    64347
3.0    41937
5.0    30949
6.0     9214
0.0     8398
7.0     3855
Name: count, dtype: int64

# Article static features


In [12]:
# Start from the project's default article configuration; its contents are
# logged at DEBUG level when the pipeline's setup() runs.
article_feature_static_config = ArticleStaticFeaturePipelineConfig.create_default()

In [13]:
# Create pipeline with default configuration
article_pipeline = ArticleStaticFeaturePipeline(article_feature_static_config)

In [14]:
# Setup and run
article_pipeline.setup()
results_articles = article_pipeline.run()

2025-06-05 12:20:51,693 - src.features_articles - INFO - Setting up ArticleFeaturePipeline with config:
2025-06-05 12:20:51,694 - src.features_articles - DEBUG - {
  "config_processor": {
    "encoding_strategy": "ordinal",
    "categorical_features": [
      "product_type_no",
      "graphical_appearance_no",
      "colour_group_code",
      "perceived_colour_value_id",
      "perceived_colour_master_id",
      "department_no",
      "index_code",
      "index_group_no",
      "section_no",
      "garment_group_no"
    ],
    "numerical_features": [],
    "one_hot_features": []
  },
  "subsample": 0.25,
  "seed": 42
}
2025-06-05 12:20:51,695 - src.features_articles - INFO - Loading raw article data for ArticleFeaturePipeline
2025-06-05 12:20:51,695 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.25 42
2025-06-05 12:20:51,695 - src.feature_extraction - DEBUG - Loading data from ../data/articles_sample_0.25_42.parquet
2025-06-05 12:20:51,754 - src.feat

In [15]:
# Check column dtypes and non-null counts of the processed article frame. In
# the recorded run the encoded *_no / *_id / *_code columns are float64 while
# the descriptive *_name columns remain object.
results_articles.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95295 entries, 0 to 95294
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   article_id                    95295 non-null  int32  
 1   product_code                  95295 non-null  int32  
 2   prod_name                     95295 non-null  object 
 3   product_type_no               95295 non-null  float64
 4   product_type_name             95295 non-null  object 
 5   product_group_name            95295 non-null  object 
 6   graphical_appearance_no       95295 non-null  float64
 7   graphical_appearance_name     95295 non-null  object 
 8   colour_group_code             95295 non-null  float64
 9   colour_group_name             95295 non-null  object 
 10  perceived_colour_value_id     95295 non-null  float64
 11  perceived_colour_value_name   95295 non-null  object 
 12  perceived_colour_master_id    95295 non-null  float64
 13  p

In [16]:
# Show the column names the results object reports as features (the recorded
# output lists article_id alongside the encoded categorical features).
results_articles.get_feature_list()

['product_type_no',
 'graphical_appearance_no',
 'colour_group_code',
 'perceived_colour_value_id',
 'perceived_colour_master_id',
 'department_no',
 'index_code',
 'index_group_no',
 'section_no',
 'garment_group_no',
 'article_id']