# 01 Feature preprocessing


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

sys.path.append("..")

from src.utils.core_utils import setup_logging

In [3]:
# Initialize logging
# The project helper is given DEBUG as the level and the file below as the log target.
file_log = "feature_preprocessing.log"
# NOTE(review): remove_existing=True presumably clears handlers left over from a
# previous execution of this cell so messages are not duplicated -- confirm in
# src.utils.core_utils.setup_logging.
root_logger = setup_logging(level=logging.DEBUG, log_file=file_log, remove_existing=True)

2025-06-03 16:49:09,212 - src.utils.core_utils - INFO - Creating file handler with level: 10
2025-06-03 16:49:09,212 - src.utils.core_utils - DEBUG - Logging setup complete to feature_preprocessing.log


In [4]:
from src.feature_customers import (
    CustomerStaticFeaturePipeline,
    CustomerStaticFeaturePipelineConfig,
)
from src.features_articles import (
    ArticleStaticFeaturePipeline,
    ArticleStaticFeaturePipelineConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


# Customers Static


In [5]:
# Default configuration for the static customer features. Its resolved contents
# are written to the log when the pipeline's setup() is called.
customer_feature_static_config = CustomerStaticFeaturePipelineConfig.create_default()

In [6]:
# Build the customer pipeline around the default configuration.
customer_feature_pipeline = CustomerStaticFeaturePipeline(config=customer_feature_static_config)

In [7]:
# Prepare the pipeline before running it; setup() logs the active configuration.
# NOTE(review): the name is rebound to setup()'s return value, so this relies on
# setup() returning the pipeline itself (the articles section calls setup()
# without rebinding) -- confirm in src.feature_customers.
customer_feature_pipeline = customer_feature_pipeline.setup()

2025-06-03 16:49:11,964 - src.feature_customers - INFO - Setting up CustomerFeaturePipeline with config:
2025-06-03 16:49:11,964 - src.feature_customers - DEBUG - {
  "config_processor": {
    "age_bins": [
      -Infinity,
      18,
      25,
      35,
      45,
      55,
      65,
      Infinity
    ],
    "keep_numeric_age": true,
    "missing_value_strategy": "fill_unknown",
    "missing_values_map": {
      "fn": 0,
      "active": 0,
      "club_member_status": "unknown",
      "fashion_news_frequency": "unknown",
      "postal_code": "unknown"
    },
    "encoding_strategy": "ordinal",
    "categorical_features": [
      "club_member_status",
      "fashion_news_frequency",
      "postal_code",
      "age_bin"
    ],
    "numerical_features": [
      "fn",
      "active",
      "age"
    ],
    "one_hot_features": []
  },
  "subsample": 0.05,
  "seed": 42
}


In [8]:
# Execute the customer pipeline. According to its log output it loads the sampled
# customer data, then standardizes column names, handles missing values, creates
# the age-based features and applies ordinal encoding.
results_customers = customer_feature_pipeline.run()

2025-06-03 16:49:11,982 - src.feature_customers - INFO - Loading raw customer data for CustomerFeaturePipeline
2025-06-03 16:49:11,982 - src.feature_extraction - INFO - Loading optimized raw data from customers train 0.05 42
2025-06-03 16:49:11,983 - src.feature_extraction - DEBUG - Loading data from ../data/customers_sample_0.05_42.parquet
2025-06-03 16:49:12,106 - src.feature_customers - DEBUG - Loaded raw customer data with shape: (68599, 7)
2025-06-03 16:49:12,107 - src.feature_customers - INFO - Processing customer data
2025-06-03 16:49:12,108 - src.feature_customers - DEBUG - Standardizing column names
2025-06-03 16:49:12,109 - src.feature_customers - DEBUG - Handling missing values
2025-06-03 16:49:12,118 - src.feature_customers - DEBUG - Creating demographic features
2025-06-03 16:49:12,119 - src.feature_customers - INFO - Processing age feature
2025-06-03 16:49:12,147 - src.feature_customers - DEBUG - Applying ordinal encoding
2025-06-03 16:49:12,281 - src.feature_customers - 

In [9]:
# Inspect column dtypes, non-null counts and memory usage of the processed customer table.
results_customers.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68599 entries, 0 to 68598
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             68599 non-null  uint64 
 1   fn                      68599 non-null  int8   
 2   active                  68599 non-null  int8   
 3   club_member_status      68599 non-null  float64
 4   fashion_news_frequency  68599 non-null  float64
 5   age                     68599 non-null  float64
 6   postal_code             68599 non-null  float64
 7   age_bin                 68599 non-null  float64
dtypes: float64(5), int8(2), uint64(1)
memory usage: 3.3 MB


In [10]:
# Show the column names the pipeline result reports as features.
# NOTE(review): the returned list also contains the `customer_id` key -- confirm
# that downstream consumers treat it as an identifier rather than a model input.
results_customers.get_feature_list()

['fn',
 'active',
 'age',
 'club_member_status',
 'fashion_news_frequency',
 'postal_code',
 'age_bin',
 'customer_id']

In [11]:
# Per-column overview of the processed customer table: the number of distinct
# values, followed by the ten most frequent values. Columns whose name starts
# with "customer_" (the identifier) are left out of the summary.
for col in results_customers.data.columns:
    if not col.startswith("customer_"):
        column_values = results_customers.data[col]
        print(f"{col}: {column_values.nunique()} unique values")
        display(column_values.value_counts().head(10))

fn: 2 unique values


fn
0    44700
1    23899
Name: count, dtype: int64

active: 2 unique values


active
0    45352
1    23247
Name: count, dtype: int64

club_member_status: 4 unique values


club_member_status
0.0    63599
2.0     4669
3.0      311
1.0       20
Name: count, dtype: int64

fashion_news_frequency: 4 unique values


fashion_news_frequency
1.0    43870
2.0    23912
3.0      773
0.0       44
Name: count, dtype: int64

age: 78 unique values


age
21.0    3341
24.0    2878
25.0    2760
20.0    2753
23.0    2678
26.0    2660
22.0    2533
27.0    2441
28.0    2234
32.0    2134
Name: count, dtype: int64

postal_code: 55304 unique values


postal_code
9595.0     5960
44144.0      12
11060.0      11
6805.0       11
34995.0      10
20638.0       9
9290.0        9
4896.0        9
16321.0       8
37963.0       8
Name: count, dtype: int64

age_bin: 8 unique values


age_bin
1.0    18888
2.0    17793
4.0    12803
3.0     8482
5.0     6265
6.0     1876
0.0     1732
7.0      760
Name: count, dtype: int64

# Articles Static


In [12]:
# Default configuration for the static article features. Its resolved contents
# are written to the log when the pipeline's setup() is called.
article_feature_static_config = ArticleStaticFeaturePipelineConfig.create_default()

In [13]:
# Create pipeline with default configuration
# (the config is passed positionally here, whereas the customer pipeline
# receives it through the `config=` keyword).
article_pipeline = ArticleStaticFeaturePipeline(article_feature_static_config)

In [14]:
# Setup and run
# setup() logs the active configuration; its return value is not used here.
article_pipeline.setup()
# run() produces the result object whose `.data` frame and feature list are
# inspected in the following cells.
results_articles = article_pipeline.run()

2025-06-03 16:49:12,443 - src.features_articles - INFO - Setting up ArticleFeaturePipeline with config:
2025-06-03 16:49:12,444 - src.features_articles - DEBUG - {
  "config_processor": {
    "encoding_strategy": "ordinal",
    "categorical_features": [
      "product_type_no",
      "graphical_appearance_no",
      "colour_group_code",
      "perceived_colour_value_id",
      "perceived_colour_master_id",
      "department_no",
      "index_code",
      "index_group_no",
      "section_no",
      "garment_group_no"
    ],
    "numerical_features": [],
    "one_hot_features": []
  },
  "subsample": 0.05,
  "seed": 42
}
2025-06-03 16:49:12,444 - src.features_articles - INFO - Loading raw article data for ArticleFeaturePipeline
2025-06-03 16:49:12,445 - src.feature_extraction - INFO - Loading optimized raw data from articles train 0.05 42
2025-06-03 16:49:12,445 - src.feature_extraction - DEBUG - Loading data from ../data/articles_sample_0.05_42.parquet
2025-06-03 16:49:12,493 - src.feat

In [15]:
# Inspect column dtypes and non-null counts of the processed article table.
results_articles.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77382 entries, 0 to 77381
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   article_id                    77382 non-null  int32  
 1   product_code                  77382 non-null  int32  
 2   prod_name                     77382 non-null  object 
 3   product_type_no               77382 non-null  float64
 4   product_type_name             77382 non-null  object 
 5   product_group_name            77382 non-null  object 
 6   graphical_appearance_no       77382 non-null  float64
 7   graphical_appearance_name     77382 non-null  object 
 8   colour_group_code             77382 non-null  float64
 9   colour_group_name             77382 non-null  object 
 10  perceived_colour_value_id     77382 non-null  float64
 11  perceived_colour_value_name   77382 non-null  object 
 12  perceived_colour_master_id    77382 non-null  float64
 13  p

In [16]:
# Show the column names the pipeline result reports as features.
# NOTE(review): the returned list also contains the `article_id` key -- confirm
# that downstream consumers treat it as an identifier rather than a model input.
results_articles.get_feature_list()

['product_type_no',
 'graphical_appearance_no',
 'colour_group_code',
 'perceived_colour_value_id',
 'perceived_colour_master_id',
 'department_no',
 'index_code',
 'index_group_no',
 'section_no',
 'garment_group_no',
 'article_id']