# 1b Article embeddings


In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell runs, so edits to project code are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import logging
import pandas as pd

# Put the parent directory on the import path so the `src` package imported
# below can be resolved (assumes the notebook is run from a folder one level
# below the project root — TODO confirm).
sys.path.append("..")

from src.utils.core_utils import setup_logging

# Configure logging for this notebook: DEBUG level, written to a dedicated log file.
file_log = "articles_embeddings.log"
root_logger = setup_logging(
    log_file=file_log,
    level=logging.DEBUG,
    remove_existing=True,
)

2025-05-31 16:06:51,493 - src.utils - INFO - Creating file handler with level: 10
2025-05-31 16:06:51,494 - src.utils - DEBUG - Logging setup complete to articles_embeddings.log


In [3]:
from src.feature_extraction import load_optimized_raw_data
from src.utils.utils_torch import get_device, set_seed
from src.features_articles import ArticleEmbeddingPipelineConfig, ArticleEmbeddingPipeline

  from .autonotebook import tqdm as notebook_tqdm


# Run pipeline


In [4]:
# Seed through the project helper, then ask it which compute device to use.
SEED = 42
set_seed(SEED)
device = get_device()

In [5]:
# Start from the project defaults, then pin the processor to the detected device.
config = ArticleEmbeddingPipelineConfig.create_default()
# The default processor config stores the device under "device_type" (visible in
# the config dump logged by setup()); writing only "device" adds a separate key
# and leaves the default entry untouched. Set both so the override takes effect
# whichever key the processor reads.
# TODO(review): confirm which key is consumed and drop the other one.
for device_key in ("device_type", "device"):
    config.config_processor[device_key] = device.type

In [6]:
# Instantiate the embedding pipeline with the config built in the previous cell.
article_embedding_pipeline = ArticleEmbeddingPipeline(config)

In [7]:
# Prepare the pipeline; the log output shows the effective config and model loading.
# setup() returns a pipeline object, so the trailing semicolon keeps its repr
# (which embeds a run-specific memory address) out of the saved cell output.
article_embedding_pipeline.setup();

2025-05-31 16:06:55,939 - src.features_articles - INFO - Setting up ArticleEmbeddingPipeline
2025-05-31 16:06:55,939 - src.features_articles - DEBUG - Config: {
  "config_processor": {
    "device_type": "mps",
    "batch_size": 32,
    "text_model_id": "distilbert-base-uncased",
    "img_model_id": "resnet18",
    "cols_text": [
      "prod_name",
      "product_type_name",
      "product_group_name",
      "graphical_appearance_name",
      "colour_group_name",
      "perceived_colour_value_name",
      "perceived_colour_master_name",
      "department_name",
      "index_name",
      "index_group_name",
      "section_name",
      "garment_group_name",
      "detail_desc"
    ],
    "device": "mps"
  },
  "subsample": 1.0,
  "seed": 42
}
2025-05-31 16:06:55,940 - src.features_articles - INFO - Using device: mps
2025-05-31 16:06:55,940 - src.features_articles - INFO - Loading text model: distilbert-base-uncased
2025-05-31 16:06:55,943 - urllib3.connectionpool - DEBUG - Starting new H

<src.features_articles.ArticleEmbeddingPipeline at 0x15f21b500>

In [8]:
# Run the pipeline and keep its result for the checks further down.
# Slow cell: in the recorded run the text-embedding pass alone took about
# 4.5 minutes (3299 batches), before image embeddings were generated.
res_embeddings = article_embedding_pipeline.run()

2025-05-31 16:06:56,897 - src.features_articles - INFO - Loading data for ArticleEmbeddingPipeline
2025-05-31 16:06:56,898 - src.feature_extraction - INFO - Loading optimized raw data from articles train 1.0 42
2025-05-31 16:06:56,898 - src.feature_extraction - DEBUG - Loading data from ../data/articles.parquet
2025-05-31 16:06:57,064 - src.features_articles - DEBUG - Loaded article data with shape: (105542, 25)
2025-05-31 16:06:57,064 - src.features_articles - INFO - Processing article embeddings
2025-05-31 16:06:57,065 - src.features_articles - INFO - Generating text embeddings
2025-05-31 16:06:57,066 - src.features_articles - INFO - Generating text embeddings
Processing text batches: 100%|██████████| 3299/3299 [04:35<00:00, 11.99it/s]
2025-05-31 16:11:32,514 - src.features_articles - INFO - Generated 105542 text embeddings
2025-05-31 16:11:32,960 - src.features_articles - INFO - Text embeddings shape: (105542, 768)
2025-05-31 16:11:33,343 - src.features_articles - INFO - Generating 

# Checks


In [9]:
# Summarise what the pipeline produced.
print(res_embeddings.text_embeddings.shape)
print(res_embeddings.image_embeddings.shape)
print("Number of articles with missing image: ", res_embeddings.image_missing.sum())
print(len(res_embeddings.id_to_index))

# Turn the eyeballed comparison above into real checks: both embedding matrices
# must have as many rows as there are entries in the id lookup (presumably one
# row per article — verify against the pipeline's result type).
n_articles = len(res_embeddings.id_to_index)
assert res_embeddings.text_embeddings.shape[0] == n_articles, (
    "text embeddings and id mapping disagree on the number of articles"
)
assert res_embeddings.image_embeddings.shape[0] == n_articles, (
    "image embeddings and id mapping disagree on the number of articles"
)

(105542, 768)
(105542, 512)
Number of articles with missing image:  442
105542


In [10]:
# Print the key of every entry whose image is flagged as missing.
# NOTE(review): the printed keys look like article ids, which suggests this
# mapping is keyed by article id despite being named index_to_id — confirm
# against the pipeline's result type.
for key, position in res_embeddings.index_to_id.items():
    if res_embeddings.image_missing[position]:
        print(key)

174057028
179208001
212042043
212042066
212629004
215324023
216961011
218354021
241602023
268305006
272591001
324963030
348657006
357792006
364409037
369423002
369423004
388916001
395864048
396135047
396135048
396690010
397376010
398947001
405310006
408875001
408875002
408875010
411565004
419634007
419634010
419634013
420264002
425683012
438702006
442786001
443262014
446224011
446224013
453065011
460363012
461414009
461414010
468666002
469039021
469658014
470985008
470985010
475791011
475827007
480076004
481777003
481797022
481797026
481797027
484864002
485673022
485678030
485678032
485973011
486639004
487926002
489758004
490275001
490473012
491912034
494030013
496762004
499802028
504152001
504152008
504960001
504960003
506177002
507399003
508931033
510415003
510504001
511653001
512300003
512964002
513701002
515074002
516550001
516898002
517724002
517729011
517729015
517729030
519219008
519815003
519987002
520363002
521272004
521393002
522725001
523518001
525111001
525460003
527210001
