In [1]:
# --- Imports ---
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm

In [2]:
# --- Load Data ---
# Feature-engineered sales table; the path is relative to the notebook's directory.
featured_csv_path = '../data/processed/ecommerce_sales_featured.csv'
df = pd.read_csv(featured_csv_path)

In [4]:
# --- Load DistilBERT Model & Tokenizer ---
# A single checkpoint id feeds both loaders, so the tokenizer and the encoder
# are always taken from the same pretrained release.
model_name = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Ensure model runs on GPU if available
# Pick the device name first, then build the torch.device from it.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# Left as the cell's last expression, so the notebook displays the model repr.
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [6]:
# --- Encode Product Names into Embeddings ---
def get_embedding(text, max_length=16):
    """Return the first-token (CLS) embedding of ``text`` as a flat NumPy array.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    text : str
        Text to encode.
    max_length : int, default 16
        Token budget handed to the tokenizer; with ``truncation=True`` longer
        inputs are cut to this length. The default is the value this notebook
        has always used, so existing calls behave exactly as before.

    Returns
    -------
    numpy.ndarray
        ``last_hidden_state`` at sequence position 0, moved to the CPU and
        flattened to one dimension.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Move every input tensor onto the target device before the forward pass.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    # CLS token representation
    return outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

In [7]:
# Apply to all product names
# NOTE(review): str() turns a missing name (NaN) into the literal text "nan",
# which is then embedded like any other word; confirm that is acceptable.
embeddings = []
_embedding_by_name = {}  # each distinct name goes through the model only once
for name in tqdm(df['product_name'], desc="Encoding product names"):
    key = str(name)
    if key not in _embedding_by_name:
        _embedding_by_name[key] = get_embedding(key)
    embeddings.append(_embedding_by_name[key])

# Convert to DataFrame
# Reuse df's index: the column-wise concat below aligns on index labels, so a
# fresh 0..n-1 index would mis-pair rows whenever df's index is not the default.
emb_df = pd.DataFrame(embeddings, index=df.index)
# Name the columns from the built frame's width, so an empty df does not raise
# on embeddings[0].
emb_df.columns = [f'emb_{i}' for i in range(emb_df.shape[1])]

# Combine with original df (drop product_name to avoid leakage)
df_combined = pd.concat([df.drop(columns=['product_name']), emb_df], axis=1)
assert len(df_combined) == len(df), "embedding rows must line up one-to-one with df"

# --- Save Combined Features ---
df_combined.to_csv('../data/processed/ecommerce_sales_with_embeddings.csv', index=False)

Encoding product names:   0%|          | 0/1000 [00:00<?, ?it/s]