In [3]:
# !pip install transformers
# !pip install sentence_transformers

In [4]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from transformers import RobertaModel, RobertaTokenizer
from sentence_transformers import SentenceTransformer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Get the current directory
current_dir = Path.cwd()
encoded_dir = current_dir.parent / "data" / "encoded"
metadata_file_path = current_dir.parent / "data" / "data_and_test_files" / "items_metadata.jsonl"
pre_path = current_dir.parent / "data" / "pre_process"

In [7]:
# Load the item_mapping
with open(pre_path / 'item_mapping.pkl', 'rb') as f:
    item_mapping = pickle.load(f)

In [8]:
df = pd.read_json(metadata_file_path, lines=True)
df.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,AMAZON FASHION,BALEAF Women's Long Sleeve Zip Beach Coverup U...,4.2,422,"[90% Polyester, 10% Spandex, Zipper closure, M...",[],31.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Women's UPF 50+ Front Zip Beach Co...,BALEAF,"[Clothing, Shoes & Jewelry, Women, Clothing, S...","{'Department': 'womens', 'Date First Available...",B09X1MRDN6,,,


# Analyzing the "title" column in the df DataFrame

In [9]:
df['title'][0]

"BALEAF Women's Long Sleeve Zip Beach Coverup UPF 50+ Sun Protection Hooded Cover Up Shirt Dress with Pockets"

In [10]:
# Checking the dtype of the "title" column
dtype_title = df['title'].dtype

# Checking the length of each title
df['title_length'] = df['title'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Calculating statistics for the length of titles
min_length = df['title_length'].min()
max_length = df['title_length'].max()
mean_length = df['title_length'].mean()
median_length = df['title_length'].median()

# Counting the number of null values
null_count = df['title'].isnull().sum()

# Summarizing the findings
summary = {
    "dtype": dtype_title,
    "min_length": min_length,
    "max_length": max_length,
    "mean_length": mean_length,
    "median_length": median_length,
    "null_count": null_count}

summary

{'dtype': dtype('O'),
 'min_length': 0,
 'max_length': 639,
 'mean_length': 82.39767757707456,
 'median_length': 80.0,
 'null_count': 0}

In [11]:
are_all_strings = df['title'].apply(lambda x: isinstance(x, str)).all()
print(are_all_strings)

True


# Create the embeddings dict using RoBerta

In [12]:
# Load the MiniLM model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [13]:
def generate_title_embeddings(df, model, device, column_to_encode="title", key_column="parent_asin"):

    i = 0 
    title_embeddings = {}
    for _, row in df.iterrows():
        
        try:
            title = row[column_to_encode]
            parent_asin = row[key_column]
            item_idx = item_mapping.get(parent_asin)  # Map parent_asin to item_idx
            
            with torch.no_grad():
                embedding = model.encode(title, convert_to_tensor=True, device=device)

                title_embeddings[item_idx] = embedding # Store the embedding
            
        except Exception as e:
            print(f"Error processing title for {parent_asin}: {e}")

        i+=1
        if i%100==0:
            print(f"User {i}")
    return title_embeddings

In [14]:
# Generate title embeddings
title_embeddings = generate_title_embeddings(df, model=model, device=device, column_to_encode="title", key_column="parent_asin")

User 100
User 200
User 300
User 400
User 500
User 600
User 700
User 800
User 900
User 1000
User 1100
User 1200
User 1300
User 1400
User 1500
User 1600
User 1700
User 1800
User 1900
User 2000
User 2100
User 2200
User 2300
User 2400
User 2500
User 2600
User 2700
User 2800
User 2900
User 3000
User 3100
User 3200
User 3300
User 3400
User 3500
User 3600
User 3700
User 3800
User 3900
User 4000
User 4100
User 4200
User 4300
User 4400
User 4500
User 4600
User 4700
User 4800
User 4900
User 5000
User 5100
User 5200
User 5300
User 5400
User 5500
User 5600
User 5700
User 5800
User 5900
User 6000
User 6100
User 6200
User 6300
User 6400
User 6500
User 6600
User 6700
User 6800
User 6900
User 7000
User 7100
User 7200
User 7300
User 7400
User 7500
User 7600
User 7700
User 7800
User 7900
User 8000
User 8100
User 8200
User 8300
User 8400
User 8500
User 8600
User 8700
User 8800
User 8900
User 9000
User 9100
User 9200
User 9300
User 9400
User 9500
User 9600
User 9700
User 9800
User 9900
User 10000
User 101

In [15]:
# Save embeddings
output_file = encoded_dir / "titles_encodings.pkl"
with open(output_file, 'wb') as f:
    pickle.dump(title_embeddings, f)

print(f"Title embeddings saved to {output_file}")

Title embeddings saved to /storage/yahlly/RecSys/data/encoded/titles_encodings.pkl


# Read the embeddings dict

In [16]:
encoded_titles_file = encoded_dir / "titles_encodings.pkl"

with open(encoded_titles_file, 'rb') as f:
    titles_embeddings = pickle.load(f)

In [17]:
titles_embeddings[147201].shape

torch.Size([384])

In [18]:
item_mapping['B09X1MRDN6']

131488

In [23]:
len(titles_embeddings)

198772