In [1]:
!pip install transformers
!pip install sentence_transformers

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m844.7 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.49.0-py3-none-any.w

In [2]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from transformers import RobertaModel, RobertaTokenizer
from sentence_transformers import SentenceTransformer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [17]:
# Get the current directory
current_dir = Path.cwd()
encoded_dir = current_dir.parent / "data" / "encoded"
encoded_dir_per_feature = encoded_dir / "per_feature"
metadata_file_path = current_dir.parent / "data" / "data_and_test_files" / "items_metadata.jsonl"
pre_path = current_dir.parent / "data" / "pre_process"

In [5]:
# Load the item_mapping
with open(pre_path / 'item_mapping.pkl', 'rb') as f:
    item_mapping = pickle.load(f)

In [6]:
df = pd.read_json(metadata_file_path, lines=True)
df.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,AMAZON FASHION,BALEAF Women's Long Sleeve Zip Beach Coverup U...,4.2,422,"[90% Polyester, 10% Spandex, Zipper closure, M...",[],31.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Women's UPF 50+ Front Zip Beach Co...,BALEAF,"[Clothing, Shoes & Jewelry, Women, Clothing, S...","{'Department': 'womens', 'Date First Available...",B09X1MRDN6,,,


In [7]:
df.dtypes

main_category       object
title               object
average_rating     float64
rating_number        int64
features            object
description         object
price               object
images              object
videos              object
store               object
categories          object
details             object
parent_asin         object
bought_together    float64
subtitle            object
author              object
dtype: object

# Fix the data (convert dtypes to str and price to numeric)

### Convert the columns to the correct dtypes:

In [8]:
import numpy as np

# Clean the 'price' column: Replace non-numeric values with NaN
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Correct the dtypes for the rest of the columns
df = df.astype({
    'main_category': 'string',
    'title': 'string',
    'average_rating': 'float64',
    'rating_number': 'int64',
    'features': 'object',  # Leave as object since it's a list
    'description': 'object',  # Leave as object since it's a list
    'images': 'object',  # Leave as object since it's a list
    'videos': 'object',  # Leave as object since it's a list
    'store': 'string',
    'categories': 'object',  # Leave as object since it's a list
    'details': 'object',  # Leave as object since it's a dictionary
    'parent_asin': 'string',
    'bought_together': 'float64',
    'subtitle': 'string',
    'author': 'string'
})

# Verify the updated dtypes
print(df.dtypes)

main_category      string[python]
title              string[python]
average_rating            float64
rating_number               int64
features                   object
description                object
price                     float64
images                     object
videos                     object
store              string[python]
categories                 object
details                    object
parent_asin        string[python]
bought_together           float64
subtitle           string[python]
author             string[python]
dtype: object


### turn features, description, categories and details into str.

In [9]:
# Convert lists to string by joining their elements with a delimiter (e.g., space)
df['features'] = df['features'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
df['description'] = df['description'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
df['categories'] = df['categories'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Convert dictionary in 'details' to string (you can modify this to extract specific keys if needed)
df['details'] = df['details'].apply(lambda x: ' '.join(f"{k}: {v}" for k, v in x.items()) if isinstance(x, dict) else str(x))

# # Now concatenate these columns into a single long string
# df['long_string_with_markers'] = df['features'] + ' ' + df['description'] + ' ' + df['categories'] + ' ' + df['details']


In [10]:
df.dtypes

main_category      string[python]
title              string[python]
average_rating            float64
rating_number               int64
features                   object
description                object
price                     float64
images                     object
videos                     object
store              string[python]
categories                 object
details                    object
parent_asin        string[python]
bought_together           float64
subtitle           string[python]
author             string[python]
dtype: object

In [11]:
df = df.astype({
    'main_category': 'string',
    'title': 'string',
    'average_rating': 'float64',
    'rating_number': 'int64',
    'features': 'string',  # changed to string
    'description': 'string', # changed to string
    'images': 'object',  # Leave as object since it's a list
    'videos': 'object',  # Leave as object since it's a list
    'store': 'string',
    'categories': 'string',  # changed to string
    'details': 'string',  # changed to string
    'parent_asin': 'string',
    'bought_together': 'float64',
    'subtitle': 'string',
    'author': 'string'
})

In [12]:
df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,AMAZON FASHION,BALEAF Women's Long Sleeve Zip Beach Coverup U...,4.2,422,"90% Polyester, 10% Spandex Zipper closure Mach...",,31.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Women's UPF 50+ Front Zip Beach Co...,BALEAF,"Clothing, Shoes & Jewelry Women Clothing Swims...",Department: womens Date First Available: April...,B09X1MRDN6,,,
1,AMAZON FASHION,"SAS Women's, Relaxed Sandal",4.7,618,Made in the USA Suede sole Heel measures appro...,"Unwind, leave your worries behind, and simply ...",188.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],SAS,"Clothing, Shoes & Jewelry Women Shoes Sandals ...",Product Dimensions: 10 x 15 x 6 inches; 2 Poun...,B0944VG4Y4,,,
2,AMAZON FASHION,SheIn Women's Basic Stretch Plaid Mini Bodycon...,3.8,999,Zipper closure Fabric has some stretch; Please...,,12.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'SUPER CUTE plaid skirt!', 'url': '...",SheIn,"Clothing, Shoes & Jewelry Novelty & More Cloth...",Department: womens Date First Available: Septe...,B08JGGF5TJ,,,
3,AMAZON FASHION,Amazon Essentials Women's Surplice Dress (Avai...,4.4,7096,"95% Viscose, 5% Elastane Imported No Closure c...",Amazon Essentials is focused on creating affor...,19.92,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Easy throw on dress! Comfortable, ...",Amazon Essentials,"Clothing, Shoes & Jewelry Women Clothing Dress...",Item model number: AE1932668 Department: women...,B096HDRB6R,,,
4,AMAZON FASHION,"Fotociti Yoga Shorts for Women – 5"" High Waist...",4.5,2911,"92% Polyester, 8% Spandex Imported Pull On clo...",,15.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Cute Blue Camo Yoga Bike Shorts Re...,Fotociti,"Clothing, Shoes & Jewelry Women Clothing Activ...",Package Dimensions: 10 x 8 x 0.6 inches; 7.04 ...,B089YJ8P6X,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220885,AMAZON FASHION,weryerker 7 Chakra GemStones Empty Necklace Co...,4.7,302,【Chakra Healing Crystals Set】 - the package in...,,9.92,[{'thumb': 'https://m.media-amazon.com/images/...,[],weryerker,"Clothing, Shoes & Jewelry Women Jewelry Necklaces",Department: womens Date First Available: July ...,B0B5L18YMS,,,
220886,AMAZON FASHION,Bukesiyi Sasquatch Hat Bigfoot Embroidered Tru...,4.8,54,"65% Polyester, 35% Cotton 进口 Snap closure Hand...",,16.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],Bukesiyi,"Clothing, Shoes & Jewelry Men Accessories Hats...",Product Dimensions: 10.2 x 7.8 x 4.6 inches; 2...,B0B8VTSJTJ,,,
220887,,4 Pairs Bohemian Vintage Dangle Earrings Retro...,4.5,5874,Package Includes: you will receive 4 pairs of ...,,9.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],meekoo,"Clothing, Shoes & Jewelry Women Jewelry Earrin...","Date First Available: May 10, 2023 Manufacture...",B0C4TC78JH,,,
220888,AMAZON FASHION,Steve Madden Men's Troopah-c Combat Boot,3.7,225,100% Leather Imported Synthetic sole Shaft mea...,"Rugged, well-worn leather adds vintage appeal ...",114.07,[{'thumb': 'https://m.media-amazon.com/images/...,[],Steve Madden,"Clothing, Shoes & Jewelry Men Shoes Boots Moto...",Item Weight: 1.19 Pounds Item model number: TR...,B075Y7SBK1,,,


In [16]:
encoded_dir_per_feature

PosixPath('/storage/yahlly/RecSys/data/encoded/encoded/per_feature')

In [21]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model.eval()

# Ensure GPU is used if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to get DistilBERT embeddings (mean pooling)
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Normalize float columns
scaler = MinMaxScaler()

# Define columns to embed and normalize
text_columns = ['features', 'description', 'details', 'title', 'subtitle', 'author', 'main_category', 'store', 'categories']
float_columns = ['average_rating', 'price']

# Ensure text columns are treated as strings
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)
df[float_columns] = df[float_columns].fillna(0)
# Normalize float columns
df[float_columns] = scaler.fit_transform(df[float_columns])

# Embed and save each text column
for col in text_columns:
    print(f"Embedding column: {col}")
    df[f"{col}_embedding"] = df[col].apply(get_embedding)

    # Save embeddings to .npy file
    np.save(os.path.join(encoded_dir_per_feature, f"{col}_embeddings.npy"), np.stack(df[f"{col}_embedding"].values))
    print(f"Saved {col} embeddings to {col}_embeddings.npy")

# Save normalized floats
np.save(os.path.join(encoded_dir_per_feature, "normalized_floats.npy"), df[float_columns].values)
print("Saved normalized float values to normalized_floats.npy")


Saved normalized float values to normalized_floats.npy


### Load embedded dict

In [26]:
embedding_dict = torch.load('/storage/yahlly/RecSys/data/encoded/embedding_dict_with_price_longformer_idx.pt')

# Print the keys and the shape of one of the embeddings to check
print(f"Number of entries in the embedding dictionary: {len(embedding_dict)}")
sample_key = list(embedding_dict.keys())[4]  # Get a sample key
print(f"Sample key (parent_asin): {sample_key}")
print(f"Shape of the embedding for this key: {embedding_dict[sample_key].shape}")

Number of entries in the embedding dictionary: 198771
Sample key (parent_asin): 44993
Shape of the embedding for this key: torch.Size([1027])


In [27]:
embedding_dict

{131488: tensor([-0.0118, -0.0987, -0.2878,  ...,  0.0095,  0.8000,  0.0010],
        device='cuda:0'),
 49667: tensor([-0.0573, -0.1541, -0.4666,  ...,  0.0039,  0.7000,  0.0024],
        device='cuda:0'),
 13509: tensor([-0.0399, -0.1226, -0.3553,  ...,  0.0059,  0.8500,  0.0174],
        device='cuda:0'),
 98102: tensor([-0.0154, -0.0735, -0.3964,  ...,  0.0047,  0.8750,  0.0071],
        device='cuda:0'),
 44993: tensor([ 0.0026, -0.0701, -0.3936,  ...,  0.0297,  0.8750,  0.0089],
        device='cuda:0'),
 1965: tensor([ 6.8136e-03, -4.6083e-02, -3.4628e-01,  ..., -1.0000e+00,
          9.0000e-01,  7.7381e-04], device='cuda:0'),
 92911: tensor([-0.0280, -0.1065, -0.4042,  ...,  0.0059,  0.8500,  0.0031],
        device='cuda:0'),
 143124: tensor([ 4.3198e-02, -8.0334e-02, -3.8567e-01,  ..., -1.0000e+00,
          9.2500e-01,  3.2079e-04], device='cuda:0'),
 111847: tensor([ 0.0402, -0.0127, -0.5208,  ...,  0.0077,  0.8750,  0.0054],
        device='cuda:0'),
 4596: tensor([ 0.002