# Embedding Item Title & Tags

In [20]:
import pandas as pd
import ast
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from torch.nn.functional import normalize

Load in the items dataset. 

In [5]:
# Read the item catalogue and key it by item_id. Each `tags` cell is parsed with
# ast.literal_eval so the column holds Python objects (lists in the preview
# below) rather than their string form.
df = (
    pd.read_csv('../data/fashion_items.csv')
    .set_index('item_id')
    .assign(tags=lambda frame: frame['tags'].map(ast.literal_eval))
)
df.head()

Unnamed: 0_level_0,title,tags,category
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Boho Summer Maxi Dress,"[boho, dress, summer]",dresses
1,Minimalist Linen Blouse,"[minimalist, blouse, neutrals]",tops
2,Vintage Denim Jacket,"[vintage, jacket, denim]",outerwear
3,Cozy Knit Sweater,"[cozy, sweater, fall]",tops
4,Streetwear Graphic Hoodie,"[streetwear, hoodie, urban]",outerwear


In [6]:
# Sentence-transformer checkpoint used for every text embedding in this notebook
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Function to embed item titles, tags, and category
def embed_item(item, weights=(0.3, 0.5, 0.2), encoder=None):
    """Embed one item as a unit-length weighted blend of its text fields.

    The title, the space-joined tags, and the category are each encoded
    separately, combined as a weighted sum, and then L2-normalised.

    Args:
        item: Mapping-like row (e.g. a row yielded by ``df.iterrows()``) with
            the keys ``'title'``, ``'tags'`` (an iterable of strings) and
            ``'category'``.
        weights: ``(title, tags, category)`` weights for the blend. Only their
            ratios matter, because the result is normalised afterwards.
        encoder: Object exposing ``encode(text, convert_to_tensor=True)``.
            Defaults to the notebook-level ``model``.

    Returns:
        Tensor with unit L2 norm and the same shape as a single encoded text
        (presumably 1-D).
    """
    if encoder is None:
        encoder = model
    title_weight, tags_weight, category_weight = weights

    title_embedding = encoder.encode(item['title'], convert_to_tensor=True)
    tags_embedding = encoder.encode(' '.join(item['tags']), convert_to_tensor=True)
    category_embedding = encoder.encode(item['category'], convert_to_tensor=True)

    # No extra "/ 3": this is a weighted sum, not a mean of three vectors, and
    # the L2 normalisation below discards any constant scale factor anyway.
    combined = (
        title_weight * title_embedding
        + tags_weight * tags_embedding
        + category_weight * category_embedding
    )
    # normalize() works along dim=1, so add a batch axis and drop it afterwards.
    return normalize(combined.unsqueeze(0), p=2, dim=1).squeeze(0)


In [11]:
# Map every item_id (the DataFrame index) to the embedding of its row
item_embeddings = dict(
    (row_id, embed_item(row))
    for row_id, row in df.iterrows()
)

Now that we have the embeddings, we can check item similarities. 

In [27]:
# Item ids to compare, defined once so the lookups and the printed message
# cannot drift apart when this cell is copied for another pair.
first_id, second_id = 12, 23
embedding_0 = item_embeddings[first_id]
embedding_1 = item_embeddings[second_id]

# Check similarity between two embeddings
# (cosine_similarity takes 2-D inputs, hence the reshape to a single row)
similarity = cosine_similarity(embedding_0.cpu().reshape(1, -1), embedding_1.cpu().reshape(1, -1))
print(f"Cosine similarity between item {first_id} and item {second_id}: {similarity[0][0]}")

Cosine similarity between item 12 and item 23: 0.6727408170700073


Seems to work! Let's verify by printing the details of those two items. 

In [28]:
# Show the catalogue rows behind the two embeddings compared above
for heading, idx in (("Item 12 details:", 12), ("\nItem 23 details:", 23)):
    print(heading)
    print(df.loc[idx])

Item 12 details:
title          Plaid Flannel Shirt
tags        [plaid, flannel, cozy]
category                      tops
Name: 12, dtype: object

Item 23 details:
title           Plaid A-Line Skirt
tags        [plaid, A-line, skirt]
category                   bottoms
Name: 23, dtype: object


Those items do seem pretty similar! They both have plaid elements. A similarity of about 0.67 seems appropriate because otherwise they're different (bottoms vs. tops, etc.). Let's try another pair!

In [44]:
# Item ids to compare, defined once so the lookups and the printed message
# cannot drift apart when this cell is copied for another pair.
first_id, second_id = 5, 77
embedding_0 = item_embeddings[first_id]
embedding_1 = item_embeddings[second_id]

# Check similarity between two embeddings
# (cosine_similarity takes 2-D inputs, hence the reshape to a single row)
similarity = cosine_similarity(embedding_0.cpu().reshape(1, -1), embedding_1.cpu().reshape(1, -1))
print(f"Cosine similarity between item {first_id} and item {second_id}: {similarity[0][0]}")

Cosine similarity between item 5 and item 77: 0.16947853565216064


In [45]:
# Show the catalogue rows behind the two embeddings compared above
for heading, idx in (("Item 5 details:", 5), ("\nItem 77 details:", 77)):
    print(heading)
    print(df.loc[idx])

Item 5 details:
title                  Sustainable Yoga Set
tags        [sustainable, athleisure, yoga]
category                         activewear
Name: 5, dtype: object

Item 77 details:
title       Tailored Wool Blazer with Lining
tags         [tailored, wool blazer, lining]
category                           outerwear
Name: 77, dtype: object


That seems pretty good, these two items don't have much in common. 

Now we should save this embeddings dataset for later. 

In [50]:
# Reshape the embeddings into a wide DataFrame: one row per item (indexed by
# item_id) and one column per embedding dimension (384 columns, labelled 0..383).
# Each tensor is converted to a plain Python list so pandas can consume it.
# The lists go into a new name instead of overwriting `item_embeddings`, so this
# cell can be re-run safely (a list has no `.cpu()`).
embedding_rows = {item_id: embedding.cpu().numpy().tolist() for item_id, embedding in item_embeddings.items()}
embeddings_df = pd.DataFrame.from_dict(embedding_rows, orient='index')
# from_dict(orient='index') stores the dict keys in the index, not in an 'index'
# column, so a column rename would do nothing; name the index directly instead.
embeddings_df.index.name = 'item_id'
embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.074143,0.06405,-0.009264,0.046263,-0.027863,-0.048036,0.06423,-0.072758,-0.083279,-0.023699,...,-0.032186,-0.009203,-0.015632,0.013018,0.038265,0.057547,0.033841,-0.040459,-0.005354,-0.016443
1,-0.053667,0.086153,0.008184,-0.027359,0.064028,-0.062513,0.064731,0.053492,-0.086663,0.04204,...,-0.039931,-0.055881,0.070485,0.079482,-0.065805,0.01045,0.083477,-0.020643,0.026802,0.023278
2,-0.103944,0.160174,0.006674,0.032914,0.023432,0.015533,0.124695,-0.012912,-0.071057,0.007715,...,-0.04839,-0.047802,-0.056081,0.034009,0.005964,0.015365,0.026713,-0.099851,-0.039444,0.01975
3,-0.068707,0.033529,0.007046,0.08301,0.072068,0.020052,0.061295,0.038851,0.006501,0.060401,...,-0.073263,-0.021329,-0.063976,0.042687,0.028866,0.015143,-0.02514,-0.103199,-0.039904,0.047723
4,-0.069275,0.071025,0.019378,0.039093,0.075728,-0.033218,0.122828,-0.019662,-0.06492,-0.04029,...,-0.048714,-0.067688,-0.049945,-0.00556,-0.028457,0.019639,-0.037464,-0.115767,-0.017989,0.000391


That's not bad, but maybe it would be easier to just store the actual list in one column? Let's try it this way though first. 

In [52]:
# Read item 0's row back out of the wide frame as a flat list of values
first_row = embeddings_df.loc[0]
embedding_0 = [*first_row]
print(f"Embedding for item 0: {embedding_0}")

Embedding for item 0: [-0.07414272427558899, 0.0640496164560318, -0.009264148771762848, 0.046263132244348526, -0.02786252275109291, -0.04803597182035446, 0.0642300546169281, -0.07275786250829697, -0.08327941596508026, -0.023699209094047546, 0.03779444098472595, -0.09349503368139267, 0.025647640228271484, -0.06282330304384232, 0.060157909989356995, 0.07073774188756943, 0.09871508926153183, -0.014032882638275623, 0.02206476591527462, -0.03884509950876236, -0.07476118206977844, -0.055835600942373276, -0.0180149357765913, 0.18124601244926453, -0.045053865760564804, -0.03811192139983177, 0.037333983927965164, 0.06293008476495743, -0.08115700632333755, -0.03456141799688339, -0.08519710600376129, 0.03227163851261139, 0.07083224505186081, 0.005300506949424744, -0.04627120867371559, 0.06457676738500595, 0.06460494548082352, -0.04344898462295532, -0.0004011116689071059, 0.08597902953624725, -0.04849208518862724, -0.01107722707092762, -0.05858822166919708, -0.015387576073408127, 0.059832934290170

I think that's ok for now. Let's save it so we can use it later. 

In [54]:
# Keep the index in the file: it is the only place the item ids live. With
# index=False the CSV would hold just the embedding columns, and rows could only
# be matched back to items by position.
# Reload with: pd.read_csv('../data/item_embeddings.csv', index_col='item_id')
embeddings_df.to_csv('../data/item_embeddings.csv', index=True, index_label='item_id')