# Imports

In [1]:
!pip install nltk



In [2]:
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Data Preprocessing

In [21]:
data = []
file_path = "/content/marketing_sample_for_amazon_com-amazon_fashion_products__20200201_20200430__30k_data-1.ldjson"
with open(file_path, 'r') as file:
    for i, line in enumerate(file):
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping invalid line {i}: {e}")
        except ValueError as e:
            print(f"Skipping invalid line {i}: {e}")

df = pd.DataFrame(data)

Selecting Required Columns for Recommendations:
These attributes help in comparing and finding similar products. The chosen columns are:

1. uniq_id: Unique identifier for each product.
2. asin: Amazon's unique product identifier.
3. product_name: Name of the product, useful for textual similarity.
4. medium: URLs to product images.
5. brand: Product brand.
6. colour: Color of the product.
7. sales_price: Price of the product.
8. rating: Customer ratings.
9. meta_keywords: Keywords related to the product.

These columns capture essential product features that influence customer preferences and product similarities.

In [22]:
df = df[['uniq_id', 'asin', 'product_name','medium', 'brand', 'colour', 'sales_price', 'rating', 'meta_keywords']]
df.head()

Unnamed: 0,uniq_id,asin,product_name,medium,brand,colour,sales_price,rating,meta_keywords
0,26d41bdc1495de290bc8e6062d927729,B07STS2W9T,LA' Facon Cotton Kalamkari Handblock Saree Blo...,https://images-na.ssl-images-amazon.com/images...,LA' Facon,,200.0,5.0,LA' Facon Cotton Kalamkari Handblock Saree Blo...
1,410c62298852e68f34c35560f2311e5a,B07N6TD2WL,Sf Jeans By Pantaloons Men's Plain Slim fit T-...,https://images-na.ssl-images-amazon.com/images...,,,265.0,3.6,Sf Jeans By Pantaloons Men's Plain Slim fit T-...
2,52e31bb31680b0ec73de0d781a23cc0a,B07WJ6WPN1,LOVISTA Cotton Gota Patti Tassel Traditional P...,https://images-na.ssl-images-amazon.com/images...,LOVISTA,,660.0,3.5,LOVISTA Cotton Gota Patti Tassel Traditional P...
3,25798d6dc43239c118452d1bee0fb088,B07PYSF4WZ,People Men's Printed Regular fit T-Shirt,https://images-na.ssl-images-amazon.com/images...,,,195.0,3.0,"People Men's Printed Regular fit T-Shirt,People"
4,ad8a5a196d515ef09dfdaf082bdc37c4,B082KXNM7X,Monte Carlo Grey Solid Cotton Blend Polo Colla...,https://images-na.ssl-images-amazon.com/images...,,,1914.0,5.0,Monte Carlo Grey Solid Cotton Blend Polo Colla...


In [23]:
# uniq_id, product_name, tags [meta_keywords, brand, colour]
df.isnull().sum()
# remove the rows having null values
df.dropna(axis=0, inplace=True)

In [24]:
# Clean the colors
df['colour'] = df['colour'].str.split(r'[\/\+\|\s,]+')
  # Remove the product having Duplicate title
df = df.drop_duplicates(subset=['product_name'])
# remove the product with very few words in product name
df = df[df['product_name'].apply(lambda x: len(x.split())>4)]

In [25]:
df.iloc[90]['meta_keywords']

'Doodle Mens Half Sleeve Poly Cotton Round Neck Tshirt - (Black/Red/White),Doodle Store'

In [13]:
import re

df['meta_keywords'] = df['meta_keywords'].apply(lambda x: re.sub(r'[^a-zA-Z0-9, ]', '', x))
df['meta_keywords'] = df['meta_keywords'].apply(lambda x: x.split(','))


In [14]:
df.iloc[5]['meta_keywords']

['Pooplu Mens Ajay Cotton Printed Round Neck Half Sleeves Black  White TShirt Common Names',
 ' Symbol Tshirts',
 'Pooplu']

In [26]:
import re
df['brand'] = df['brand'].str.replace(r'[^a-zA-Z0-9, ]', '')
df['brand'] = df['brand'].apply(lambda x: x.split(','))

df.head()


Unnamed: 0,uniq_id,asin,product_name,medium,brand,colour,sales_price,rating,meta_keywords
5,73fd23f631d4434fd4a41ecc1b9f5eed,B07X5M41BJ,Forest Club | Gym Wear | Sports Shorts| Shorts...,https://images-na.ssl-images-amazon.com/images...,[Forest Club],"[Black, Blue, Grey]",350.0,4.1,Forest Club | Gym Wear | Sports Shorts| Shorts...
6,14d7d34c3cacd6c9ed1aa57f284e37ad,B07H9RCVHQ,PrintOctopus Graphic Printed T-Shirt for Men C...,https://images-na.ssl-images-amazon.com/images...,[PrintOctopus],"[Black, Navy, Blue]",288.0,3.7,PrintOctopus Graphic Printed T-Shirt for Men C...
15,46005e6f4f253e551ede9b580cf83b73,B083ZKXVGN,Miss Chase Women's Solid Shoulder Cut-Out Half...,https://images-na.ssl-images-amazon.com/images...,[Miss Chase],"[Black, Maroon]",669.0,3.7,Miss Chase Women's Solid Shoulder Cut-Out Half...
20,2b1c8bedb40220c7312f0edeea2a31d2,B07K6PHHHM,PuJoy Women's Soft Cotton Traditional Bengali ...,https://images-na.ssl-images-amazon.com/images...,[PuJoy],"[Black, Based, Black, Based, 2, Black, White, ...",420.0,3.5,PuJoy Women's Cotton Traditional Bengali Handl...
22,8c8ef13b817747710eae99c1718c9be9,B0759HPKCJ,kalpit creations Men's Comfort Soft Cotton Bla...,https://images-na.ssl-images-amazon.com/images...,[kalpit creations],"[Black, Bottle, green, GREEN, NAVY, BLUE, Pink...",249.0,3.2,kalpit creations Men's Comfort Soft Cotton Bla...


In [27]:
df['tags'] = df.apply(lambda x: f"Meta keywords: {x['meta_keywords']} , Brand: {x['brand']} and Colours: {x['colour']}", axis=1)

df.head()

Unnamed: 0,uniq_id,asin,product_name,medium,brand,colour,sales_price,rating,meta_keywords,tags
5,73fd23f631d4434fd4a41ecc1b9f5eed,B07X5M41BJ,Forest Club | Gym Wear | Sports Shorts| Shorts...,https://images-na.ssl-images-amazon.com/images...,[Forest Club],"[Black, Blue, Grey]",350.0,4.1,Forest Club | Gym Wear | Sports Shorts| Shorts...,Meta keywords: Forest Club | Gym Wear | Sports...
6,14d7d34c3cacd6c9ed1aa57f284e37ad,B07H9RCVHQ,PrintOctopus Graphic Printed T-Shirt for Men C...,https://images-na.ssl-images-amazon.com/images...,[PrintOctopus],"[Black, Navy, Blue]",288.0,3.7,PrintOctopus Graphic Printed T-Shirt for Men C...,Meta keywords: PrintOctopus Graphic Printed T-...
15,46005e6f4f253e551ede9b580cf83b73,B083ZKXVGN,Miss Chase Women's Solid Shoulder Cut-Out Half...,https://images-na.ssl-images-amazon.com/images...,[Miss Chase],"[Black, Maroon]",669.0,3.7,Miss Chase Women's Solid Shoulder Cut-Out Half...,Meta keywords: Miss Chase Women's Solid Should...
20,2b1c8bedb40220c7312f0edeea2a31d2,B07K6PHHHM,PuJoy Women's Soft Cotton Traditional Bengali ...,https://images-na.ssl-images-amazon.com/images...,[PuJoy],"[Black, Based, Black, Based, 2, Black, White, ...",420.0,3.5,PuJoy Women's Cotton Traditional Bengali Handl...,Meta keywords: PuJoy Women's Cotton Traditiona...
22,8c8ef13b817747710eae99c1718c9be9,B0759HPKCJ,kalpit creations Men's Comfort Soft Cotton Bla...,https://images-na.ssl-images-amazon.com/images...,[kalpit creations],"[Black, Bottle, green, GREEN, NAVY, BLUE, Pink...",249.0,3.2,kalpit creations Men's Comfort Soft Cotton Bla...,Meta keywords: kalpit creations Men's Comfort ...


In [28]:
df.iloc[5]['tags']

"Meta keywords: Pooplu Mens Ajay Cotton Printed Round Neck Half Sleeves Black & White T.Shirt. Common Names, Symbol Tshirts,Pooplu , Brand: ['Pooplu'] and Colours: ['Black', 'White']"

In [29]:
new_df = df[['uniq_id', 'product_name', 'tags']]

In [30]:
new_df.head()

Unnamed: 0,uniq_id,product_name,tags
5,73fd23f631d4434fd4a41ecc1b9f5eed,Forest Club | Gym Wear | Sports Shorts| Shorts...,Meta keywords: Forest Club | Gym Wear | Sports...
6,14d7d34c3cacd6c9ed1aa57f284e37ad,PrintOctopus Graphic Printed T-Shirt for Men C...,Meta keywords: PrintOctopus Graphic Printed T-...
15,46005e6f4f253e551ede9b580cf83b73,Miss Chase Women's Solid Shoulder Cut-Out Half...,Meta keywords: Miss Chase Women's Solid Should...
20,2b1c8bedb40220c7312f0edeea2a31d2,PuJoy Women's Soft Cotton Traditional Bengali ...,Meta keywords: PuJoy Women's Cotton Traditiona...
22,8c8ef13b817747710eae99c1718c9be9,kalpit creations Men's Comfort Soft Cotton Bla...,Meta keywords: kalpit creations Men's Comfort ...


# Vectorization
similarity based on tags

In [31]:
cv = CountVectorizer(max_features=5000,stop_words='english')

In [32]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [33]:
similarity = cosine_similarity(vectors)

# Basic Version of Cosine Similarity Calculation

This section demonstrates a basic version of calculating cosine similarity between product vectors. We convert the vectors to NumPy arrays for easier manipulation and compute the similarity between a query vector and all other vectors.



In [36]:
vectors[1]

array([0, 0, 0, ..., 0, 0, 0])

In [38]:
# Convert to numpy arrays for easier manipulation
vectors_np = np.array(vectors)
query_vector_np = np.array(vectors[7]).reshape(1, -1)

# Calculate cosine similarity
cosine_similarities = cosine_similarity(query_vector_np, vectors_np).flatten()

# Get top 5 results
top_5_indices = cosine_similarities.argsort()[-5:][::-1]
top_5_similarities = cosine_similarities[top_5_indices]

# Display results
for idx, similarity in zip(top_5_indices, top_5_similarities):
    print(f"Vector: {idx}, Similarity: {similarity}")

Vector: 7, Similarity: 1.0
Vector: 341, Similarity: 0.7233393492970543
Vector: 3408, Similarity: 0.7082005278281703
Vector: 2377, Similarity: 0.6470588235294117
Vector: 1788, Similarity: 0.6470588235294117


# Enhanced Cosine Similarity Calculation Using Sentence Transformers

This enhanced version uses the SentenceTransformer library to generate embeddings for product descriptions, resulting in more accurate similarity calculations. The steps are as follows:

1. Import SentenceTransformer: Use a pre-trained model to generate embeddings.
2. Prepare Sentences: Convert the product descriptions (tags) to a list of sentences.
3. Load Pre-trained Model: Load the all-mpnet-base-v2 model from the sentence-transformers library.
4. Generate Embeddings: Use the model to encode the sentences into dense vector representations.

In [39]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m174.1/227.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [40]:
from sentence_transformers import SentenceTransformer
sentences = new_df['tags'].to_list()

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [41]:
# Convert to numpy arrays for easier manipulation
vectors_np = np.array(embeddings)
query_vector_np = np.array(embeddings[7]).reshape(1, -1)

# Calculate cosine similarity
cosine_similarities = cosine_similarity(query_vector_np, vectors_np).flatten()

# Get top 5 results
top_5_indices = cosine_similarities.argsort()[-5:][::-1]
top_5_similarities = cosine_similarities[top_5_indices]

# Display results
for idx, similarity in zip(top_5_indices, top_5_similarities):
    print(f"Vector: {idx}, Similarity: {similarity}")

Vector: 7, Similarity: 1.0
Vector: 341, Similarity: 0.8556002974510193
Vector: 606, Similarity: 0.8445487022399902
Vector: 2377, Similarity: 0.8419944047927856
Vector: 1514, Similarity: 0.8358587622642517


In [42]:
def find_top_k_similar_products(df, vectors_np, top_k):
    # Prepare a new DataFrame to store the results
    results = []

    # Iterate over each vector and find the top k similar products
    for i, query_vector_np in enumerate(vectors_np):
        query_vector_np = query_vector_np.reshape(1, -1)

        # Calculate cosine similarity
        cosine_similarities = cosine_similarity(query_vector_np, vectors_np).flatten()

        # Get top k results (excluding the product itself)
        top_k_indices = cosine_similarities.argsort()[-(top_k + 1):-1][::-1]
        top_k_similarities = cosine_similarities[top_k_indices]

        # Prepare results for the current product
        product_results = {
            'uniq_id': df.iloc[i]['uniq_id'],
            'product_name': df.iloc[i]['product_name'],
            'tags': df.iloc[i]['tags'],
            'similar_products': []
        }

        for idx, similarity in zip(top_k_indices, top_k_similarities):
            similar_product = {
                'similar_uniq_id': df.iloc[idx]['uniq_id'],
                'similar_product_name': df.iloc[idx]['product_name'],
                'similar_tags': df.iloc[idx]['tags'],
                'similarity': similarity
            }
            product_results['similar_products'].append(similar_product)

        results.append(product_results)

    # Convert the results to a DataFrame
    new_df = pd.DataFrame(results)

    return new_df

In [43]:
df.head()

Unnamed: 0,uniq_id,asin,product_name,medium,brand,colour,sales_price,rating,meta_keywords,tags
5,73fd23f631d4434fd4a41ecc1b9f5eed,B07X5M41BJ,Forest Club | Gym Wear | Sports Shorts| Shorts...,https://images-na.ssl-images-amazon.com/images...,[Forest Club],"[Black, Blue, Grey]",350.0,4.1,Forest Club | Gym Wear | Sports Shorts| Shorts...,Meta keywords: Forest Club | Gym Wear | Sports...
6,14d7d34c3cacd6c9ed1aa57f284e37ad,B07H9RCVHQ,PrintOctopus Graphic Printed T-Shirt for Men C...,https://images-na.ssl-images-amazon.com/images...,[PrintOctopus],"[Black, Navy, Blue]",288.0,3.7,PrintOctopus Graphic Printed T-Shirt for Men C...,Meta keywords: PrintOctopus Graphic Printed T-...
15,46005e6f4f253e551ede9b580cf83b73,B083ZKXVGN,Miss Chase Women's Solid Shoulder Cut-Out Half...,https://images-na.ssl-images-amazon.com/images...,[Miss Chase],"[Black, Maroon]",669.0,3.7,Miss Chase Women's Solid Shoulder Cut-Out Half...,Meta keywords: Miss Chase Women's Solid Should...
20,2b1c8bedb40220c7312f0edeea2a31d2,B07K6PHHHM,PuJoy Women's Soft Cotton Traditional Bengali ...,https://images-na.ssl-images-amazon.com/images...,[PuJoy],"[Black, Based, Black, Based, 2, Black, White, ...",420.0,3.5,PuJoy Women's Cotton Traditional Bengali Handl...,Meta keywords: PuJoy Women's Cotton Traditiona...
22,8c8ef13b817747710eae99c1718c9be9,B0759HPKCJ,kalpit creations Men's Comfort Soft Cotton Bla...,https://images-na.ssl-images-amazon.com/images...,[kalpit creations],"[Black, Bottle, green, GREEN, NAVY, BLUE, Pink...",249.0,3.2,kalpit creations Men's Comfort Soft Cotton Bla...,Meta keywords: kalpit creations Men's Comfort ...


In [44]:
# Example function to find top k similar products
def find_top_k_similar_products(df, vectors_np, top_k, details_df):
    # Prepare a new DataFrame to store the results
    results = []

    # Iterate over each vector and find the top k similar products
    for i, query_vector_np in enumerate(vectors_np):
        query_vector_np = query_vector_np.reshape(1, -1)

        # Calculate cosine similarity
        cosine_similarities = cosine_similarity(query_vector_np, vectors_np).flatten()

        # Get top k results (excluding the product itself)
        top_k_indices = cosine_similarities.argsort()[-(top_k + 1):-1][::-1]
        top_k_similarities = cosine_similarities[top_k_indices]

        # Prepare results for the current product
        product_results = {
            'uniq_id': df.iloc[i]['uniq_id'],
            'product_name': df.iloc[i]['product_name'],
            'tags': df.iloc[i]['tags'],
            'brand': details_df.loc[details_df['uniq_id'] == df.iloc[i]['uniq_id'], 'brand'].values[0],
            'colour': details_df.loc[details_df['uniq_id'] == df.iloc[i]['uniq_id'], 'colour'].values[0],
            'rating': details_df.loc[details_df['uniq_id'] == df.iloc[i]['uniq_id'], 'rating'].values[0],
            'medium': details_df.loc[details_df['uniq_id'] == df.iloc[i]['uniq_id'], 'medium'].values[0],
            'sales_price': details_df.loc[details_df['uniq_id'] == df.iloc[i]['uniq_id'], 'sales_price'].values[0] ,
            'similar_products': []
        }

        for idx, similarity in zip(top_k_indices, top_k_similarities):
            similar_product_details = details_df.loc[details_df['uniq_id'] == df.iloc[idx]['uniq_id']].iloc[0]
            similar_product = {
                'similar_uniq_id': df.iloc[idx]['uniq_id'],
                'similar_product_name': df.iloc[idx]['product_name'],
                'similar_tags': df.iloc[idx]['tags'],
                'brand': similar_product_details['brand'],
                'colour': similar_product_details['colour'],
                'rating': similar_product_details['rating'],
                'medium': similar_product_details['medium'],
                'sales_price': similar_product_details['sales_price'],
                'similarity': similarity
            }
            product_results['similar_products'].append(similar_product)

        results.append(product_results)

    # Convert the results to a DataFrame
    new_df = pd.DataFrame(results)

    return new_df

In [45]:
top_k = 20
vectors_np = np.array(embeddings)
similar_products_df = find_top_k_similar_products(new_df, vectors_np, top_k, df)

# Display the new DataFrame with top k similar products
print(similar_products_df)
similar_products_df['uniq_id'] == "46005e6f4f253e551ede9b580cf83b73"

                               uniq_id  \
0     73fd23f631d4434fd4a41ecc1b9f5eed   
1     14d7d34c3cacd6c9ed1aa57f284e37ad   
2     46005e6f4f253e551ede9b580cf83b73   
3     2b1c8bedb40220c7312f0edeea2a31d2   
4     8c8ef13b817747710eae99c1718c9be9   
...                                ...   
3828  cdf6103f74e2485a2f4cf1db27ca59db   
3829  7aeacf73b7ec76724e59d3e5f8cd980b   
3830  51d7066ce25e7ea345e3967e03f5b94f   
3831  9de98a0e7accdec873db3c15449a5249   
3832  2ecada524df6ff8c2d0c53a249cfcddc   

                                           product_name  \
0     Forest Club | Gym Wear | Sports Shorts| Shorts...   
1     PrintOctopus Graphic Printed T-Shirt for Men C...   
2     Miss Chase Women's Solid Shoulder Cut-Out Half...   
3     PuJoy Women's Soft Cotton Traditional Bengali ...   
4     kalpit creations Men's Comfort Soft Cotton Bla...   
...                                                 ...   
3828  Varkala Silk Sarees Women's Soft katan Silk Wo...   
3829  Vadmans Unisex Ri

0       False
1       False
2        True
3       False
4       False
        ...  
3828    False
3829    False
3830    False
3831    False
3832    False
Name: uniq_id, Length: 3833, dtype: bool

In [46]:
similar_products_df.head()

Unnamed: 0,uniq_id,product_name,tags,brand,colour,rating,medium,sales_price,similar_products
0,73fd23f631d4434fd4a41ecc1b9f5eed,Forest Club | Gym Wear | Sports Shorts| Shorts...,Meta keywords: Forest Club | Gym Wear | Sports...,[Forest Club],"[Black, Blue, Grey]",4.1,https://images-na.ssl-images-amazon.com/images...,350.0,[{'similar_uniq_id': 'b0ef560126020ab967ed53da...
1,14d7d34c3cacd6c9ed1aa57f284e37ad,PrintOctopus Graphic Printed T-Shirt for Men C...,Meta keywords: PrintOctopus Graphic Printed T-...,[PrintOctopus],"[Black, Navy, Blue]",3.7,https://images-na.ssl-images-amazon.com/images...,288.0,[{'similar_uniq_id': 'fcb4b9af8424c1e260e033fa...
2,46005e6f4f253e551ede9b580cf83b73,Miss Chase Women's Solid Shoulder Cut-Out Half...,Meta keywords: Miss Chase Women's Solid Should...,[Miss Chase],"[Black, Maroon]",3.7,https://images-na.ssl-images-amazon.com/images...,669.0,[{'similar_uniq_id': '25a518b8a2885b94de4190ec...
3,2b1c8bedb40220c7312f0edeea2a31d2,PuJoy Women's Soft Cotton Traditional Bengali ...,Meta keywords: PuJoy Women's Cotton Traditiona...,[PuJoy],"[Black, Based, Black, Based, 2, Black, White, ...",3.5,https://images-na.ssl-images-amazon.com/images...,420.0,[{'similar_uniq_id': '21b0c2baaca3932a1d462d7e...
4,8c8ef13b817747710eae99c1718c9be9,kalpit creations Men's Comfort Soft Cotton Bla...,Meta keywords: kalpit creations Men's Comfort ...,[kalpit creations],"[Black, Bottle, green, GREEN, NAVY, BLUE, Pink...",3.2,https://images-na.ssl-images-amazon.com/images...,249.0,[{'similar_uniq_id': 'db5a3f0352ad8765e7bb00ca...


In [47]:

import pandas as pd
def show_similar_products_details(df, product_id):
  """
  This function takes a product ID and returns a DataFrame containing details of similar products.

  Args:
      df: DataFrame containing product information and similarity scores.
      product_id: ID of the product for which to find similar products.

  Returns:
      DataFrame containing details of similar products.
  """

  similar_products = df[df['uniq_id'] == product_id]['similar_products'].values[0]
  details_df = pd.DataFrame(similar_products)
  return details_df

product_id = "46005e6f4f253e551ede9b580cf83b73"
similar_products_details = show_similar_products_details(similar_products_df, product_id)

print(similar_products_details['similar_product_name'])


0     Miss Chase Women's Black and White Round Neck ...
1                  Miss Chase Women's Skater Midi Dress
2     AARA Women's Notch Neck Sleeveless Business Co...
3     BESIVA Women's Round Neck Full Sleeve Polyeste...
4        BESIVA Women's Round Neck Bodycon Jersey Dress
5          Beautees Girls' Big Three Tiered Swing Dress
6     Hive91 Checkered Wrap Dress for Women, 3/4 Sleeve
7                   Harpa V-Neck Vertical Stripes Dress
8         ADDYVERO Women's Ruffle Shoulder Skater Dress
9                   Harpa Women's Halterneck Midi Dress
10        Dresszip Collection Women's Half Ruffle Saree
11    AARA Womens Pink Cotton Pleated Sleeveless Sea...
12    Bidhan Sleeveless Cotton Casual Wear Dress for...
13            SightBomb ONE Shoulder Side Frill Bodycon
14      Lagotto Rayon Crepe fit & flary Dress for Women
15                         Harpa Round Neck Solid Dress
16    Wild Sparrow Printed Sleeveless Cutout-Back Sh...
17    HK Enterprise Women's Western High Low Kne

In [48]:
def save_similar_products_to_pickle(similar_products_df, output_file):
    with open(output_file, 'wb') as file:
        pickle.dump(similar_products_df, file)
    print(f"Pickle file '{output_file}' created successfully!")

In [49]:
output_pickle_file = 'similar_products.pkl'
save_similar_products_to_pickle(similar_products_df, output_pickle_file)


Pickle file 'similar_products.pkl' created successfully!


In [50]:
def find_similar_products_by_uniq_id_from_stored(similar_products_df, uniq_id):
    # Check if the uniq_id exists in the DataFrame
    if uniq_id not in similar_products_df['uniq_id'].values:
        raise ValueError("Product ID not found")

    # Retrieve the similar products for the given uniq_id
    product_row = similar_products_df[similar_products_df['uniq_id'] == uniq_id].iloc[0]
    similar_products = product_row['similar_products']

    return similar_products

In [51]:
similars = find_similar_products_by_uniq_id_from_stored(
    similar_products_df,
    uniq_id="8c8ef13b817747710eae99c1718c9be9"
)
for s in similars:
    print(s['similar_product_name'])

Kalpit Men's Basic Cotton Round Neck Half Sleeve Solid T-Shirts [Available in Many Colours]
Khadi Vastra Men Solid Full Sleeve Cotton Formal Spread Shirt - Pack of 2
Twist Men's Cotton Linen Chinese Collar Short Kurta Shirt
WearIndia Half Sleeve Solid Color Cotton Polo T Shirt for Men
Atrangi Store Men's Cotton Valar Printed Half Sleeve Round Neck T-Shirt
Colors & Blends Men's Cotton Polo T-Shirt
BRANDID Men's Satin Regular Fit Formal Wear Shirt
SHAUN Men's Full Sleeve Cotton T-Shirt
JANGOBOY Shawl Neck Solid Cotton T-Shirt
SHAUN Men's Hooded Cotton Full Sleeve T-Shirt
Colors & Blends - Women's Cotton-Lycra Polo T-shirt
Cool N Comfort Striped Men's Round Neck Black, Dark Blue, Maroon T-Shirt (CNC510_$p_Black-Mustard)
Caseria Men's Cotton Graphic Printed Half Sleeve T-Shirt - Chala Basuya
Casotec MahaKal Designer Men's Cotton Graphic T-Shirt
BlueAura 100% Cotton Polo T Shirt for Men
Patrorna Blended Women's Shirt Collar A-Line Kurta/Kurti (GR7V03)
Maharaja Men's Wear Combo of 3 PolyBlen

In [52]:
# save 20 similar products in a df
search_term = "tshirt"
search_results = df[df['product_name'].str.contains(search_term, case=False, na=False)]
print(search_results.head(20))

                               uniq_id        asin  \
23    152b7c5840ddc7ddd8b520a450930dcc  B07H9TTVNG   
36    3916fb6fc88efdd2a9320b65390b2c97  B07VKJ96MK   
65    526d18a68f0e590d230d74aedb0b28da  B0757XGJ87   
76    4cb1313b415abd5037d961f6e347a52e  B07L88MXNP   
265   c5bdd95670bed4b4d43137c6e4c9c087  B07D6RPHMP   
514   09648f8e94a486a4cdd2735bf833e846  B07MLMC3V2   
548   9c408a0959af897118bc79a9dc836aeb  B077NF2DZL   
551   23560a75acae066e723210d0e6ad45fa  B07DRJD6FY   
592   02e7078b2a98ebffd2fa8c5faca44b82  B01CQZHG26   
689   9e0fd54ca9e50cd6b74bf48f10f10519  B07XJPTXNK   
714   930f45a449db709eb09380f0eac2de3a  B07R7W318J   
776   dee837f7814a501a16b56e699ae03529  B07NDVMVSP   
804   793fa42c73390e1ccebb5f89d42b025d  B079HYNTJ5   
821   8b1fd9b0fc0c9cb35ec62ecde73ebb79  B07F5VJ5KZ   
840   99449c584b46fb3a6a9e25eaa7e519a9  B083TD11BW   
852   e099104912c045b90a0bfba6a05f3f62  B07W5WS63S   
914   845f1c4f0eec84adf1a3a717b7bf8770  B07WZF2DT7   
1010  25e5899b8f2145da94bd3b

# Optimized Similarity Search Using Qdrant and Sentence Transformers

In this section, we combine the power of Qdrant, an efficient vector search engine, with advanced sentence embeddings from the SentenceTransformer library to optimize the similarity search function for handling large datasets.

### Steps:
1. Generate High-Quality Embeddings: Using SentenceTransformer to create dense vector representations of product descriptions.
2. Utilize Qdrant for Efficient Vector Searches: Leverage Qdrant to perform fast and accurate nearest neighbor searches.

### Process:
1. Generate Embeddings Using SentenceTransformer:

* We use the all-mpnet-base-v2 model from the sentence-transformers library to generate high-quality embeddings for product descriptions.
* This model is based on state-of-the-art NLP techniques, providing superior semantic understanding of the text.

2. Index Embeddings Using Qdrant:

* Qdrant is a high-performance vector similarity search engine, designed to handle large datasets efficiently.
* We use Qdrant to index our embeddings and perform fast nearest neighbor searches.

3. Search for Similar Products Using Qdrant:

* Perform a search using Qdrant's efficient vector search capabilities to find the top similar products.

In [53]:
!pip install -U qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.10.1-py3-none-any.whl (254 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/254.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m204.8/254.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.1/254.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.64.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx[http2]>=0.20.0 (from qdrant-client)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker<3.0.0,>=2

In [54]:
new_df.head()

Unnamed: 0,uniq_id,product_name,tags
5,73fd23f631d4434fd4a41ecc1b9f5eed,Forest Club | Gym Wear | Sports Shorts| Shorts...,Meta keywords: Forest Club | Gym Wear | Sports...
6,14d7d34c3cacd6c9ed1aa57f284e37ad,PrintOctopus Graphic Printed T-Shirt for Men C...,Meta keywords: PrintOctopus Graphic Printed T-...
15,46005e6f4f253e551ede9b580cf83b73,Miss Chase Women's Solid Shoulder Cut-Out Half...,Meta keywords: Miss Chase Women's Solid Should...
20,2b1c8bedb40220c7312f0edeea2a31d2,PuJoy Women's Soft Cotton Traditional Bengali ...,Meta keywords: PuJoy Women's Cotton Traditiona...
22,8c8ef13b817747710eae99c1718c9be9,kalpit creations Men's Comfort Soft Cotton Bla...,Meta keywords: kalpit creations Men's Comfort ...


In [55]:
encoder = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [56]:
q_df = df
q_df['tags'] = q_df.apply(lambda x: encoder.encode(f"Meta keywords: {x['meta_keywords']} , Brand: {x['brand']}, Colours: {x['colour']}").tolist(), axis=1)

q_df.head()

Unnamed: 0,uniq_id,asin,product_name,medium,brand,colour,sales_price,rating,meta_keywords,tags
5,73fd23f631d4434fd4a41ecc1b9f5eed,B07X5M41BJ,Forest Club | Gym Wear | Sports Shorts| Shorts...,https://images-na.ssl-images-amazon.com/images...,[Forest Club],"[Black, Blue, Grey]",350.0,4.1,Forest Club | Gym Wear | Sports Shorts| Shorts...,"[0.03031603805720806, 0.03210923448204994, 0.0..."
6,14d7d34c3cacd6c9ed1aa57f284e37ad,B07H9RCVHQ,PrintOctopus Graphic Printed T-Shirt for Men C...,https://images-na.ssl-images-amazon.com/images...,[PrintOctopus],"[Black, Navy, Blue]",288.0,3.7,PrintOctopus Graphic Printed T-Shirt for Men C...,"[-0.022418325766921043, 0.09316255897283554, -..."
15,46005e6f4f253e551ede9b580cf83b73,B083ZKXVGN,Miss Chase Women's Solid Shoulder Cut-Out Half...,https://images-na.ssl-images-amazon.com/images...,[Miss Chase],"[Black, Maroon]",669.0,3.7,Miss Chase Women's Solid Shoulder Cut-Out Half...,"[-0.0037004766054451466, 0.028077349066734314,..."
20,2b1c8bedb40220c7312f0edeea2a31d2,B07K6PHHHM,PuJoy Women's Soft Cotton Traditional Bengali ...,https://images-na.ssl-images-amazon.com/images...,[PuJoy],"[Black, Based, Black, Based, 2, Black, White, ...",420.0,3.5,PuJoy Women's Cotton Traditional Bengali Handl...,"[0.00869398470968008, -0.022032098844647408, -..."
22,8c8ef13b817747710eae99c1718c9be9,B0759HPKCJ,kalpit creations Men's Comfort Soft Cotton Bla...,https://images-na.ssl-images-amazon.com/images...,[kalpit creations],"[Black, Bottle, green, GREEN, NAVY, BLUE, Pink...",249.0,3.2,kalpit creations Men's Comfort Soft Cotton Bla...,"[-0.03354647383093834, 0.02292823977768421, 0...."


In [None]:
qdrant_df = q_df[['uniq_id', 'product_name','brand','medium','colour','sales_price','rating', 'tags']]
qdrant_df.columns

Index(['uniq_id', 'product_name', 'brand', 'medium', 'colour', 'sales_price',
       'rating', 'tags'],
      dtype='object')

In [None]:
len(qdrant_df['tags'][5])

384

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

# Replace these with your actual Qdrant Cloud URL and, if necessary, your access token
QDRANT_URL = 'https://d7661970-e87c-4822-bf53-c05615ccda0d.us-east4-0.gcp.cloud.qdrant.io'
ACCESS_TOKEN = 'Qj8CCo-xWqfZ27QOpiK9pxapR5Q0mDUrsmROAGYYJ83KTeF6fCjcgA'

# Create Qdrant client for the cloud instance
client = QdrantClient(url=QDRANT_URL, api_key=ACCESS_TOKEN)
# Create a collection in Qdrant
collection_name = "product_vectors_updated"
vector_size = len(qdrant_df['tags'][5])  # Size of your vectors

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)




# Assume 'dataframe' is pre-loaded and vectors are already computed
points = []
index = 0
for _, row in qdrant_df.iterrows():
    point = PointStruct(
        id=index,
        payload={
            'uniq_id': row['uniq_id'],
            'product_name': row['product_name'],
            'brand': row['brand'],
            'medium': row['medium'],
            'colour': row['colour'],
            'sales_price': row['sales_price'],
            'rating': row['rating']
        },
        vector=row['tags']
    )
    points.append(point)
    index = index+1

# Batch upload points to the collection
client.upsert(collection_name=collection_name, points=points)

  client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
hits = client.search(
    collection_name="product_vectors_updated",
    query_vector=encoder.encode("Miss Chase Women's Solid Shoulder Cut-Out Half-Sleeve Round Neck Mini Dresses").tolist(),
    limit=5,
)
for hit in hits:
    print(hit.id, "score:", hit.score)

    print(qdrant_df.iloc[hit.id]['product_name'])
    print(qdrant_df.iloc[hit.id]['brand'])
    print(qdrant_df.iloc[hit.id]['sales_price'])
    print(qdrant_df.iloc[hit.id]['colour'])
    print('-------------------------------------')
    print('\n')

2 score: 0.7144219
Miss Chase Women's Solid Shoulder Cut-Out Half-Sleeve Round Neck Mini Dresses
['Miss Chase']
669.00
['Black', 'Maroon']
-------------------------------------


2805 score: 0.6413149
Miss Chase Women's Black and White Round Neck 3/4 Sleeves Mini Dress
['Miss Chase']
399.00
['Black', 'Black', 'and', 'White', 'Multi']
-------------------------------------


3657 score: 0.51426816
Style Eva - Che Guevara Printed T-Shirts Collections for Men and Women, Code 63 - Round Neck with Half Sleeves
['Style Eva']
399.00
['white', 'roundneck-halfsleeves', 'White']
-------------------------------------


2854 score: 0.50308913
GOLDEN GIRL Women's Cotton and Polyester Backless Cross Stripe Padded T-Shirt Bra(Size: 28-36 Bust Size, Black)
['Golden Girl']
365.00
['Black', 'Skin']
-------------------------------------


3813 score: 0.49957097
T-identi-T Women's Cotton Round Neck Half Sleeves T-Shirt
['T-identi-T']
375.00
['White', 'Yellow']
-------------------------------------




In [None]:
output_pickle_file = 'qdrant_data.pkl'
save_similar_products_to_pickle(qdrant_df, output_pickle_file)

Pickle file 'qdrant_data.pkl' created successfully!


In [None]:

def get_product_details(qdrant_df, hit, similar_products):
  """
  This function takes a hit from a Qdrant search and returns a dictionary containing the brand, colour, product_name, selling_price and medium for the queried product and similar products.

  Args:
      qdrant_df: DataFrame containing product information.
      hit: A hit object from a Qdrant search.
      similar_products: A list of similar products returned by the search.

  Returns:
      A dictionary containing the product details.
  """

  product_details = {}

  # Get details for the queried product
  product_details["queried_product"] = {
      "brand": qdrant_df.iloc[hit.id]["brand"],
      "colour": qdrant_df.iloc[hit.id]["colour"],
      "product_name": qdrant_df.iloc[hit.id]["product_name"],
      "selling_price": qdrant_df.iloc[hit.id]["sales_price"],
      "medium": qdrant_df.iloc[hit.id]["medium"],
  }

  # Get details for similar products
  product_details["similar_products"] = []
  for similar_product in similar_products:
    product_details["similar_products"].append({
      "brand": qdrant_df.iloc[similar_product.id]["brand"],
      "colour": qdrant_df.iloc[similar_product.id]["colour"],
      "product_name": qdrant_df.iloc[similar_product.id]["product_name"],
      "selling_price": qdrant_df.iloc[similar_product.id]["sales_price"],
      "medium": qdrant_df.iloc[similar_product.id]["medium"],
    })

  return product_details

# Example usage
hit = hits[0]
similar_products = hits[1:]
product_details = get_product_details(qdrant_df, hit, similar_products)

# Print the product details
print("Queried Product:")
print(product_details["queried_product"])

print("\nSimilar Products:")
for product in product_details["similar_products"]:
  print(product)
