In [1]:
"""
This program extracts product information from a website and finds similar products.
Author: [Suhail Parakkal]
Date: [29-04-2023]
"""

'\nThis program extracts product information from a website and finds similar products.\nAuthor: [Suhail Parakkal]\nDate: [29-04-2023]\n'

## **Import Libraries**

The below two cells are installing required libraries and downloading required files. Please run these first time. After running once comment these for better performance.

In [2]:
# Run this cell only the first time. 
# After running once comment the below code for fast response
#!pip install -r 'https://raw.githubusercontent.com/suhail767/similarityFinder/main/requirements.txt'


In [3]:
# Run this cell only the first time. 
# After running once comment the below code for fast response
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2023-05-17 05:20:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-05-17 05:20:42--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-05-17 05:20:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [4]:
#!python -m spacy download en_core_web_md


In [5]:
#!pip install keras tensorflow


In [6]:
import pandas as pd
import numpy as np
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from sklearn.cluster import KMeans
import requests
import random
import json
from PIL import Image
import requests
from io import BytesIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


# Extract Data from website Json file

In [7]:

def get_all_products(url):
    all_products = []
    page = 1
    while True:
        response = requests.get(f"{url}?page={page}")
        if response.ok:
            products = json.loads(response.content)["products"]
            if not products:
                break
            all_products.extend(products)
            page += 1
        else:
            break

    print(f"Retrieved {len(all_products)} products from {url}.")
    return all_products


# Provide the inputs for the program here

In [8]:
# Set the base URL for the website
base_url = 'https://www.woolsboutiqueuomo.com//'
data_url = "https://www.woolsboutiqueuomo.com/collections/all/products.json"

all_products = get_all_products(data_url)

# Save the combined dataset as a JSON file
with open("productInfo.json", "w") as f:
    json.dump(all_products, f)


Retrieved 669 products from https://www.woolsboutiqueuomo.com/collections/all/products.json.


# Load Dataset

In [9]:
# Load data from JSON file
with open('productInfo.json', 'r') as f:
    data = json.load(f)

# Create a list of dictionaries for each product with required features
products = []
for p in data:
    variants = p.get('variants', [])  # Get the variants, or an empty list if not available
    for v in variants:
        product_dict = {'product_id': str(p['id']), 'title': p['title'],
                        'vendor': p['vendor'], 'product_type': p['product_type'],
                        'tags': p['tags'], 'handle': p['handle'],
                        'images': p['images'], 'price': v.get('price'), 'body_html': p['body_html']} 

        products.append(product_dict)

# Print the list of product dictionaries
print(len(products))
print(products[:3])


1284
[{'product_id': '6812735111215', 'title': '19andreas47 Handmade Cashmere Shawl Plaid Multicolor', 'vendor': "19 ANDREA'S 47", 'product_type': 'Scarves', 'tags': [], 'handle': 'cashmere-shrug-stole-shawl-multicolor', 'images': [{'id': 30455829889071, 'created_at': '2021-12-10T01:02:35+01:00', 'position': 1, 'updated_at': '2021-12-10T01:02:37+01:00', 'product_id': 6812735111215, 'variant_ids': [], 'src': 'https://cdn.shopify.com/s/files/1/0527/9877/products/19andreas47-cashmere-shawl-plaid-multicolor_01.jpg?v=1639094557', 'width': 800, 'height': 800}, {'id': 30455829856303, 'created_at': '2021-12-10T01:02:35+01:00', 'position': 2, 'updated_at': '2021-12-10T01:02:39+01:00', 'product_id': 6812735111215, 'variant_ids': [], 'src': 'https://cdn.shopify.com/s/files/1/0527/9877/products/19andreas47-cashmere-shawl-plaid-multicolor_03.jpg?v=1639094559', 'width': 800, 'height': 800}, {'id': 30455829921839, 'created_at': '2021-12-10T01:02:35+01:00', 'position': 3, 'updated_at': '2021-12-10T01:

# Dataframe creation and preprocessing

In [10]:
# Create a pandas dataframe from products list
df = pd.DataFrame(products)

df.head(10)

Unnamed: 0,product_id,title,vendor,product_type,tags,handle,images,price,body_html
0,6812735111215,19andreas47 Handmade Cashmere Shawl Plaid Mult...,19 ANDREA'S 47,Scarves,[],cashmere-shrug-stole-shawl-multicolor,"[{'id': 30455829889071, 'created_at': '2021-12...",1250.0,"<meta charset=""utf-8"">\n<p data-mce-fragment=""..."
1,6812735275055,19andreas47 Handmade Cashmere Shawl Plaid Roller,19 ANDREA'S 47,Scarves,[],handamde-cashmere-scarf-19andreas47,"[{'id': 30455886512175, 'created_at': '2021-12...",1250.0,"<meta charset=""utf-8"">\n<p data-mce-fragment=""..."
2,6813243408431,Amelie Flowers Cashmere Scarf Blue,19 ANDREA'S 47,Scarves,[],amelie-flowers-cashmere-scarf,"[{'id': 30426340163631, 'created_at': '2021-12...",460.0,"<meta charset=""utf-8"">\n<p data-mce-fragment=""..."
3,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."
4,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."
5,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."
6,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."
7,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."
8,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."
9,8392243675465,Borriello Dark Blue Classic Denim Shirt,BORRIELLO,Shirts,[WHATS NEW],borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION..."


In [11]:
# Group the DataFrame by product identifier
grouped = df.groupby('product_id')

# Iterate over each group
for product_id, group in grouped:
    # Get the unique prices within the group
    unique_prices = group['price'].unique()

    # Check if there are different prices among the variants
    if len(unique_prices) > 1:
        print(f"Product ID: {product_id}")
        print("Different prices among variants:")
        for variant_price in unique_prices:
            print(f"Price: {variant_price}")
        print()

# Check if different prices are found
if not any(len(group['price'].unique()) > 1 for _, group in grouped):
    print("No products with different prices among variants found.")


No products with different prices among variants found.


In [12]:
df.shape

(1284, 9)

In [13]:

# drop duplicates based on product_id
df.drop_duplicates(subset='product_id', keep='first', inplace=True)
# remove any NaN values
df = df.dropna()
# convert 'tags' column to string type
df['tags'] = df['tags'].astype(str)

# Preprocess the price data
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df = df.dropna(subset=['price'])
df = df[df['price'] > 0]

df.shape

(669, 9)

In [14]:
# Feature Engineering
df['price_feature'] = df['price']
scaler = MinMaxScaler()
df['price_feature'] = scaler.fit_transform(df[['price_feature']])

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Define stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text data
def clean_text(text):
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in stop_words)

    # Lemmatize words
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())

    return text

# Apply text cleaning to the title and tags columns
df['title'] = df['title'].apply(clean_text)
df['tags'] = df['tags'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [16]:
# Add URL to product and URL to image columns to the DataFrame
df['product_url'] = df.apply(lambda row: f"{base_url}product/{row['handle']}", axis=1)
df['image_url'] = df.apply(lambda row: row['images'][0]['src'] if row['images'] else None, axis=1)


In [17]:
import re

def extract_color(text):
    colors = ['red', 'blue', 'green', 'white', 'black', 'yellow', 'orange', 'purple']
    matches = re.findall(r'\b(?:{})\b'.format('|'.join(colors)), text, flags=re.IGNORECASE)
    if matches:
        return matches[0]
    else:
        return None

def extract_material(text):
    materials = ['cotton', 'nylon', 'leather', 'silk', 'wool', 'polyester']
    matches = re.findall(r'\b(?:{})\b'.format('|'.join(materials)), text, flags=re.IGNORECASE)
    if matches:
        return matches[0]
    else:
        return None

def extract_stitch(text):
    stitches = ['handmade', 'machine-made', 'sewn', 'knitted']
    matches = re.findall(r'\b(?:{})\b'.format('|'.join(stitches)), text, flags=re.IGNORECASE)
    if matches:
        return matches[0]
    else:
        return None

def extract_fit(text):
    fits = ['regular', 'slim', 'loose', 'tight']
    matches = re.findall(r'\b(?:{})\b'.format('|'.join(fits)), text, flags=re.IGNORECASE)
    if matches:
        return matches[0]
    else:
        return None

# Apply the extraction functions to the "body_html" field
df['color'] = df['body_html'].apply(extract_color)
df['material'] = df['body_html'].apply(extract_material)
df['stitch'] = df['body_html'].apply(extract_stitch)
df['fit'] = df['body_html'].apply(extract_fit)


In [18]:
df.head()

Unnamed: 0,product_id,title,vendor,product_type,tags,handle,images,price,body_html,price_feature,product_url,image_url,color,material,stitch,fit
0,6812735111215,19andreas47 handmade cashmere shawl plaid mult...,19 ANDREA'S 47,Scarves,,cashmere-shrug-stole-shawl-multicolor,"[{'id': 30455829889071, 'created_at': '2021-12...",1250.0,"<meta charset=""utf-8"">\n<p data-mce-fragment=""...",0.638418,https://www.woolsboutiqueuomo.com//product/cas...,https://cdn.shopify.com/s/files/1/0527/9877/pr...,,,handmade,
1,6812735275055,19andreas47 handmade cashmere shawl plaid roller,19 ANDREA'S 47,Scarves,,handamde-cashmere-scarf-19andreas47,"[{'id': 30455886512175, 'created_at': '2021-12...",1250.0,"<meta charset=""utf-8"">\n<p data-mce-fragment=""...",0.638418,https://www.woolsboutiqueuomo.com//product/han...,https://cdn.shopify.com/s/files/1/0527/9877/pr...,blue,,handmade,
2,6813243408431,amelie flower cashmere scarf blue,19 ANDREA'S 47,Scarves,,amelie-flowers-cashmere-scarf,"[{'id': 30426340163631, 'created_at': '2021-12...",460.0,"<meta charset=""utf-8"">\n<p data-mce-fragment=""...",0.19209,https://www.woolsboutiqueuomo.com//product/ame...,https://cdn.shopify.com/s/files/1/0527/9877/pr...,blue,,Handmade,
3,8392243675465,borriello dark blue classic denim shirt,BORRIELLO,Shirts,whats new,borriello-dark-blue-classic-denim-shirt,"[{'id': 49402581188937, 'created_at': '2023-04...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION...",0.045198,https://www.woolsboutiqueuomo.com//product/bor...,https://cdn.shopify.com/s/files/1/0527/9877/fi...,blue,cotton,,regular
11,8402364531017,borriello light blue white striped shirt popli...,BORRIELLO,Shirts,whats new,borriello-light-blue-white-striped-shirt,"[{'id': 49541847220553, 'created_at': '2023-05...",200.0,"<meta charset=""utf-8"">\n<p><strong>DESCRIPTION...",0.045198,https://www.woolsboutiqueuomo.com//product/bor...,https://cdn.shopify.com/s/files/1/0527/9877/fi...,blue,cotton,,regular


In [19]:
# Load VGG model for image processing
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [20]:
import concurrent.futures
from functools import partial

# Function to resize images
def resize_image(img, size=(224, 224)):
    img = img.resize(size)
    return img

# Modified get_image_features function to include image resizing
def get_image_features(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img = resize_image(img)
        img_data = np.array(img)
        img_data = np.expand_dims(img_data, axis=0)
        img_data = preprocess_input(img_data)
        img_features = model.predict(img_data)
        img_features = img_features.flatten()
        return img_features
    except:
        # Return a placeholder feature vector if image processing fails
        return np.zeros((4096,))

# Drop rows with no image URL
df = df.dropna(subset=['image_url'])

# Extract features from images in parallel
image_urls = df['image_url'].tolist()
with concurrent.futures.ThreadPoolExecutor() as executor:
    image_features = list(executor.map(partial(get_image_features), image_urls))

# Update the dataframe with image features
df['image_features'] = image_features





# Generate Word embeddings

In [21]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.word2vec.txt'
glove2word2vec(glove_file, word2vec_output_file)

from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)


  glove2word2vec(glove_file, word2vec_output_file)


In [22]:
# Function to generate word embeddings for a text string
def get_word_embeddings(text):
    # Split text into individual words
    words = text.split()

    # Initialize an empty array for storing the embeddings
    embeddings = []

    # Iterate over each word in the text
    for word in words:
        # Check if the word is in the vocabulary
        if word in word_vectors.key_to_index:
            # If the word is in the vocabulary, add its embedding to the list
            embeddings.append(word_vectors[word])

    # If no valid embeddings were found, return None
    if len(embeddings) == 0:
        return None

    # Otherwise, calculate the mean of the embeddings and return the result
    return np.mean(embeddings, axis=0)

# Generate word embeddings for each product
df['title_vec'] = df['title'].apply(get_word_embeddings)
df['tag_vec'] = df['tags'].apply(get_word_embeddings)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between the specified product and all other products based on title
title_similarities = cosine_similarity(df['title_vec'].tolist())

# Calculate the cosine similarity between the specified product and all other products based on tags
tag_similarities = []
for i, emb1 in enumerate(df['tag_vec']):
    if isinstance(emb1, np.ndarray) and not np.isnan(emb1).any():
        sim_row = []
        for j, emb2 in enumerate(df['tag_vec']):
            if i != j and isinstance(emb2, np.ndarray) and not np.isnan(emb2).any():
                similarity = cosine_similarity([emb1], [emb2])[0][0]
                sim_row.append(similarity)
        tag_similarities.append(sim_row)

tag_similarities = np.array(tag_similarities)



# Calculate the cosine similarity between the specified product and all other products based on images
image_similarities = cosine_similarity(df['image_features'].tolist())

price_features = np.array(df['price_feature'].tolist())
price_features = np.expand_dims(price_features, axis=1)  # Reshape to a 2D array
# Calculate the cosine similarity between the specified product and all other products based on price
price_similarities = cosine_similarity(price_features)

# Resize the matrices to ensure compatibility for element-wise addition
title_similarities_resized = np.resize(title_similarities, (len(df), len(df)))
tag_similarities_resized = np.resize(tag_similarities, (len(df), len(df)))
image_similarities_resized = np.resize(image_similarities, (len(df), len(df)))
price_similarities_resized = np.resize(price_similarities, (len(df), len(df)))
# Combine the similarities from different features
total_similarities = (0.4 * title_similarities_resized) + (0.3 * tag_similarities_resized) + (0.2 * image_similarities_resized) + (0.1 * price_similarities_resized)
# Optionally, you can normalize the total similarities to have values between 0 and 1
total_similarities_normalized = (total_similarities - total_similarities.min()) / (total_similarities.max() - total_similarities.min())


In [None]:
def apply_clustering(total_similarities_normalized, n_clusters=5):
    # Applying KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(total_similarities)

    return kmeans.labels_


In [None]:
df['cluster'] = apply_clustering(total_similarities_normalized)


In [None]:
import matplotlib.pyplot as plt

def plot_elbow(data, min_clusters, max_clusters):
    wcss = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    plt.plot(range(min_clusters, max_clusters + 1), wcss)
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.title('Elbow Method')
    plt.show()


In [None]:
min_clusters = 2
max_clusters = 10
plot_elbow(total_similarities, min_clusters, max_clusters)


In [None]:
def evaluate_clusters_silhouette(data, min_clusters, max_clusters):
    silhouette_scores = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(data, labels)
        silhouette_scores.append((n_clusters, score))
    return silhouette_scores


In [None]:
min_clusters = 2
max_clusters = 10
cluster_scores = evaluate_clusters_silhouette(total_similarities, min_clusters, max_clusters)


In [None]:
for n_clusters, score in cluster_scores:
    print(f"Number of clusters: {n_clusters}, Silhouette score: {score}")


# Get similarity score by calculating Cosine Similarity

In [None]:
def get_similar_products(product_id, num_similar=5, similarity_threshold=0.5):
    # Get the embedding vector for the specified product
    embedding = df.loc[df['product_id'] == product_id, 'title_vec'].values[0]
    
    # Calculate the cosine similarity between the specified product and all other products
    similarities = cosine_similarity(df['title_vec'].tolist(), [embedding])
    dissimilarities = 1 - similarities
    # Get the indices of the most similar products
    similar_indices = np.argsort(similarities.ravel())[::-1]

    # Exclude the specified product from the list of similar products
    selected_index = df[df['product_id'] == product_id].index[0]
    similar_indices = np.delete(similar_indices, np.where(similar_indices == selected_index))[:num_similar]

    # Get the details of the specified product
    product = df.loc[df['product_id'] == product_id].iloc[0]
    
    print(f"\033[1mSelected Product:\033[0m")
    print(f"Product ID: {product['product_id']}")
    print(f"Title: {product['title']}")
    print(f"Product Type: {product['product_type']}")
    print(f"Vendor: {product['vendor']}")
    print(f"Tags: {product['tags']}")
    print(f"Product URL: {product['product_url']}")
    print(f"Image URL: {product['image_url']}")
    
    # Open the image and resize it
    response = requests.get(product['image_url'])
    img = Image.open(BytesIO(response.content))
    img = img.resize((300, 300))  # Change the dimensions as per your requirement
    img.show()
    
    print("\n******************************\n")
    print("\033[1mSimilar Products:\033[0m")
    for i in similar_indices:
        similarity_score = similarities[i][0]
        if similarity_score >= similarity_threshold:
            product = df.iloc[i]
            if df.iloc[i]['cluster'] == df.loc[df['product_id'] == product_id, 'cluster'].values[0]:
                print(f"\nProduct ID: {product['product_id']}")
                print(f"Title: {product['title']}")
                print(f"Product Type: {product['product_type']}")
                print(f"Vendor: {product['vendor']}")
                print(f"Tags: {product['tags']}")
                print(f"Product URL: {product['product_url']}")
                print(f"Image URL: {product['image_url']}")
                print(f"Cluster: {product['cluster']}")
                
                print("\nDissimilarities:")
                dissimilarity_info = []
                if df.loc[df['product_id'] == product_id, 'color'].values[0] != product['color']:
                    dissimilarity_info.append(f"Color: {product['color']}")
                if df.loc[df['product_id'] == product_id, 'material'].values[0] != product['material']:
                    dissimilarity_info.append(f"Material: {product['material']}")
                if df.loc[df['product_id'] == product_id, 'stitch'].values[0] != product['stitch']:
                    dissimilarity_info.append(f"Stitch: {product['stitch']}")
                if df.loc[df['product_id'] == product_id, 'fit'].values[0] != product['fit']:
                    dissimilarity_info.append(f"Fit: {product['fit']}")

                if len(dissimilarity_info) == 0:
                    print("None")
                else:
                    print(", ".join(dissimilarity_info))

            
                print(f"Similarity score: {similarity_score:.4f}")

                # Open the image and resize it
                response = requests.get(product['image_url'])
                img = Image.open(BytesIO(response.content))
                img = img.resize((300, 300))  # Change the dimensions as per your requirement
                img.show()

                cluster_label = product['cluster']
                cluster_products = df[df['cluster'] == cluster_label]
                print("\n\033[1mOther products in the same cluster:\033[0m")
                count = 0
                for _, cluster_product in cluster_products.iterrows():
                    if cluster_product['product_id'] != product_id:
                        print(f"\nProduct ID: {cluster_product['product_id']}")
                        print(f"Title: {cluster_product['title']}")
                        print(f"Product Type: {cluster_product['product_type']}")
                        print(f"Vendor: {cluster_product['vendor']}")
                        print(f"Tags: {cluster_product['tags']}")
                        # Open the image and resize it
                        response = requests.get(cluster_product['image_url'])
                        img = Image.open(BytesIO(response.content))
                        img = img.resize((300, 300))  # Change the dimensions as per your requirement
                        img.show()

                        dissimilarity_info = []
                        if df.loc[df['product_id'] == product_id, 'color'].values[0] != cluster_product['color']:
                            dissimilarity_info.append(f"Color: {cluster_product['color']}")
                        if df.loc[df['product_id'] == product_id, 'material'].values[0] != cluster_product['material']:
                            dissimilarity_info.append(f"Material: {cluster_product['material']}")
                        if df.loc[df['product_id'] == product_id, 'stitch'].values[0] != cluster_product['stitch']:
                            dissimilarity_info.append(f"Stitch: {cluster_product['stitch']}")
                        if df.loc[df['product_id'] == product_id, 'fit'].values[0] != cluster_product['fit']:
                            dissimilarity_info.append(f"Fit: {cluster_product['fit']}")

                        if len(dissimilarity_info) > 0:
                            print("\nDissimilarities:")
                            print(", ".join(dissimilarity_info))
                        else:
                            print("\nNo dissimilarities")

                        count += 1
                        if count >= 3:
                            print("\nThese three products above are from the same cluster\n")
                            print("*****************************************************")
                            break
            print()
  

# Find similar products for a Product ID based on the Similarity score

In [None]:
# Get a random product ID from the df['product_id'] column
product_identifier = random.choice(df['product_id'].values)

if product_identifier not in df['product_id'].values:
    print(f"The product ID {product_identifier} does not exist in the DataFrame.")
else:
    get_similar_products(product_identifier, num_similar=5)
