In [None]:
# !pip install -U sentence-transformers

In [None]:
!pip install nltk sentence-transformers

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt

import cv2
import matplotlib.image as mpimg
import PIL
import PIL.Image

In [None]:


# Load the data (only the first 5000 rows are read)
df = pd.read_csv('/kaggle/input/styles-image-path/styles_image_paths.csv', nrows=5000)

# Metadata columns that are joined, in this order, into one description per product
text_columns = [
    'gender',
    'masterCategory',
    'subCategory',
    'articleType',
    'baseColour',
    'season',
    'usage',
    'productDisplayName',
]

# Join the columns with single spaces, skipping missing values per row.
# Concatenating the columns with `+` propagates NaN, so one missing field would
# turn the whole description into NaN (and then into an empty string after
# fillna), discarding every other attribute of that product.
df['text'] = df[text_columns].apply(
    lambda row: ' '.join(row.dropna().astype(str)),
    axis=1,
)

# Create a new dataframe with only the 'text' column
text_df = df[['text']].copy()
# Defensive: a row with every field missing already yields '', never NaN
text_df = text_df.fillna("")



In [None]:

# Sentence-embedding model, looked up by name through sentence-transformers
encoder_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(encoder_name)

# Encode every product description in a single call; results are kept in `embeddings`
product_texts = text_df['text'].tolist()
embeddings = model.encode(product_texts)



In [None]:
# Wrap the embeddings in a DataFrame; no explicit index or column labels are given
embedding_df = pd.DataFrame(data=embeddings)

# Persist the vectors to CSV without the index. No 'id' column is written,
# so rows can only be matched back to `df` by their position.
output_csv = 'textual_embeddings_data.csv'
embedding_df.to_csv(output_csv, index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise cosine similarities between all rows of `embedding_df`
# (built in an earlier cell; nothing is re-loaded from disk here)
cosine_similarities = cosine_similarity(embedding_df)

# Find top n similar products for each product
n = 5
similar_products = {}
for i, row in enumerate(cosine_similarities):
    # Rank every row by descending similarity. The stable sort keeps ties in
    # index order, so repeated runs return the same neighbours.
    ranked = np.argsort(-row, kind='stable')
    # Remove the product itself by position instead of assuming it is ranked
    # first: rows with identical embeddings tie at the top, so slicing off
    # element 0 could drop a real neighbour and keep the product itself.
    neighbours = ranked[ranked != i][:n]
    # Store index labels of the neighbours, keyed by the product's own label
    similar_products[embedding_df.index[i]] = embedding_df.index[neighbours].tolist()


In [None]:
# Index labels of the products ranked most similar to the product with label 0
similar_products[0]

In [None]:
# Metadata rows of the products most similar to product 0.
# NOTE(review): the stored index labels are used positionally here (iloc);
# this presumably relies on `df` keeping its default 0..n-1 index — confirm.
df.iloc[similar_products[0]]

In [None]:
# Metadata rows of the products most similar to product 1 (labels used positionally via iloc)
df.iloc[similar_products[1]]

In [None]:
def load_image(img_path, resized_fac = 0.1):
    """Read an image from disk and return a scaled copy of it.

    Parameters
    ----------
    img_path : path of the image file, passed to ``plt.imread``.
    resized_fac : factor applied to both width and height (default 0.1).

    Returns
    -------
    The array returned by ``cv2.resize`` for the scaled image.
    """
    img_object = plt.imread(img_path)
    # Only the first two axes (rows, columns) are needed. Unpacking three values
    # would raise ValueError for 2-D (grayscale) arrays.
    height, width = img_object.shape[:2]
    # cv2.resize expects the target size as (width, height); clamp to 1 pixel so
    # a small factor cannot produce a zero-sized target.
    new_size = (max(1, int(width * resized_fac)), max(1, int(height * resized_fac)))
    return cv2.resize(img_object, new_size)

In [None]:
def plot_image(image_id, styles_df):
    """Draw one product image on the current axes, titled with its display name.

    `image_id` is a positional row number in `styles_df`; the row's 'image'
    value is handed to `load_image` and its 'productDisplayName' becomes the title.
    """
    record = styles_df.iloc[image_id]
    thumbnail = load_image(record['image'])
    plt.imshow(thumbnail)
    plt.title(record['productDisplayName'])

In [None]:
def plot_similar_images(query_id, similarity_dict, styles_df=df):
    """Show a query product and, in a second figure, its similar products.

    Parameters
    ----------
    query_id : key into `similarity_dict`, also used as a positional row
        number in `styles_df` for the query image.
    similarity_dict : mapping from a product id to a list of similar product ids.
    styles_df : DataFrame with 'image' and 'productDisplayName' columns;
        defaults to the notebook-level `df` as it exists when this cell runs.

    Raises
    ------
    KeyError if `query_id` is not a key of `similarity_dict`.
    """
    plot_image(query_id, styles_df=styles_df)

    # Look the neighbours up before opening the figure so a missing key does
    # not leave an empty figure behind.
    neighbour_ids = similarity_dict[query_id]

    plt.figure(figsize=(40, 10))

    # One column per neighbour instead of a fixed 5, so lists longer than five
    # entries no longer make plt.subplot fail; at least one column is required.
    n_cols = max(len(neighbour_ids), 1)
    for pos, neighbour_id in enumerate(neighbour_ids, start=1):
        plt.subplot(1, n_cols, pos)
        plt.imshow(load_image(styles_df.iloc[neighbour_id]['image']))
        plt.title(f"{neighbour_id}", fontsize=12)
    plt.show()

In [None]:
# Query product 0 followed by the products ranked most similar to it
plot_similar_images(0, similar_products, styles_df=df)

In [None]:
# Query product 1 followed by the products ranked most similar to it
plot_similar_images(1, similar_products, styles_df=df)

In [None]:
# Query product 2 followed by the products ranked most similar to it
plot_similar_images(2, similar_products, styles_df=df)

In [None]:
# Query product 6 followed by the products ranked most similar to it
plot_similar_images(6, similar_products, styles_df=df)

In [None]:
# Query product 10 followed by the products ranked most similar to it
plot_similar_images(10, similar_products, styles_df=df)

In [None]:
# Query product 100 followed by the products ranked most similar to it
plot_similar_images(100, similar_products, styles_df=df)

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Settings shared by both spellings of the iteration-limit keyword
tsne_params = dict(
    n_components=2,
    init="random",
    random_state=0,
    learning_rate="auto",
)

# scikit-learn 1.5 renamed `n_iter` to `max_iter` and later releases drop the
# old name. Try the new keyword first and fall back for older installations,
# where the unknown keyword raises TypeError at construction time.
try:
    tsne = TSNE(max_iter=300, **tsne_params)
except TypeError:
    tsne = TSNE(n_iter=300, **tsne_params)

# Project the sentence embeddings down to two dimensions for plotting
Y = tsne.fit_transform(embeddings)



In [None]:
# 2-D t-SNE projection, one scatter series per master category
fig, ax = plt.subplots(figsize=(10, 10))

for category in df.masterCategory.unique():
    # Boolean row mask selecting the points that belong to this category
    in_category = df.masterCategory == category
    ax.scatter(Y[in_category, 0], Y[in_category, 1], label=category, s=3)

ax.set_title("Master Category plotting")
ax.legend()
plt.show()

In [None]:
# 2-D t-SNE projection, one scatter series per sub category
fig, ax = plt.subplots(figsize=(10, 10))

for category in df.subCategory.unique():
    # Boolean row mask selecting the points that belong to this category
    in_category = df.subCategory == category
    ax.scatter(Y[in_category, 0], Y[in_category, 1], label=category, s=3)

ax.set_title("Sub Category plotting")
ax.legend()
plt.show()

In [None]:
# 2-D t-SNE projection, one scatter series per season value
fig, ax = plt.subplots(figsize=(10, 10))

for season_name in df.season.unique():
    # Boolean row mask selecting the points recorded for this season
    in_season = df.season == season_name
    ax.scatter(Y[in_season, 0], Y[in_season, 1], label=season_name, s=3)

ax.set_title("Season Category plotting")
ax.legend()
plt.show()