# Load data

In [179]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Read the skincare CSV file into the DataFrame used by every later cell
df = pd.read_csv(filepath_or_buffer="data/skincare.csv")

# Group products by secondary category

In [180]:
# Distinct secondary categories. Rows with a missing category are skipped:
# NaN never compares equal to itself, so it cannot be used to look products up.
secondary_category = set(df['secondary_category'].dropna())

# Map each secondary category to the list of its product ids (in row order).
# A single grouped pass replaces re-filtering the whole DataFrame once per category.
category_items_dict = (
   df.groupby('secondary_category')['product_id']
   .apply(list)
   .to_dict()
)

# Get similar items based on the highlights

In [204]:
def jaccard_similarity(set_a, set_b):
   """
   Jaccard similarity of two sets: size of the intersection divided by
   size of the union. Returns 0.0 when the union is empty.
   """
   combined = set_a.union(set_b)
   # Guard against division by zero when both inputs are empty
   if len(combined) == 0:
      return 0.0
   shared = set_a.intersection(set_b)
   return len(shared) / len(combined)

def similarity_search(df, token_list):
   """
   Rank the products in ``df`` by the Jaccard similarity between their
   highlights and ``token_list``.

   :param df: A pandas DataFrame with columns ['product_id', 'product_name', 'highlights'].
      'highlights' is expected to be a ", "-separated string of tokens; a non-string
      value (e.g. NaN for a missing entry) is treated as having no tokens.
   :param token_list: An iterable of tokens to compare against.
   :return: A DataFrame with columns ['product_id', 'product_name', 'similarity_score',
      'highlights'], sorted by 'similarity_score' in descending order and re-indexed from 0.
      Rows with equal scores keep their relative order from ``df``.
   """
   # Convert the token list to a set once, outside the loop
   token_set = set(token_list)

   # One (product_id, product_name, score, highlights) tuple per input row
   similarity_scores = []

   # Walk the three needed columns in lockstep; this avoids building a Series per row
   # the way iterrows() does.
   for product_id, title, highlights in zip(df['product_id'], df['product_name'], df['highlights']):
      # Missing highlights are not strings and have no .split(); score them
      # against an empty token set instead of raising AttributeError.
      if isinstance(highlights, str):
         id_token_set = set(highlights.split(", "))
      else:
         id_token_set = set()

      similarity_score = jaccard_similarity(id_token_set, token_set)
      similarity_scores.append((product_id, title, similarity_score, highlights))

   similarity_df = pd.DataFrame(
      similarity_scores,
      columns=['product_id', 'product_name', 'similarity_score', 'highlights'],
   )

   # Stable sort so that ties are ordered deterministically (input order)
   similarity_df_sorted = similarity_df.sort_values(
      by='similarity_score', ascending=False, kind='mergesort'
   ).reset_index(drop=True)

   return similarity_df_sorted

In [215]:
# For now we only look at the 'Eye Care' category
items = category_items_dict['Eye Care']

# Query item (for now simply the first one in the category; a real choice comes later)
my_item = items[0]

# Candidates: every product of the category except the query item itself.
# Excluding by id rather than by list position keeps this correct whichever
# item ends up being assigned to my_item.
in_category = df['product_id'].isin(items)
filtered_df = df[in_category & (df['product_id'] != my_item)][['product_id', 'product_name', 'highlights']]

# The token list to compare against: the highlights of the query item.
# A missing (non-string) highlights value yields an empty token list.
query_highlights = df.loc[df['product_id'] == my_item, 'highlights'].iloc[0]
token_list = query_highlights.split(", ") if isinstance(query_highlights, str) else []

# Perform similarity search and get sorted results
similarity_results_sorted = similarity_search(filtered_df, token_list)

print("my highlights:", token_list)

# Scores and highlights only, for side-by-side comparison with the query tokens
my_sim_items = similarity_results_sorted[['similarity_score', 'highlights']]

print(similarity_results_sorted)


my highlights: ['Vegan', 'Collagen', 'Hypoallergenic', 'Loss of firmness', 'Dry', 'Combo', 'Normal']
    product_id                                       product_name  \
0      P480986                    Firming Eye Cream with Peptides   
1      P480192           Light Aura Vitamin C + Peptide Eye Cream   
2      P475201  24-7 Power Swipe Hydrating Day & Night Treatme...   
3      P383053  Do Not Age with Dr. Brandt Triple Peptide Eye ...   
4      P461165          GOOPGENES All-In-One Nourishing Eye Cream   
..         ...                                                ...   
141    P502758          Super-C Dark Circle Brightening Eye Serum   
142    P443840                                 Caffeine Eye Cream   
143    P449410                Boosted Contouring Retinol Eye Balm   
144    P448802                               Brighten-i Eye Cream   
145    P501199                       YUZU-C Eye Awakening Essence   

     similarity_score                                         highligh

In [None]:
# insert product id
