# Load data

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the CSV data into a DataFrame
df = pd.read_csv("data/skincare.csv")

# Cluster based on secondary category

In [2]:
secondary_category = set(df['secondary_category'])

category_items_dict = {}
for category in secondary_category:
   items = list(df[df['secondary_category'] == category]['product_id'])
   category_items_dict[category] = items

# Get similar items based on the highlights

In [3]:
def jaccard_similarity(set_a, set_b):
   """
   Calculate the Jaccard Similarity between two sets.
   """
   intersection = len(set_a.intersection(set_b))
   union = len(set_a.union(set_b))
   return intersection / union if union != 0 else 0.0

def similarity_search(df, token_list):
   """
   Perform similarity search based on Jaccard similarity between IDs and a token list.
   
   :param df: A pandas DataFrame with columns ['id', 'tokens'].
   :param token_list: A list of tokens to compare against.
   :return: A DataFrame with IDs and their Jaccard similarity scores, sorted by similarity score.
   """
   # Convert the token list to a set
   token_set = set(token_list)
   
   # List to store the similarity scores
   similarity_scores = []
   
   # Iterate over the rows of the DataFrame
   for index, row in df.iterrows():
      product_id = row['product_id']
      title = row['product_name']
      # Convert the tokens for this ID to a set (assumed to be a string)
      id_token_set = set(row['highlights'].split(", "))
      
      # Calculate Jaccard similarity
      similarity_score = jaccard_similarity(id_token_set, token_set)
      
      # Append the result as a tuple (id, score)
      similarity_scores.append((product_id, title, similarity_score, (row['highlights'])))
   
   # Convert the list of similarity scores to a DataFrame
   similarity_df = pd.DataFrame(similarity_scores, columns=['product_id', 'product_name', 'similarity_score', 'highlights'])
   
   # Sort the DataFrame by the 'similarity_score' column in descending order
   similarity_df_sorted = similarity_df.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)
   
   return similarity_df_sorted

In [6]:
# for now we are taking the eye cream
items = category_items_dict['Eye Care']

# item I am looking for (now we only took the first one later we shall do the real one)
my_item = items[0]

# here I have remove the item I am searching for, this is where I take the product of certian secondary kind
#filtered_df = df[df['product_id'].isin(items[1:])][['product_id','product_name', 'highlights']]
filtered_df = df[['product_id','product_name', 'highlights']]
print(len(filtered_df))


# The token list to compare against
token_list = list(df[df['product_id'] == my_item]['highlights'])[0].split(", ")

# Perform similarity search and get sorted results
similarity_results_sorted = similarity_search(filtered_df, token_list)

print("my highlights:", token_list)

# Output sorted similarity scores
my_sim_items = similarity_results_sorted[['similarity_score', 'highlights']]

print(similarity_results_sorted)


1927
my highlights: ['Vegan', 'Collagen', 'Hypoallergenic', 'Loss of firmness', 'Dry', 'Combo', 'Normal']
     product_id                                 product_name  \
0       P388262         GENIUS Ultimate Anti-Aging Eye Cream   
1       P392945  GENIUS Ultimate Anti-Aging Vitamin C+ Serum   
2       P384537             GENIUS Ultimate Anti-Aging Cream   
3       P421277                 GENIUS Liquid Collagen Serum   
4       P439055         GENIUS Sleeping Collagen Moisturizer   
...         ...                                          ...   
1922    P479633               Pore Remedy Purifying Mud Mask   
1923    P474371               Evercalm Gentle Cleansing Milk   
1924    P504056  Perfect Canvas Smooth, Prep & Plump Essence   
1925    P470049           Clearcalm Clarifying Clay Cleanser   
1926    P455612     Vitamin C & Bearberry Instant Glow Serum   

      similarity_score                                         highlights  
0                1.000  Vegan, Collagen, Hypoalle

In [22]:
from datasketch import MinHash, MinHashLSH

highlights = df[['product_id', 'highlights']]

chosen_highlight_row = list(df[df['product_id'] == my_item]['highlights'])[0].split(", ")

def crate_mihashes(highlights, threshold=0.5, num_perm=128):
   minhashes = {}
   lsh = MinHashLSH(threshold=0.5, num_perm=128)
   for _, row in highlights.iterrows():
      product_id = row['product_id']
      tokens = row['highlights'].split(", ")
      m = MinHash(num_perm=128)
      for token in tokens:
         m.update(token.encode('utf8'))
      minhashes[product_id] = m
      lsh.insert(product_id, m)
   return minhashes, lsh


minhashes, lsh = crate_mihashes(highlights)

# Query for similar items
chosen_minhash = minhashes[my_item]
similar_product_ids = lsh.query(chosen_minhash)
similar_products = df[df['product_id'].isin(similar_product_ids)]['product_id']
print(f"Products similar to the chosen one (ID: {my_item}):\n{similar_products}")

Products similar to the chosen one (ID: P388262):
0       P439055
1       P421277
2       P432045
3       P311143
4       P384537
6       P392945
9       P453818
11      P388262
13      P447504
15      P456990
16      P504443
31      P503197
354     P453822
569     P503809
844     P455610
1024    P481825
1586    P480192
Name: product_id, dtype: object


In [None]:
items_in_both = set(list(similar_products)).intersection(set(list(similarity_results_sorted[['product_id']])))


print(items_in_both)
print(my_item)

set()
P388262
