In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Define functions

In [48]:
def jaccard_similarity(set_a, set_b):
   """
   Calculate the Jaccard Similarity between two sets.
   """
   intersection = len(set_a.intersection(set_b))
   union = len(set_a.union(set_b))
   return intersection / union if union != 0 else 0.0

def similarity_search_highlights(df, token_list):
   """
   Perform similarity search based on Jaccard similarity between df and a token list.
   
   :param df: A pandas DataFrame with columns ['product_id','product_name', 'highlights'].
   :param token_list: A list of tokens to compare against (highlights).
   :return: A DataFrame with IDs and their Jaccard similarity scores, sorted by similarity score.
   """
   # Convert the token list to a set
   token_set = set(token_list)
   
   # List to store the similarity scores
   similarity_scores = []
   
   # Iterate over the rows of the DataFrame
   for index, row in df.iterrows():
      product_id = row['product_id']
      title = row['product_name']
      # Convert the tokens for this ID to a set (assumed to be a string)
      id_token_set = set(row['highlights'].split(", "))
      
      # Calculate Jaccard similarity
      similarity_score = jaccard_similarity(id_token_set, token_set)
      
      # Append the result as a tuple (id, score)
      similarity_scores.append((product_id, title, similarity_score, (row['highlights'])))
   
   # Convert the list of similarity scores to a DataFrame
   similarity_df = pd.DataFrame(similarity_scores, columns=['product_id', 'product_name', 'similarity_score_highlights', 'highlights'])
   
   # Sort the DataFrame by the 'similarity_score' column in descending order
   similarity_df_sorted = similarity_df.sort_values(by='similarity_score_highlights', ascending=False).reset_index(drop=True)
   
   similarity_df_sorted['rank_highlights'] = range(1, len(similarity_df_sorted) + 1)
   
   return similarity_df_sorted

def custom_tokenizer(text):
   return text.split(", ")

def similarity_search_ingredients(df, query):
   """
   Perform a similarity search based on cosine similarity of TF-IDF vectors.

   Parameters:
   - query (str): The input query string.
   - df (pd.DataFrame): A DataFrame containing 'id' and 'ingredients' columns.

   Returns:
   - results_df (pd.DataFrame): Rows from the original DataFrame sorted by similarity.
   """
   # Initialize TfidfVectorizer with a custom tokenizer (adjust lowercase as needed)
   vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, lowercase=False)

   # Extract the 'ingredients' column
   ingredients = df['ingredients']

   # Fit and transform the ingredients
   tfidf_matrix = vectorizer.fit_transform(ingredients)

   # Transform the query into the TF-IDF space
   query_tfidf = vectorizer.transform([query])

   # Compute cosine similarity between the query and all documents
   similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

   # Add similarity scores to the DataFrame
   df['similarity_score_ingredients'] = similarities

   # Sort the DataFrame by similarity scores in descending order
   results_df = df.sort_values(by='similarity_score_ingredients', ascending=False).reset_index(drop=True)
   
   results_df['rank_ingredients'] = range(1, len(results_df) + 1)

   return results_df

def reciprocal_rank_fusion(df_highlights, df_ingredients, k=60):
   """
   Compute Reciprocal Rank Fusion (RRF) scores based on rank_highlights and rank_ingredients.

   Parameters:
   - df_highlights (pd.DataFrame): DataFrame containing 'product_id', 'rank_highlights', and other relevant columns.
   - df_ingredients (pd.DataFrame): DataFrame containing 'product_id', 'rank_ingredients', and other relevant columns.
   - k (int): A constant for RRF computation (default=60).

   Returns:
   - combined_df (pd.DataFrame): A new DataFrame with overall RRF scores and combined ranking.
   """
   # Merge the two DataFrames on 'product_id'
   merged_df = pd.merge(
      df_highlights,  # Include all columns from df_highlights
      df_ingredients[['product_id', 'rank_ingredients']],  # Include only product_id and rank_ingredients
      on='product_id',
      how='inner'
   )

   # Fill missing ranks with a large value (e.g., very low relevance)
   merged_df['rank_highlights'] = merged_df['rank_highlights'].fillna(float('inf'))
   merged_df['rank_ingredients'] = merged_df['rank_ingredients'].fillna(float('inf'))

   # Compute the RRF score
   merged_df['rrf_score'] = (
      1 / (k + merged_df['rank_highlights']) +
      1 / (k + merged_df['rank_ingredients'])
   )

   # Sort by the RRF score in descending order
   merged_df = merged_df.sort_values(by='rrf_score', ascending=False).reset_index(drop=True)

   # Add a new rank based on the RRF score
   merged_df['overall_rank'] = range(1, len(merged_df) + 1)
   
   return merged_df


### Run code for specific product_id

In [49]:
product_id_input = "P432045" #input("Enter the product_id: ")

# load the data
df = pd.read_csv("processed_data/skincare.csv")

# get the selected product
product = df[df['product_id'] == product_id_input]

# get the product highlights and ingredients
product_highlights =  list(product['highlights'])[0].split(", ")
product_ingredients = str(product['ingredients'])

# remove the product I am searching for
df = df[(df['product_id'] != product_id_input)]#[['product_id','product_name', 'highlights']]

# perform similarity searches
highlights_similarity_results = similarity_search_highlights(df, product_highlights)
ingredients_similarity_results = similarity_search_ingredients(df, product_ingredients)

# combine similarity searches with reciprocal rank fusion algorithm
merged_results = reciprocal_rank_fusion(highlights_similarity_results, ingredients_similarity_results)

print(product['product_name'])
print(highlights_similarity_results['product_name'].head(5))
print(ingredients_similarity_results['product_name'].head(5))
print(merged_results['product_name'].head(5))

2    GENIUS Liquid Collagen Lip Treatment
Name: product_name, dtype: object
0    GENIUS Ultimate Anti-Aging Vitamin C+ Serum
1               GENIUS Ultimate Anti-Aging Cream
2           GENIUS Ultimate Anti-Aging Eye Cream
3                   GENIUS Liquid Collagen Serum
4           GENIUS Sleeping Collagen Moisturizer
Name: product_name, dtype: object
0                Creamy Eye Treatment with Avocado
1    Ultra Facial Moisturizing Cream with Squalane
2           Benefiance Wrinkle Smoothing Eye Cream
3             SHIKULIME Mega Hydrating Moisturizer
4                     YUZU-C Eye Awakening Essence
Name: product_name, dtype: object
0               Benefiance Wrinkle Smoothing Cream
1                Creamy Eye Treatment with Avocado
2                                    Aloe Vera Gel
3    Ultra Facial Moisturizing Cream with Squalane
4           Benefiance Wrinkle Smoothing Eye Cream
Name: product_name, dtype: object


