# Similar items based on ingredients and highlights

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Calculate similar items using ingredients
To calculate similar items using ingredients we choose to embed ingredients with TF-IDF. This was chosen because TF-IDF will take into account the uniqueness of ingredients. For example water is a frequent ingredient in the list which does not tell us a lot about the product. Where as salicylic acid does not occur as often leading to higher importance. To compute the similarity between products we choose cosine similarity because TF-IDF gives results in the form of embedding.

#### Display ingredients

In [5]:
df = pd.read_csv("processed_data/skincare.csv")
# Select only 'Name' and 'Ingredients' columns
df_selected = df[['product_name', 'ingredients']]

# Display the first 5 rows
print(df_selected.head())

                                        product_name  \
0               GENIUS Sleeping Collagen Moisturizer   
1                       GENIUS Liquid Collagen Serum   
2               GENIUS Liquid Collagen Lip Treatment   
3  SUBLIME DEFENSE Ultra Lightweight UV Defense F...   
4                   GENIUS Ultimate Anti-Aging Cream   

                                         ingredients  
0  Collagen, Water, Ethylhexyl Palmitate, Oryza S...  
1  Collagen, Water, Propanediol, Isononyl Isonona...  
2  Collagen, Water, Glycerin, Isononyl Isononanoa...  
3  Octinoxate 75%, Titanium Dioxide 2%, Zinc Oxid...  
4  Water, Caprylic/Capric Triglyceride, Hydrogena...  


#### Define functions

In [6]:
def custom_tokenizer(text):
   return text.split(", ")

def similarity_search_ingredients(df, query):
   """
   Perform a similarity search based on cosine similarity of TF-IDF vectors.

   Parameters:
   - query (str): The input query string.
   - df (pd.DataFrame): A DataFrame containing 'id' and 'ingredients' columns.

   Returns:
   - results_df (pd.DataFrame): Rows from the original DataFrame sorted by similarity.
   """
   # Initialize TfidfVectorizer with a custom tokenizer (adjust lowercase as needed)
   vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, lowercase=False)

   # Extract the 'ingredients' column
   ingredients = df['ingredients']

   # Fit and transform the ingredients
   tfidf_matrix = vectorizer.fit_transform(ingredients)

   # Transform the query into the TF-IDF space
   query_tfidf = vectorizer.transform([query])

   # Compute cosine similarity between the query and all documents
   similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

   # Add similarity scores to the DataFrame
   df['similarity_score_ingredients'] = similarities

   # Sort the DataFrame by similarity scores in descending order
   results_df = df.sort_values(by='similarity_score_ingredients', ascending=False).reset_index(drop=True)
   
   results_df['rank_ingredients'] = range(1, len(results_df) + 1)

   return results_df

### Calculate similarity using the highlights
In the dataset we choose we were not provided with description but rather highlights of a product. Those are sets of words that describe the product, so therefore there was no need to use the minhashing as the length of sets is not big to begging with. To compare the sets we choose to use Jaccard Similarity because the more highlight product have in common the more similar they are.

#### Display a few rows of highlights

In [7]:
df_selected = df[['product_name', 'highlights']]

# Display the first 5 rows
print(df_selected.head())

                                        product_name  \
0               GENIUS Sleeping Collagen Moisturizer   
1                       GENIUS Liquid Collagen Serum   
2               GENIUS Liquid Collagen Lip Treatment   
3  SUBLIME DEFENSE Ultra Lightweight UV Defense F...   
4                   GENIUS Ultimate Anti-Aging Cream   

                                          highlights  
0  Vegan, Loss of firmness, Collagen, Hypoallerge...  
1  Vegan, Loss of firmness, Collagen, Hypoallerge...  
2  Vegan, Loss of firmness, Plumping, Collagen, H...  
3  Vegan, Hypoallergenic, UV Protection, SPF, Dry...  
4  Vegan, Collagen, Hypoallergenic, Loss of firmn...  


#### Define functions

In [None]:
def jaccard_similarity(set_a, set_b):
   """
   Calculate the Jaccard Similarity between two sets.
   """
   intersection = len(set_a.intersection(set_b))
   union = len(set_a.union(set_b))
   return intersection / union if union != 0 else 0.0

def similarity_search_highlights(df, token_list):
   """
   Perform similarity search based on Jaccard similarity between df and a token list.
   
   :param df: A pandas DataFrame with columns ['product_id','product_name', 'highlights'].
   :param token_list: A list of tokens to compare against (highlights).
   :return: A DataFrame with IDs and their Jaccard similarity scores, sorted by similarity score.
   """
   # Convert the token list to a set
   token_set = set(token_list)
   
   # List to store the similarity scores
   similarity_scores = []
   
   # Iterate over the rows of the DataFrame
   for index, row in df.iterrows():
      product_id = row['product_id']
      title = row['product_name']
      # Convert the tokens for this ID to a set 
      id_token_set = set(row['highlights'].split(", "))
      
      # Calculate Jaccard similarity
      similarity_score = jaccard_similarity(id_token_set, token_set)
      
      # Append the result as a tuple (id, score)
      similarity_scores.append((product_id, title, similarity_score, (row['highlights'])))
   
   # Convert the list of similarity scores to a DataFrame
   similarity_df = pd.DataFrame(similarity_scores, columns=['product_id', 'product_name', 'similarity_score_highlights', 'highlights'])
   
   # Sort the DataFrame by the 'similarity_score' column in descending order
   similarity_df_sorted = similarity_df.sort_values(by='similarity_score_highlights', ascending=False).reset_index(drop=True)
   
   similarity_df_sorted['rank_highlights'] = range(1, len(similarity_df_sorted) + 1)
   
   return similarity_df_sorted

### Combining similarities based on highlight and ingredients
Jaccard similarity measure is in range [0, 1] and cosine similarity is in range [-1, 1] which means that we can not simply calculate the average or weighted sum. Due to that reason we choose to use reciprocal rank fusiona algorithm. To use it we need to rank our similarities tables which is already done in their respective functions. 
The **Reciprocal Rank Fusion (RRF)** score for a product _d_ is calculated as:
$$
RRF(d) = \sum_{i=1}^{N} \frac{1}{r_i(d) + k}
\text{}
$$
Where:
- $r_i (d)$ is the rank of product $d$ in the $i^{th}$ ranked list.
- $k$ is a constant (typically $k = 60$), used to prevent division by zero and to adjust the impact of higher ranks.
- $N$ is the number of ranked lists (models or sources).

The reciprocal rank is typically defined as $\frac{1}{r_i(d)}$, where higher ranks give more weight to the document.


In [9]:
def reciprocal_rank_fusion(df_highlights, df_ingredients, k=60):
   """
   Compute Reciprocal Rank Fusion (RRF) scores based on rank_highlights and rank_ingredients.

   Parameters:
   - df_highlights (pd.DataFrame): DataFrame containing 'product_id', 'rank_highlights', and other relevant columns.
   - df_ingredients (pd.DataFrame): DataFrame containing 'product_id', 'rank_ingredients', and other relevant columns.
   - k (int): A constant for RRF computation (default=60).

   Returns:
   - combined_df (pd.DataFrame): A new DataFrame with overall RRF scores and combined ranking.
   """
   # Merge the two DataFrames on 'product_id'
   merged_df = pd.merge(
      df_highlights,  # Include all columns from df_highlights
      df_ingredients[['product_id', 'rank_ingredients']],  # Include only product_id and rank_ingredients
      on='product_id',
      how='inner'
   )

   # Fill missing ranks with a large value (e.g., very low relevance)
   merged_df['rank_highlights'] = merged_df['rank_highlights'].fillna(float('inf'))
   merged_df['rank_ingredients'] = merged_df['rank_ingredients'].fillna(float('inf'))

   # Compute the RRF score
   merged_df['rrf_score'] = (
      1 / (k + merged_df['rank_highlights']) +
      1 / (k + merged_df['rank_ingredients'])
   )

   # Sort by the RRF score in descending order
   merged_df = merged_df.sort_values(by='rrf_score', ascending=False).reset_index(drop=True)

   # Add a new rank based on the RRF score
   merged_df['overall_rank'] = range(1, len(merged_df) + 1)
   
   return merged_df


### Run code for specific product_id

In [10]:
def get_similar_items(product_id, df, n = 5):
   """
   Retrieve the top N products most similar to a given product based on highlights and ingredients.

   This function takes a product ID, performs similarity searches on the product's highlights and ingredients, 
   and combines the results using a reciprocal rank fusion algorithm. It returns the top N most similar products.

   Parameters:
   ----------
   product_id : int or str
      The ID of the product for which similar items are being searched.
   n : int, optional
      The number of similar products to return. Default is 5.

   Returns:
    - merged_results (pd.DataFrame): A new DataFrame containing top n similar items
   """

   # get the selected product
   product = df[df['product_id'] == product_id]

   # get the product highlights and ingredients
   product_highlights =  list(product['highlights'])[0].split(", ")
   product_ingredients = str(product['ingredients'])

   # remove the product I am searching for
   df = df[(df['product_id'] != product_id)]

   # perform similarity searches
   highlights_similarity_results = similarity_search_highlights(df, product_highlights)
   ingredients_similarity_results = similarity_search_ingredients(df, product_ingredients)

   # combine similarity searches with reciprocal rank fusion algorithm
   merged_results = reciprocal_rank_fusion(highlights_similarity_results, ingredients_similarity_results)
   
   # Return only the top-n products
   return merged_results[:n]

In [20]:
df = pd.read_csv("processed_data/skincare.csv")
product_id = "P442001"
product_name = df[df['product_id'] == product_id]['product_name'].iloc[0]
merged = get_similar_items(product_id, df)['product_name'].head(5)

print(f"The most similar products to the {product_name} are: ")
print(merged)

The most similar products to the Hyaluronic Facial Cream are: 
0                                  Peptide Moisturizer
1    Do Not Age with Dr. Brandt Moisturizing Neck C...
2              Needles No More Wrinkle Smoothing Cream
3                         Eczema+ Dermatitis Face Balm
4                          Youth Junkie Activated Mask
Name: product_name, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity_score_ingredients'] = similarities
