# Load imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load data

In [2]:
# Read the product table from 'data/skincare.csv'
# (the path is relative to the notebook's working directory).
products = pd.read_csv('data/skincare.csv')
# Show the column labels so later cells can refer to the available fields.
products.columns

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')

# Preprocessing

### Preprocessing Ingredients Attempt #1

In [3]:
# Attempt #1: break a raw ingredients string into a list of comma-separated pieces
def preprocess_1(x):
    """Split a raw ingredients string on commas.

    Every ``.'`` in the input is first replaced by a comma, the result is
    split on ``","``, and the first and last pieces are discarded.  The
    remaining pieces keep whatever whitespace surrounded them in the input.
    """
    pieces = x.replace(".'", ",").split(",")
    return pieces[1:-1]
# Apply attempt #1 to every product's raw ingredients string
ingredients_preprocessed_1 = list(map(preprocess_1, products['ingredients']))

### Preprocessing Ingredients Attempt #2

In [4]:
# Preprocess ingredient lists to string per product
def preprocess_2(x):
    """Return the first single-quoted entry of a stringified ingredient list.

    The raw ``ingredients`` field looks like ``"['Water, Glycerin, ...']"``;
    this extracts the text between the first pair of single quotes.

    Parameters
    ----------
    x : str or missing value
        Raw ``ingredients`` cell.

    Returns
    -------
    str or None
        The first quoted entry, or ``None`` when ``x`` is not a string
        (e.g. NaN) or contains no quoted entry.  Returning ``None`` instead
        of raising lets a later ``notnull()`` filter drop such rows.

    Notes
    -----
    Only the first entry is returned.  For fields holding several entries
    (e.g. gift sets that list multiple products) the remaining entries are
    ignored.
    """
    if not isinstance(x, str):
        return None
    parts = x.split("'")
    # At least one opening and one closing quote are needed for a quoted entry
    if len(parts) < 3:
        return None
    return parts[1]
# Apply attempt #2 to every product's raw ingredients string
ingredients_preprocessed_2 = list(map(preprocess_2, products['ingredients']))

In [5]:
# Sanity check: there is one preprocessed entry per product row.
# (Only the lengths are compared, so this does not validate the parsed content.)
products['ingredients'].size == len(ingredients_preprocessed_2)

True

In [6]:
# Add preprocessed ingredients to dataframe
# (adds or overwrites the 'ingredients_preprocessed' column on `products` in place;
# the list is assigned positionally, one entry per row).
products['ingredients_preprocessed'] = ingredients_preprocessed_2

### TODO: Dealing with gift sets

In [7]:
# Example of the gift-set problem: show the raw ingredients field of one
# product whose secondary category is 'Value & Gift Sets'
products.loc[products['secondary_category'] == 'Value & Gift Sets', 'ingredients'].iloc[55]

"['Rose Deep Hydration Toner:', 'Aqua (Water), Glycerin, Propanediol, 1,2-Hexanediol, Polyglycerin-3, Rosa Centifolia Flower, Rosa Damascena Flower Water, Rosa Damascena Extract, Rosa Multiflora Fruit Extract, Rosa Damascena Flower Extract, Angelica Keiskei Extract, Rosa Damascena Flower Oil, Butylene Glycol, Sodium Citrate, Citric Acid, Xanthan Gum, Sodium Hyaluronate, Caprylyl Glycol, Chlorphenesin, Sodium Benzoate, Potassium Sorbate, Citronellol, Geraniol.', 'Rose Deep Hydration Oil-Infused Serum:', 'Aqua (Water), Rosa Damascena Flower Water, Coco-Caprylate/Caprate, C9-12 Alkane, Propanediol, Ethylhexyl Palmitate, Pentylene Glycol, Glycerin, Squalane, Rosa Damascena Flower Extract, Rosa Damascena Extract, Camelina Sativa Seed Oil, Rosa Rubiginosa Seed Oil, Cucumis Sativus (Cucumber) Fruit Extract, Rosa Damascena Flower Oil, Tocopherol, Sodium Citrate, Sodium Chloride, Ethylhexylglycerin, Xanthan Gum, Citric Acid, Caprylic/Capric Tri-glyceride, Sorbitan Sesquioleate, 1,2- Hexanediol,

In [8]:
# Number of rows that would remain if gift sets were removed
len(products[products['secondary_category'] != 'Value & Gift Sets'])

1762

# Similarity scores using TF-IDF

In [9]:
# Creating a TfidfVectorizer object (all settings left at their defaults)
# NOTE(review): the default tokenisation presumably splits on words, so a
# multi-word ingredient name contributes several tokens — confirm this is intended.
vectorizer = TfidfVectorizer()

### Cosine Similarity

In [10]:
# Keep the name and preprocessed ingredients of every product that has ingredients
product_ingredients = products.loc[
    products["ingredients_preprocessed"].notnull(),
    ["product_name", "ingredients_preprocessed"],
]
# The ingredient strings alone are the documents fed to the vectorizer
ingredients = product_ingredients.ingredients_preprocessed

In [11]:
# Vectorize the product ingredients
# fit_transform fits the vectorizer on `ingredients` and returns the TF-IDF
# matrix for those same documents (one row per entry of `ingredients`).
tfidf_matrix_ingredients = vectorizer.fit_transform(ingredients)

In [12]:
# Pairwise cosine similarity between all products.
# With a single argument the matrix is compared against itself.
cosine_similarity_matrix = cosine_similarity(tfidf_matrix_ingredients)

In [13]:
# Label both axes of the similarity matrix with product names
cosine_similarity_scores = pd.DataFrame(
    data=cosine_similarity_matrix,
    index=product_ingredients["product_name"],
    columns=product_ingredients["product_name"],
)

### Jaccard similarity

In [14]:
# Jaccard similarity between two sets
def jaccard_similarity(set_a, set_b):
    """Return the Jaccard index of ``set_a`` and ``set_b``.

    The index is the size of the intersection divided by the size of the
    union.  When the union is empty the result is ``0.0``.
    """
    shared_count = len(set_a.intersection(set_b))
    total_count = len(set_a.union(set_b))

    # Guard against division by zero when both inputs are empty
    if total_count == 0:
        return 0.0
    return shared_count / total_count

In [15]:
def similarity_search(df, token_list):
    """Rank the products in ``df`` by Jaccard similarity to ``token_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'product_name'`` and
        ``'ingredients_preprocessed'``.  The latter is split on ``", "`` to
        obtain each product's ingredient tokens.
    token_list : iterable of str
        Ingredient tokens of the query product.

    Returns
    -------
    pandas.DataFrame
        Columns ``'product_name'`` and ``'similarity_score'``, one row per row
        of ``df``, sorted by score in descending order with a fresh
        0..n-1 index.
    """
    # Build the query set once; it is the same for every comparison
    token_set = set(token_list)

    # (product name, score) pairs, in the row order of df
    similarity_scores = []
    for title, ingredients in zip(df['product_name'], df['ingredients_preprocessed']):
        # A missing ingredients value (NaN/None) is treated as an empty set,
        # which scores 0.0 instead of failing on .split()
        if isinstance(ingredients, str):
            id_token_set = set(ingredients.split(", "))
        else:
            id_token_set = set()

        similarity_scores.append((title, jaccard_similarity(id_token_set, token_set)))

    similarity_df = pd.DataFrame(similarity_scores, columns=['product_name', 'similarity_score'])

    # Most similar products first
    return similarity_df.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)

### TODO: Minhashing

# Finding similarity scores for a random product

In [16]:
# Pick a random product in the dataset.
# np.random.randint excludes its upper bound, so the row count itself is passed;
# subtracting 1 would make the last row impossible to draw.
rand_int = np.random.randint(0, products.shape[0])
input_product_id = products.product_id.values[rand_int]
# Name of the first row carrying that product id
input_product_name = products.loc[products["product_id"] == input_product_id, "product_name"].tolist()[0]

In [17]:
# Report which product was drawn so the result tables below can be interpreted
print(f"Finding similar products for: \t{input_product_name}")

Finding similar products for: 	Mini The Renewal Oil


##### Cosine similarity

In [18]:
# Look up the input product's similarity to every product in the matrix
similar_products = cosine_similarity_scores.columns
# Rows are selected with a mask over the row index (the matrix is labelled with
# product names on both axes)
cosine_similarity_scores_of_product = cosine_similarity_scores[cosine_similarity_scores.index == input_product_name].values

# Fail with a clear message if the product is not part of the matrix
if len(cosine_similarity_scores_of_product) == 0:
    raise KeyError(f"{input_product_name!r} is not present in the cosine similarity matrix")

# If several rows share this product name, the first one is used
scores_dict = {"Product": similar_products.values, "Score": cosine_similarity_scores_of_product[0]}

# Sorted products according to their similarity score, most similar first
cosine_similarity_scores_sorted = pd.DataFrame(scores_dict).sort_values("Score", ascending=False).reset_index(drop=True)

In [19]:
# Ten most similar products by cosine similarity.
# Position 0 is skipped — presumably the input product matching itself.
cosine_similarity_scores_sorted.iloc[1:11]

Unnamed: 0,Product,Score
1,The Renewal Oil,0.992454
2,The Hydrating Infused Emulsion,0.665959
3,The Revitalizing Hydrating Serum,0.658235
4,The Moisturizing Matte Lotion,0.655423
5,The Moisturizing Cool Gel Cream,0.634187
6,Mini The Moisturizing Cool Gel Cream,0.634187
7,The Concentrate Serum,0.613455
8,Mini The Concentrate Serum,0.613455
9,The Moisturizing Soft Lotion,0.6101
10,The Eye Concentrate Cream,0.609183


##### Jaccard similarity

In [20]:
# Ingredient tokens of the input product (its ingredient string split on ", ")
token_list = products.loc[products['product_id'] == input_product_id, 'ingredients_preprocessed'].iloc[0].split(", ")

# Candidate products: every row except the product being searched for
category_df = products.loc[
    products['product_id'] != input_product_id,
    ['product_id', 'product_name', 'ingredients_preprocessed'],
]

# Rank the candidates by Jaccard similarity to the input product
jaccard_similarity_scores_sorted = similarity_search(category_df, token_list)

In [21]:
# Ten most similar products by Jaccard similarity
# (the input product was already excluded from the candidates)
jaccard_similarity_scores_sorted.head(10)

Unnamed: 0,product_name,similarity_score
0,The Renewal Oil,0.842105
1,The Moisturizing Matte Lotion,0.411765
2,Mini The Moisturizing Cool Gel Cream,0.373626
3,The Revitalizing Hydrating Serum,0.363636
4,The Moisturizing Cool Gel Cream,0.358696
5,The Hydrating Infused Emulsion,0.354167
6,The Moisturizing Soft Lotion,0.351852
7,The Moisturizing Soft Cream Moisturizer,0.344828
8,The Eye Concentrate Cream,0.295082
9,Mini The Treatment Lotion,0.284404


### TODO: Deal with products having multiple sizes