### Import libraries

In [109]:
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
%matplotlib inline

### Load data

In [110]:
# Read the skincare product catalogue from the CSV file into a DataFrame.
product_info = pd.read_csv(filepath_or_buffer="data/skincare.csv")
# Show the column labels that were loaded.
product_info.columns

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')

# Similarity based on ingredients

In [111]:
# Keep only the name and ingredient list of every product whose ingredient
# list is present (rows with a missing `ingredients` value are dropped).
product_ingredients = product_info.loc[
    product_info["ingredients"].notna(),
    ["product_name", "ingredients"],
]

In [112]:
# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the ingredient texts and transform them in one step;
# the result has one row per product in `product_ingredients`.
tfidf_matrix_ingredients = vectorizer.fit_transform(
    product_ingredients.loc[:, "ingredients"]
)

In [113]:
# Cosine similarity of every product's TF-IDF vector against every other one.
# With the second argument omitted, the matrix is compared with itself.
similarity_matrix = cosine_similarity(tfidf_matrix_ingredients)

In [114]:
# Wrap the similarity matrix in a DataFrame whose rows and columns are both
# labelled with the product names, then preview the first rows.
scores_df = pd.DataFrame(
    data=similarity_matrix,
    index=product_ingredients["product_name"],
    columns=product_ingredients["product_name"],
)
scores_df.head()

product_name,GENIUS Sleeping Collagen Moisturizer,GENIUS Liquid Collagen Serum,GENIUS Liquid Collagen Lip Treatment,SUBLIME DEFENSE Ultra Lightweight UV Defense Fluid SPF 50,GENIUS Ultimate Anti-Aging Cream,10 Day Results Kit,GENIUS Ultimate Anti-Aging Vitamin C+ Serum,Algae Niacinamide Moisture Veil,GENIUS Liquid Skin Resurfacing 2% BHA Toner,GENIUS Collagen Calming Relief,...,Superfood Hydrate + Firm Peptide Eye Cream,Retinal + Niacinamide Youth Serum,Kombucha + 10% AHA Liquid Exfoliant with Lactic Acid and Glycolic Acid,The Youth System,Peptides + C Energy Eye Concentrate with Vitamin C and Caffeine,The Youth Vault: 13-Piece Vegan Skincare + Apparel Set,"Youth Stacks: Plump It Up for Dry, Dehydrated Skin",Mini Mandelic Acid + Superfood Unity Exfoliant,Youth Stacks: Daily Skin Health Your Way for Pores and Oiliness,Youth Stacks: Brighter Tomorrow Duo for Dullness
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GENIUS Sleeping Collagen Moisturizer,1.0,0.466956,0.422897,0.187343,0.341831,0.456401,0.306938,0.202731,0.37934,0.39095,...,0.123012,0.127353,0.136582,0.206056,0.14246,0.26282,0.198685,0.123012,0.138364,0.141685
GENIUS Liquid Collagen Serum,0.466956,1.0,0.672641,0.134687,0.326883,0.627961,0.292204,0.151409,0.35295,0.368085,...,0.116179,0.095491,0.116579,0.11319,0.147768,0.172796,0.172835,0.116179,0.088225,0.067439
GENIUS Liquid Collagen Lip Treatment,0.422897,0.672641,1.0,0.120055,0.280782,0.483138,0.193633,0.122308,0.340023,0.34399,...,0.120828,0.092435,0.09676,0.147106,0.099634,0.170983,0.12532,0.120828,0.123875,0.078559
SUBLIME DEFENSE Ultra Lightweight UV Defense Fluid SPF 50,0.187343,0.134687,0.120055,1.0,0.180921,0.244631,0.145302,0.189668,0.175368,0.17756,...,0.152169,0.05779,0.044165,0.138715,0.090978,0.159953,0.073871,0.152169,0.147473,0.069067
GENIUS Ultimate Anti-Aging Cream,0.341831,0.326883,0.280782,0.180921,1.0,0.672539,0.272603,0.158418,0.244975,0.337573,...,0.095871,0.140748,0.104215,0.137768,0.083117,0.192667,0.133209,0.095871,0.101885,0.086994


# Similar products based on ingredient similarity

In [115]:
# Pick a random query product.
#
# The product is drawn from `product_ingredients` rather than `product_info`:
# products whose ingredient list is missing are not part of `scores_df`, so
# choosing one of them would make the score lookup in the next cell fail.
#
# np.random.randint excludes its upper bound, so the row count itself is
# passed; the previous `shape[0] - 1` could never select the last product.
#
# NOTE: `rand_int` is a position within `product_ingredients`, not within
# `product_info`. No seed is set, so the selection differs between runs.
rand_int = np.random.randint(0, product_ingredients.shape[0])

input_product = product_ingredients.product_name.values[rand_int]
input_product

'Face Mask'

In [129]:
# Column labels of the similarity matrix: every product that has a score.
products = scores_df.columns

# Select the similarity row(s) of the query product by matching on the row
# index. The previous mask was built from `scores_df.columns`, which only
# selected the right rows because rows and columns carry identical labels.
scores = scores_df.loc[scores_df.index == input_product].values

# A product without an ingredient list has no row in `scores_df`; report that
# clearly instead of failing with an opaque IndexError on `scores[0]`.
if scores.shape[0] == 0:
    raise KeyError(
        f"{input_product!r} has no ingredient-based similarity scores"
    )

# Several products may share the same name; the first matching row is used.
scores_dict = {"Product": products.values, "Score": scores[0]}

In [131]:
# Two-column frame (Product, Score) built from the dictionary of scores.
df_dict = pd.DataFrame(data=scores_dict)

In [157]:
# Ten products most similar to the query product.
#
# The query product is removed explicitly by name. Slicing off the first
# sorted row is not reliable: a product with identical ingredients also scores
# 1.0, so the query is not guaranteed to sort into position 0.
# A stable sort keeps tied scores in a deterministic order, and the index is
# renumbered from 1 so it reads as a rank.
top_10 = (
    df_dict[df_dict["Product"] != input_product]
    .sort_values("Score", ascending=False, kind="stable")
    .head(10)
    .reset_index(drop=True)
    .rename(index=lambda position: position + 1)
)

In [158]:
# Display the recommendations (columns: Product, Score).
top_10

Unnamed: 0,Product,Score
1,Hyaluronic Serum,0.535594
2,Mini Hyaluronic Serum,0.535594
3,Aloe Vera Gel,0.476313
4,The SturmGlow Kit,0.465394
5,Darker Skin Tones Hyaluronic Serum,0.439171
6,Mini Super Anti-Aging Serum,0.431934
7,Super Anti-Aging Serum,0.431934
8,Anti-Pollution Drops,0.425661
9,Eye Cream,0.417206
10,Super Anti-Aging Night Cream,0.406964
