In [5]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE 
from scipy.spatial.distance import cdist

# Load the data
df = pd.read_csv(r"../data/cosmetic_preprocess.csv");

# Check the first five rows 
display(df.head())

# Inspect the types of products
display(df['Label'].value_counts())

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive
0,moisturizer,The Dewy Skin Cream Plumping & Hydrating Moist...,Tatcha,69,4.0,"Aqua/Water/Eau, Saccharomyces/Camellia Sinensi...",0,1,0,1,0,0
1,moisturizer,Revealer Skin-Improving Foundation SPF25 with ...,Kosas,42,4.0,"Zinc Oxide (7.5%), Water, Ethylhexyl Olivate, ...",1,1,0,1,1,0
2,moisturizer,Plum Plump Hyaluronic Acid Moisturizer,Glow Recipe,39,4.5,"Water/Aqua/Eau, Propanediol, Glycerin, C13-15 ...",1,1,0,1,1,0
3,moisturizer,The Water Cream Oil-Free Pore Minimizing Moist...,Tatcha,69,4.0,"Water, Saccharomyces/Camellia Sinensis Leaf/Cl...",1,1,0,1,1,0
4,moisturizer,Barrier+ Triple Lipid-Peptide Face Cream,Skinfix,52,4.5,"Water/Eau/Aqua, Caprylic/Capric Triglyceride, ...",0,1,0,1,1,0


moisturizer       546
face_treatment    440
cleanser          336
face_mask         176
eye_treatment     175
sunscreen         143
Name: Label, dtype: int64

In [7]:
df.shape

(1816, 12)

In [10]:
# Filter for moisturizers
moisturizers = df[df['Label'] == 'moisturizer']

# Filter for Cleanser
cleansers = df[df['Label'] == 'cleanser']

# Filter for dry skin as well
moisturizers_subset = moisturizers[moisturizers['Oil'] == 1]

# Reset index
moisturizers_subset = moisturizers_subset.reset_index(drop=True)

In [11]:
cleansers.head()

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive
546,cleanser,Superfood Antioxidant Cleanser,Youth To The People,36,4.0,"Water/Aqua/Eau, Cocamidopropyl Hydroxysultaine...",1,1,0,1,1,0
547,cleanser,Green Clean Makeup Removing Cleansing Balm,Farmacy,34,4.5,"Cetyl Ethylhexanoate, Caprylic/Capric Triglyce...",1,1,0,1,1,0
548,cleanser,The Rice Wash Skin-Softening Cleanser,Tatcha,36,4.5,"Aqua/Water/Eau, Microcrystalline Cellulose, Pr...",1,1,0,1,0,0
549,cleanser,The Deep Cleanse Gentle Exfoliating Cleanser,Tatcha,39,4.0,"Water, Sodium Cocoyl Glutamate, Propanediol, G...",1,1,0,1,1,0
550,cleanser,Deep Sweep 2% BHA Pore Cleaning Toner with Mor...,Farmacy,28,4.5,"Water/Aqua/Eau, Arginine, Salicylic Acid, Lact...",1,0,0,0,1,0


In [12]:
moisturizers.head()

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive
0,moisturizer,The Dewy Skin Cream Plumping & Hydrating Moist...,Tatcha,69,4.0,"Aqua/Water/Eau, Saccharomyces/Camellia Sinensi...",0,1,0,1,0,0
1,moisturizer,Revealer Skin-Improving Foundation SPF25 with ...,Kosas,42,4.0,"Zinc Oxide (7.5%), Water, Ethylhexyl Olivate, ...",1,1,0,1,1,0
2,moisturizer,Plum Plump Hyaluronic Acid Moisturizer,Glow Recipe,39,4.5,"Water/Aqua/Eau, Propanediol, Glycerin, C13-15 ...",1,1,0,1,1,0
3,moisturizer,The Water Cream Oil-Free Pore Minimizing Moist...,Tatcha,69,4.0,"Water, Saccharomyces/Camellia Sinensis Leaf/Cl...",1,1,0,1,1,0
4,moisturizer,Barrier+ Triple Lipid-Peptide Face Cream,Skinfix,52,4.5,"Water/Eau/Aqua, Caprylic/Capric Triglyceride, ...",0,1,0,1,1,0


# Tokenizing the ingredients

To get to our end goal of comparing ingredients in each product, we first need to do some preprocessing tasks and bookkeeping of the actual words in each product's ingredients list. The first step will be tokenizing the list of ingredients in Ingredients column. After splitting them into tokens, we'll make a binary bag of words. Then we will create a dictionary with the tokens, ingredient_idx, which will have the following format:

{ "ingredient": index value, … }

In [15]:
# Initialize dictionary, list, and initial index
ingredient_idx = {}
corpus = []
idx = 0

# For loop for tokenization
for i in range(len(moisturizers_subset)):    
    ingredients = moisturizers_subset['ingredients'][i]
    ingredients_lower = ingredients.lower()
    tokens = ingredients_lower.split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient] = idx
            idx += 1
            
# Check the result 
print("The index for decyl oleate is", ingredient_idx['decyl oleate'])

The index for decyl oleate is 634


from IPython.display import Image
from PIL import Image


path="./data/image_1.jpg"
display(Image.open(path))

# Initializing a document-term matrix (DTM)

The next step is making a document-term matrix (DTM). Here each cosmetic product will correspond to a document, and each chemical composition will correspond to a term. This means we can think of the matrix as a “cosmetic-ingredient” matrix.

To create this matrix, we'll first make an empty matrix filled with zeros. The length of the matrix is the total number of cosmetic products in the data. The width of the matrix is the total number of ingredients. After initializing this empty matrix, we'll fill it in the following tasks.

In [17]:
# Get the number of items and tokens 
M = len(moisturizers_subset)
N = len(ingredient_idx)

# Initialize a matrix of zeros
A = np.zeros((M,N))

In [18]:
A.shape

(452, 3155)

# Creating a counter function

Before we can fill the matrix, let's create a function to count the tokens (i.e., an ingredients list) for each row. Our end goal is to fill the matrix with 1 or 0: if an ingredient is in a cosmetic, the value is 1. If not, it remains 0. The name of this function, oh_encoder, will become clear next.

In [19]:
# Define the oh_encoder function
def oh_encoder(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        # Get the index for each ingredient
        idx = ingredient_idx[ingredient]
        # Put 1 at the corresponding indices
        x[idx] = 1
    return x

# The Cosmetic-Ingredient matrix!

In [21]:
# Make a document-term matrix
i = 0
for tokens in corpus:
    A[i, :] = oh_encoder(tokens)
    i +=1

In [22]:
A.shape

(452, 3155)

In [23]:
A

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

# Dimension reduction with t-SNE

In [24]:
# Dimension reduction with t-SNE
model = TSNE(n_components = 2, learning_rate = 200, random_state = 42)
tsne_features = model.fit_transform(A)

# Make X, Y columns 
moisturizers_subset['X'] = tsne_features[:, 0]
moisturizers_subset['Y'] = tsne_features[:, 1]

In [25]:
moisturizers_subset.head()

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive,X,Y
0,moisturizer,Revealer Skin-Improving Foundation SPF25 with ...,Kosas,42,4.0,"Zinc Oxide (7.5%), Water, Ethylhexyl Olivate, ...",1,1,0,1,1,0,39.946255,15.687675
1,moisturizer,Plum Plump Hyaluronic Acid Moisturizer,Glow Recipe,39,4.5,"Water/Aqua/Eau, Propanediol, Glycerin, C13-15 ...",1,1,0,1,1,0,-10.354266,-33.974831
2,moisturizer,The Water Cream Oil-Free Pore Minimizing Moist...,Tatcha,69,4.0,"Water, Saccharomyces/Camellia Sinensis Leaf/Cl...",1,1,0,1,1,0,-25.721682,-17.232061
3,moisturizer,Barrier+ Triple Lipid-Peptide Face Cream,Skinfix,52,4.5,"Water/Eau/Aqua, Caprylic/Capric Triglyceride, ...",0,1,0,1,1,0,33.086311,-54.393497
4,moisturizer,Vitamin Enriched Face Base Priming Moisturizer,Bobbi Brown,64,4.5,"Water, Cyclopentasiloxane, Bis-Diglyceryl Poly...",1,0,0,1,1,0,-21.984222,29.813787


# Find similar products

In [28]:
target = moisturizers_subset[moisturizers_subset['Name'] == 'Plum Plump Hyaluronic Acid Moisturizer']
target

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive,X,Y
1,moisturizer,Plum Plump Hyaluronic Acid Moisturizer,Glow Recipe,39,4.5,"Water/Aqua/Eau, Propanediol, Glycerin, C13-15 ...",1,1,0,1,1,0,-10.354266,-33.974831


In [29]:
x  = target['X'].values[0]
y = target['Y'].values[0]
print(x, y)

-10.354266 -33.97483


In [30]:
df1 = pd.DataFrame()
df1['point'] = [(x, y) for x,y in zip(moisturizers_subset['X'], moisturizers_subset['Y'])]

df1.head()

Unnamed: 0,point
0,"(39.94625473022461, 15.687675476074219)"
1,"(-10.354266166687012, -33.974830627441406)"
2,"(-25.721681594848633, -17.2320613861084)"
3,"(33.08631134033203, -54.393497467041016)"
4,"(-21.984222412109375, 29.81378746032715)"


In [31]:
point = np.array([[x,y]])

point.shape

(1, 2)

In [32]:
cdist(point, np.array([[-9.456123352050781, 16.348276138305664]]), metric='euclidean')

array([[50.33112094]])

In [33]:
moisturizers_subset['distance'] = [cdist(np.array([[x,y]]), np.array([product]), metric='euclidean') for product in df1['point']]

In [34]:
moisturizers_subset.head()

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive,X,Y,distance
0,moisturizer,Revealer Skin-Improving Foundation SPF25 with ...,Kosas,42,4.0,"Zinc Oxide (7.5%), Water, Ethylhexyl Olivate, ...",1,1,0,1,1,0,39.946255,15.687675,[[70.68597396218269]]
1,moisturizer,Plum Plump Hyaluronic Acid Moisturizer,Glow Recipe,39,4.5,"Water/Aqua/Eau, Propanediol, Glycerin, C13-15 ...",1,1,0,1,1,0,-10.354266,-33.974831,[[0.0]]
2,moisturizer,The Water Cream Oil-Free Pore Minimizing Moist...,Tatcha,69,4.0,"Water, Saccharomyces/Camellia Sinensis Leaf/Cl...",1,1,0,1,1,0,-25.721682,-17.232061,[[22.72614746960484]]
3,moisturizer,Barrier+ Triple Lipid-Peptide Face Cream,Skinfix,52,4.5,"Water/Eau/Aqua, Caprylic/Capric Triglyceride, ...",0,1,0,1,1,0,33.086311,-54.393497,[[48.00005968381595]]
4,moisturizer,Vitamin Enriched Face Base Priming Moisturizer,Bobbi Brown,64,4.5,"Water, Cyclopentasiloxane, Bis-Diglyceryl Poly...",1,0,0,1,1,0,-21.984222,29.813787,[[64.84013941855487]]


In [35]:
# arrange by descending order
top_matches = moisturizers_subset.sort_values(by=['distance'])
top_matches.head(5)

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive,X,Y,distance
1,moisturizer,Plum Plump Hyaluronic Acid Moisturizer,Glow Recipe,39,4.5,"Water/Aqua/Eau, Propanediol, Glycerin, C13-15 ...",1,1,0,1,1,0,-10.354266,-33.974831,[[0.0]]
20,moisturizer,10% Niacinamide Night Mask,Farmacy,42,4.5,"Water/Aqua/Eau, Propanediol, Niacinamide, C15-...",1,1,0,1,1,0,-6.816634,-33.378933,[[3.5874695118612188]]
187,moisturizer,Dry Erase® Ultra-Calming Face Cream,Jack Black,38,4.5,"Water, Aloe Barbadensis Leaf Juice, Caprylic/C...",1,1,0,1,1,0,-9.108423,-38.425537,[[4.621786754455821]]
134,moisturizer,SUBLIME DEFENSE Ultra Lightweight UV Defense F...,Algenist,28,4.5,"Octinoxate 7.5%, Titanium Dioxide 2%, Zinc Oxi...",1,1,0,1,1,0,-9.654286,-29.272757,[[4.753890204404537]]
437,moisturizer,The Ultimate Hydrating Vitamin C Facial Moistu...,BeautyBio,75,4.0,"Water, Carthamus Tinctorius (Safflower) Seed O...",1,1,0,1,1,0,-15.132396,-31.816231,[[5.2430978999332565]]


In [37]:
top_matches = top_matches[['Label','Name','brand', 'price', 'ingredients', 'distance']]
top_matches = top_matches.reset_index()
top_matches = top_matches.drop(top_matches.index[0])
top_matches.head()

Unnamed: 0,index,Label,Name,brand,price,ingredients,distance
1,20,moisturizer,10% Niacinamide Night Mask,Farmacy,42,"Water/Aqua/Eau, Propanediol, Niacinamide, C15-...",[[3.5874695118612188]]
2,187,moisturizer,Dry Erase® Ultra-Calming Face Cream,Jack Black,38,"Water, Aloe Barbadensis Leaf Juice, Caprylic/C...",[[4.621786754455821]]
3,134,moisturizer,SUBLIME DEFENSE Ultra Lightweight UV Defense F...,Algenist,28,"Octinoxate 7.5%, Titanium Dioxide 2%, Zinc Oxi...",[[4.753890204404537]]
4,437,moisturizer,The Ultimate Hydrating Vitamin C Facial Moistu...,BeautyBio,75,"Water, Carthamus Tinctorius (Safflower) Seed O...",[[5.2430978999332565]]
5,204,moisturizer,Acne-Clear Oil-Free Matte Moisturizer,Peter Thomas Roth,38,"Water/Aqua/Eau, Dimethicone, Propanediol, Capr...",[[6.58644607593422]]


In [38]:
cosmetic_1 = moisturizers_subset[moisturizers_subset['Name'] == "Acne-Clear Oil-Free Matte Moisturizer"]
cosmetic_2 = moisturizers_subset[moisturizers_subset['Name'] == "Moisture Surge Hydrating Supercharged Concentrate"]

In [40]:
display(cosmetic_1)
print(cosmetic_1.ingredients.values)
display(cosmetic_2)
print(cosmetic_2.ingredients.values)

Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive,X,Y,distance
204,moisturizer,Acne-Clear Oil-Free Matte Moisturizer,Peter Thomas Roth,38,4.0,"Water/Aqua/Eau, Dimethicone, Propanediol, Capr...",1,1,0,1,1,0,-16.54001,-36.237095,[[6.58644607593422]]


['Water/Aqua/Eau, Dimethicone, Propanediol, Caprylic/Capric Triglyceride, Glycerin, Jojoba Esters, Isodecyl Neopentanoate, Cetearyl Alcohol, Panthenol, Polysilicone-11, Cordyceps Sinensis Extract, Trametes Versicolor (Mushroom) Extract, Helianthus Annuus (Sunflower) Seed Wax, Sodium Hydroxide, Hydroxyethyl Acrylate/Sodium Acryloyl Dimethyl Taurate Copolymer, Polyacrylate Crosspolymer-6, Ceteareth-20, Isohexadecane, Tocopheryl Acetate, Hydrogenated Lecithin, Aloe Barbadensis Leaf Juice, Butylene Glycol, Allantoin, Disodium EDTA, Sodium Hyaluronate, Acacia Decurrens Flower Wax, Polyglycerin-3, Phenoxyethanol, Polysorbate 60, Sodium PCA, Sodium Citrate, Sorbitan Isostearate, Maclura Cochinchinensis Leaf Prenylflavonoids, t-Butyl Alcohol, Glycosaminoglycans, Potassium Sorbate, Sodium Benzoate, Ethylhexylglycerin, Citric Acid, Trisodium EDTA.']


Unnamed: 0,Label,Name,brand,price,rank,ingredients,Combination,Dry,Mature,Normal,Oil,Sensitive,X,Y,distance
228,moisturizer,Moisture Surge Hydrating Supercharged Concentrate,CLINIQUE,41,4.0,"Water, Glycerin, Butylene Glycol, Phenyl Trime...",1,1,0,1,1,0,-5.480185,38.250057,[[72.38916418601865]]


['Water, Glycerin, Butylene Glycol, Phenyl Trimethicone, Propanediol, Sucrose, Hydroxyethyl Urea, Alteromonas Ferment Extract, Hydrolyzed Rice Extract, Acetyl Glucosamine, Trehalose, Algae Extract, Caffeine, Cholesterol, Aloe Barbadensis Leaf Water, Sodium Hyaluronate, Pentaerythrityl Tetraethylhexanoate, Dextrin Palmitate, Acrylates/C10-30 Alkyl Acrylate Crosspolymer, Sorbitol, Sodium Polyaspartate, Tocopheryl Acetate, Carbomer, Dehydroxanthan Gum, Citric Acid, Tetrahexyldecyl Ascorbate, Benzophenone-4, Pentaerythrityl Tetra-Di-T-Butyl Hydroxyhydrocinnamate, Sodium Hydroxide, Sodium Citrate, Disodium Edta, Bht, Chlorphenesin, Phenoxyethanol, Red 4 (Ci 14700), Yellow 5 (Ci 19140).']


In [41]:
c1 = cosmetic_1.ingredients.values
c2 = cosmetic_2.ingredients.values

In [42]:
# make list of ingredients
c1_list = c1[0].split(",")
c2_list = c2[0].split(",")

# strip spaces
c1_list = [x.strip(' ') for x in c1_list]
c2_list = [x.strip(' ') for x in c2_list]

In [43]:
# turn lists into sets for comparison
c1_set = set(c1_list)
c2_set = set(c2_list)

In [44]:
# get same ingredients
same_ingredients = c2_set.intersection(c1_set)
print(same_ingredients)

{'Sodium Hydroxide', 'Glycerin', 'Sodium Citrate', 'Propanediol', 'Phenoxyethanol', 'Sodium Hyaluronate', 'Tocopheryl Acetate', 'Citric Acid', 'Butylene Glycol'}


In [46]:
top_matches['Ingredients in common'] = [c1_set.intersection( set([x.strip(' ')for x in product.split(",")]) ) for product in top_matches['ngredients']]
top_matches.head(5)

Unnamed: 0,index,Label,Name,brand,price,ingredients,distance,Ingredients in common
1,20,moisturizer,10% Niacinamide Night Mask,Farmacy,42,"Water/Aqua/Eau, Propanediol, Niacinamide, C15-...",[[3.5874695118612188]],"{Water/Aqua/Eau, Glycerin, Propanediol, Cetear..."
2,187,moisturizer,Dry Erase® Ultra-Calming Face Cream,Jack Black,38,"Water, Aloe Barbadensis Leaf Juice, Caprylic/C...",[[4.621786754455821]],"{Glycerin, Ethylhexylglycerin, Cetearyl Alcoho..."
3,134,moisturizer,SUBLIME DEFENSE Ultra Lightweight UV Defense F...,Algenist,28,"Octinoxate 7.5%, Titanium Dioxide 2%, Zinc Oxi...",[[4.753890204404537]],"{Phenoxyethanol, Ethylhexylglycerin, Glycerin,..."
4,437,moisturizer,The Ultimate Hydrating Vitamin C Facial Moistu...,BeautyBio,75,"Water, Carthamus Tinctorius (Safflower) Seed O...",[[5.2430978999332565]],"{Glycerin, Polysorbate 60, Potassium Sorbate, ..."
5,204,moisturizer,Acne-Clear Oil-Free Matte Moisturizer,Peter Thomas Roth,38,"Water/Aqua/Eau, Dimethicone, Propanediol, Capr...",[[6.58644607593422]],"{Glycosaminoglycans, Sodium Benzoate, Water/Aq..."
