In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

# Load the data
df = pd.read_csv("cosmetics.csv")

# Check the first five rows 
display(df.sample(5))

# Inspect the types of products
df.Label.value_counts()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
1378,Sun protect,LANCER,Sheer Fluid Sun Shield Broad Spectrum SPF 30 S...,55,4.8,"Cyclopentasiloxane, Water, Alcohol, Denat., Gl...",1,1,1,1,1
960,Face Mask,DR. BRANDT SKINCARE,Oxygen Facial Flash Recovery Mask,70,4.5,"Water, Glycerin, Kaolin, Methyl Perfluorobutyl...",1,1,1,1,0
94,Moisturizer,LANEIGE,Water Bank Essence,36,4.7,"Water, Glycerin, Butylene Glycol, Glycereth-26...",1,1,1,1,1
1084,Face Mask,NATURALLY SERIOUS,Mask-Imum Revival Hydra-Plumping Mask,42,4.5,"Water, Kaolin (absorbent - clay), C9-12 Alkane...",1,1,1,1,1
1222,Eye cream,ANTHONY,High Performance Continuous Moisture Eye Cream,38,4.3,"Water, Glycerin, Caprylic/Capric Triglyceride,...",0,0,0,0,0


Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: Label, dtype: int64

In [3]:
# Filter for moisturizers
moisturizers = df[df['Label'] == "Moisturizer"]

# Filter for dry skin as well
moisturizers_dry = moisturizers[moisturizers["Dry"] == 1]

# Reset index
moisturizers_dry = moisturizers_dry.reset_index(drop=True)

In [4]:
# Initialize dictionary, list, and initial index
ingredient_idx = {}
corpus = []
idx = 0

# For loop for tokenization
for i in range(len(moisturizers_dry)):    
    ingredients = moisturizers_dry['Ingredients'][i]
    ingredients_lower = ingredients.lower()
    tokens = ingredients_lower.split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient ] = idx
            idx += 1
            
# Check the result 
print("The index for decyl oleate is", ingredient_idx['decyl oleate'])

The index for decyl oleate is 25


In [5]:
print(ingredient_idx)

{'algae (seaweed) extract': 0, 'mineral oil': 1, 'petrolatum': 2, 'glycerin': 3, 'isohexadecane': 4, 'microcrystalline wax': 5, 'lanolin alcohol': 6, 'citrus aurantifolia (lime) extract': 7, 'sesamum indicum (sesame) seed oil': 8, 'eucalyptus globulus (eucalyptus) leaf oil': 9, 'sesamum indicum (sesame) seed powder': 10, 'medicago sativa (alfalfa) seed powder': 11, 'helianthus annuus (sunflower) seedcake': 12, 'prunus amygdalus dulcis (sweet almond) seed meal': 13, 'sodium gluconate': 14, 'copper gluconate': 15, 'calcium gluconate': 16, 'magnesium gluconate': 17, 'zinc gluconate': 18, 'magnesium sulfate': 19, 'paraffin': 20, 'tocopheryl succinate': 21, 'niacin': 22, 'water': 23, 'beta-carotene': 24, 'decyl oleate': 25, 'aluminum distearate': 26, 'octyldodecanol': 27, 'citric acid': 28, 'cyanocobalamin': 29, 'magnesium stearate': 30, 'panthenol': 31, 'limonene': 32, 'geraniol': 33, 'linalool': 34, 'hydroxycitronellal': 35, 'citronellol': 36, 'benzyl salicylate': 37, 'citral': 38, 'sodiu

In [6]:
# Get the number of items and tokens 
M = moisturizers_dry.shape[0]
N = len(ingredient_idx)

# Initialize a matrix of zeros
A = np.zeros((M, N))

In [7]:
A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
 #Define the oh_encoder function
def oh_encoder(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        # Get the index for each ingredient
        idx = ingredient_idx[ingredient]
        # Put 1 at the corresponding indices
        x[idx] = 1
    return x

In [15]:
x.values()

NameError: name 'x' is not defined

In [16]:
# Dimension reduction with t-SNE
model = TSNE(n_components=2, learning_rate=200, random_state=42)
tsne_features = model.fit_transform(A)

# Make X, Y columns 
moisturizers_dry['X'] = tsne_features[:, 0]
moisturizers_dry['Y'] = tsne_features[:, 1]

In [17]:
print(moisturizers_dry['X'])

0     -0.858072
1     -0.857580
2     -0.858882
3     -0.858637
4     -0.858076
5     -0.858637
6     -0.858637
7     -0.858072
8     -0.857760
9     -0.858637
10    -0.858637
11    -0.858637
12    -0.858072
13    -0.858637
14    -0.858852
15    -0.857969
16    -0.858072
17    -0.858687
18    -0.858687
19    -0.858072
20    -0.858637
21    -0.858765
22    -0.858637
23    -0.857973
24    -0.858156
25    -0.858765
26    -0.858072
27    -0.857580
28    -0.857697
29    -0.858652
         ...   
160   -0.463725
161   -1.158991
162   -1.929271
163    0.271853
164    0.004437
165   -2.030287
166   -0.562374
167   -0.508279
168   -1.158990
169   -1.172698
170   -1.719077
171   -0.592200
172   -0.225108
173   -0.489116
174   -2.030360
175    0.348336
176   -1.536973
177   -1.943863
178   -0.524532
179   -1.537002
180   -0.264419
181   -0.556899
182   -2.110637
183    0.312338
184    0.349914
185   -1.152860
186   -1.847097
187   -2.102032
188    0.389818
189   -0.858637
Name: X, Length: 190, dt

In [19]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()

# Make a source and a scatter plot  
source = ColumnDataSource(moisturizers_dry)
plot = figure(x_axis_label = 'T-SNE 1', 
              y_axis_label = 'T-SNE 2', 
              width = 500, height = 400)
plot.circle(x = 'X', 
    y = 'Y', 
    source = source, 
    size = 10, color = '#FF7373', alpha = .8)

In [20]:
# Create a HoverTool object
hover = HoverTool(tooltips = [('Item', '@Name'),
                              ('Brand', '@Brand'),
                              ('Price', '$@Price'),
                              ('Rank', '@Rank')])
plot.add_tools(hover)

In [21]:
show(plot)

In [22]:
# Print the ingredients of two similar cosmetics
cosmetic_1 = moisturizers_dry[moisturizers_dry['Name'] == "Color Control Cushion Compact Broad Spectrum SPF 50+"]
cosmetic_2 = moisturizers_dry[moisturizers_dry['Name'] == "BB Cushion Hydra Radiance SPF 50"]

# Display each item's data and ingredients
display(cosmetic_1)
print(cosmetic_1.Ingredients.values)
display(cosmetic_2)
print(cosmetic_2.Ingredients.values)

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,X,Y
45,Moisturizer,AMOREPACIFIC,Color Control Cushion Compact Broad Spectrum S...,60,4.0,"Phyllostachis Bambusoides Juice, Cyclopentasil...",1,1,1,1,1,-0.858637,1.306772


['Phyllostachis Bambusoides Juice, Cyclopentasiloxane, Cyclohexasiloxane, Peg-10 Dimethicone, Phenyl Trimethicone, Butylene Glycol, Butylene Glycol Dicaprylate/Dicaprate, Alcohol, Arbutin, Lauryl Peg-9 Polydimethylsiloxyethyl Dimethicone, Acrylates/Ethylhexyl Acrylate/Dimethicone Methacrylate Copolymer, Polyhydroxystearic Acid, Sodium Chloride, Polymethyl Methacrylate, Aluminium Hydroxide, Stearic Acid, Disteardimonium Hectorite, Triethoxycaprylylsilane, Ethylhexyl Palmitate, Lecithin, Isostearic Acid, Isopropyl Palmitate, Phenoxyethanol, Polyglyceryl-3 Polyricinoleate, Acrylates/Stearyl Acrylate/Dimethicone Methacrylate Copolymer, Dimethicone, Disodium Edta, Trimethylsiloxysilicate, Ethylhexyglycerin, Dimethicone/Vinyl Dimethicone Crosspolymer, Water, Silica, Camellia Japonica Seed Oil, Camillia Sinensis Leaf Extract, Caprylyl Glycol, 1,2-Hexanediol, Fragrance, Titanium Dioxide, Iron Oxides (Ci 77492, Ci 77491, Ci77499).']


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,X,Y
55,Moisturizer,LANEIGE,BB Cushion Hydra Radiance SPF 50,38,4.3,"Water, Cyclopentasiloxane, Zinc Oxide (CI 7794...",1,1,1,1,1,-0.857786,1.306316


['Water, Cyclopentasiloxane, Zinc Oxide (CI 77947), Ethylhexyl Methoxycinnamate, PEG-10 Dimethicone, Cyclohexasiloxane, Phenyl Trimethicone, Iron Oxides (CI 77492), Butylene Glycol Dicaprylate/Dicaprate, Niacinamide, Lauryl PEG-9 Polydimethylsiloxyethyl Dimethicone, Acrylates/Ethylhexyl Acrylate/Dimethicone Methacrylate Copolymer, Titanium Dioxide (CI 77891 , Iron Oxides (CI 77491), Butylene Glycol, Sodium Chloride, Iron Oxides (CI 77499), Aluminum Hydroxide, HDI/Trimethylol Hexyllactone Crosspolymer, Stearic Acid, Methyl Methacrylate Crosspolymer, Triethoxycaprylylsilane, Phenoxyethanol, Fragrance, Disteardimonium Hectorite, Caprylyl Glycol, Yeast Extract, Acrylates/Stearyl Acrylate/Dimethicone Methacrylate Copolymer, Dimethicone, Trimethylsiloxysilicate, Polysorbate 80, Disodium EDTA, Hydrogenated Lecithin, Dimethicone/Vinyl Dimethicone Crosspolymer, Mica (CI 77019), Silica, 1,2-Hexanediol, Polypropylsilsesquioxane, Chenopodium Quinoa Seed Extract, Magnesium Sulfate, Calcium Chloride