In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

# Load the data
df = pd.read_csv("datasets/cosmetics.csv")

# Preview five randomly sampled rows (not the first five); the fixed seed
# keeps the preview identical every time the notebook is re-run
display(df.sample(5, random_state=42))

# Inspect the types of products
df['Label'].value_counts()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
593,Treatment,DRUNK ELEPHANT,NightBright™ Duo,28,4.2,"Water, Glycolic Acid, Butylene Glycol, Glyceri...",1,1,1,1,1
742,Treatment,PETER THOMAS ROTH,Un-Wrinkle® Turbo Face Serum,150,3.7,"Water, Glycerin, Rosa Centifolia Flower Water,...",0,0,0,0,0
1260,Eye cream,KIEHL'S SINCE 1851,Powerful-Strength Line-Reducing Eye-Brightenin...,44,3.1,Propylene Glycol Cyclopentasiloxane Ascorbic A...,1,1,1,1,0
488,Cleanser,MURAD,Renewing Cleansing Cream,39,4.5,"Water, Sodium Cocoyl Isethionate, Glyceryl Ste...",0,0,0,0,0
747,Treatment,CLINIQUE,Acne Solutions Clear Skin System Starter Kit,28,4.0,"Water , Glycerin , Butylene Glycol , Sodium Me...",0,0,0,0,0


Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: Label, dtype: int64

In [2]:
# Keep only the rows labelled 'Moisturizer' and, in the same chained step,
# discard the old row labels in favour of a fresh default index
moisturizers = df.loc[df['Label'] == 'Moisturizer'].reset_index(drop=True)

In [3]:
# ingredient_idx maps each distinct ingredient to an integer index, assigned
# in order of first appearance; corpus collects one token list per product
ingredient_idx = {}
corpus = []
idx = 0

# Tokenize every product's ingredient string
for ingredients in moisturizers['Ingredients']:
    # Some ingredient lists in the dataset are written "Water , Glycerin , ..."
    # rather than "Water, Glycerin, ...". Splitting on ', ' alone leaves stray
    # whitespace on those tokens ("water " vs "water"), which would register
    # one ingredient under two different indices, so each token is stripped
    # and empty tokens are dropped. The split still requires a space after the
    # comma, so a name containing a bare comma (e.g. "1,2-hexanediol") stays
    # in one piece.
    tokens = [token.strip() for token in ingredients.lower().split(', ')]
    tokens = [token for token in tokens if token]
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient] = idx
            idx += 1

# Check the result
print("The index for decyl oleate is", ingredient_idx['decyl oleate'])

The index for decyl oleate is 25


In [4]:
# Matrix dimensions: M is the number of moisturizers, N is the number of
# entries in ingredient_idx
M, N = len(moisturizers), len(ingredient_idx)

# Start from an all-zero M x N matrix
A = np.zeros(shape=(M, N))

# Define the oh_encoder function
def oh_encoder(tokens):
    """One-hot encode a sequence of ingredient tokens.

    Each token is looked up in the global ``ingredient_idx`` mapping. The
    result is a length-``N`` NumPy array of zeros with 1 written at every
    looked-up position. A token that is missing from ``ingredient_idx``
    raises ``KeyError``.
    """
    encoded = np.zeros(N)
    # Resolve every position first, then set them all in a single assignment
    positions = np.array([ingredient_idx[token] for token in tokens], dtype=int)
    encoded[positions] = 1
    return encoded

# Make a document-term matrix: row k of A receives the one-hot encoding of the
# k-th token list in corpus. enumerate() supplies the row number, replacing
# the hand-maintained counter.
for row, tokens in enumerate(corpus):
    A[row, :] = oh_encoder(tokens)

display(A.shape)

(298, 2920)

In [5]:
# Reduce the matrix A to two components with t-SNE (seeded via random_state)
model = TSNE(n_components=2, learning_rate=200, random_state=42)
tsne_features = model.fit_transform(A)

# Store the first and second t-SNE components as columns X and Y
moisturizers['X'], moisturizers['Y'] = tsne_features[:, 0], tsne_features[:, 1]



In [7]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()

# Make a source and a scatter plot
source = ColumnDataSource(data=moisturizers)
plot = figure(x_axis_label = 'T-SNE 1', 
              y_axis_label = 'T-SNE 2', 
              width = 500, height = 400)
# scatter() draws circle markers by default; it replaces circle(size=...),
# which Bokeh 3.4+ deprecates in favour of scatter()
plot.scatter(x = 'X', 
    y = 'Y', 
    source = source, 
    size = 10, color = '#FF7373', alpha = .8)

# Create a HoverTool object: each tooltip row pairs a label with a column of
# the source ('@Name' reads the Name column, and so on)
hover = HoverTool(tooltips = [('Item','@Name'),
                              ('Brand','@Brand'),
                              ('Price','$@Price'),
                              ('Rank','@Rank')])
plot.add_tools(hover)

# Plot the map
show(plot)