In [37]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

# Read the cosmetics catalogue into a DataFrame
df = pd.read_csv(filepath_or_buffer="datasets/cosmetics.csv")

# Preview five randomly sampled rows (a random sample, not the head of the table)
display(df.sample(n=5))

# Count how many products carry each label
df.loc[:, 'Label'].value_counts()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
1414,Sun protect,ESTÉE LAUDER,DayWear Multi-Protection Anti-Oxidant Sheer Ti...,54,4.6,Daywear Plus Mlti/Prt Moist Spf 15 Division: E...,0,0,0,0,0
335,Cleanser,TATA HARPER,Regenerating Cleanser,42,4.2,*Ingredients from organic farming. **Clinical ...,1,1,1,1,0
647,Treatment,PETER THOMAS ROTH,Un-Wrinkle Peel Pads,45,4.6,"Water, Ethoxydiglycol, Glycolic Acid, Rosa Cen...",0,0,0,0,0
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
978,Face Mask,PERRICONE MD,Cocoa Moisture Mask,69,4.3,"Water, Cetearyl Alcohol, Caprylic/Capric Trigl...",1,1,1,0,1


Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: Label, dtype: int64

In [38]:
# Keep only the 'Sun protect' products and renumber the rows from 0,
# discarding the old index rather than keeping it as a column
sunscreens = (
    df.loc[df['Label'].eq('Sun protect')]
      .reset_index(drop=True)
)

In [39]:
# Containers for the vocabulary and the tokenized products
ingredient_idx = {}   # ingredient token -> integer index (order of first appearance)
corpus = []           # one list of tokens per sunscreen
idx = 0               # next unused index

# Tokenize every ingredient list and grow the vocabulary as new tokens appear
for raw_ingredients in sunscreens['Ingredients']:
    # Lower-case, then split on the literal separator ', '
    tokens = raw_ingredients.lower().split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient in ingredient_idx:
            continue  # already has an index
        ingredient_idx[ingredient] = idx
        idx += 1

In [40]:
# Matrix dimensions: M rows (one per sunscreen), N columns (one per ingredient token)
M, N = len(sunscreens), len(ingredient_idx)

# Start from an all-zero M x N matrix
A = np.zeros(shape=(M, N))

# Define the oh_encoder function
def oh_encoder(tokens):
    """Return a length-N vector with 1 at the index of every token in `tokens`.

    Indices are looked up in the module-level `ingredient_idx` mapping, so a
    token missing from that mapping raises KeyError. All other entries are 0.
    """
    x = np.zeros(N)
    # Resolve every token to its index up front ...
    positions = np.fromiter(
        (ingredient_idx[token] for token in tokens),
        dtype=np.intp,
    )
    # ... then set them all at once; repeated positions simply stay at 1
    x[positions] = 1
    return x

# Fill the document-term matrix: row i receives the encoding of the i-th token list
for i, tokens in enumerate(corpus):
    A[i] = oh_encoder(tokens)

In [41]:
# Show the dimensions of the matrix A as a (rows, columns) tuple
display(A.shape)

(170, 1646)

In [42]:
# Project the matrix A down to two dimensions with t-SNE
# (random_state is fixed so reruns use the same seed)
model = TSNE(
    n_components=2,
    learning_rate=200,
    random_state=42,
)
tsne_features = model.fit_transform(A)

# Store the two embedding coordinates as the X and Y columns
sunscreens['X'], sunscreens['Y'] = tsne_features.T



In [43]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
# Render Bokeh output inline in the notebook
output_notebook()

# Wrap the sunscreen table (including its X/Y columns) in a Bokeh data source
source = ColumnDataSource(data=sunscreens)

# Set up the figure
plot = figure(x_axis_label='T-SNE 1',
              y_axis_label='T-SNE 2',
              width=500, height=400)

# Draw one marker per product. scatter() replaces circle() here because
# passing `size` to circle() is deprecated in recent Bokeh releases;
# scatter() draws the same circle marker by default.
plot.scatter(x='X',
             y='Y',
             source=source,
             size=10, color='#F45F0F', alpha=.8)

# Create a HoverTool object; each '@column' is filled from the data source
hover = HoverTool(tooltips=[('Item', '@Name'),
                            ('Brand', '@Brand'),
                            ('Price', '$@Price'),
                            ('Rank', '@Rank')])
plot.add_tools(hover)

# Plot the map
show(plot)