In [13]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE


In [14]:
# Task 1: Understanding the Data

# We begin by loading the dataset and understanding its structure.  
# The main goal is to recommend products based on their **ingredients** — especially for people with dry or sensitive skin.

# We check how many products are in each category (like moisturizer, cleanser, etc.) by using `value_counts()` on the "Label" column.  
# This gives us a good idea of what kind of products are most common in the dataset.


In [15]:
# Environment sanity check: show where the kernel is running and what that
# directory contains, so data file locations can be verified by eye.
import os
print(os.getcwd())   # current working directory of the kernel
print(os.listdir())  # names of the entries in that directory




e:\DA\jup\.venv
['.gitignore', 'Include', 'Lib', 'pip.pyz', 'pyvenv.cfg', 'Scripts', 'share', 'task1.ipynb']


In [16]:
%pip install pandas



df = pd.read_csv("e:\DA\iship\cosmetics.csv")
df.sample(5)  # Shows a random 5-row sample


  df = pd.read_csv("e:\DA\iship\cosmetics.csv")


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
1073,Face Mask,CLARINS,SOS Hydra Refreshing Hydration Mask,34,3.8,"Water, Glycerin, Butylene Glycol, Stearic Acid...",1,1,1,0,0
746,Treatment,CLINIQUE,Repairwear Laser Focus,50,4.1,"Water , Dimethicone , Butylene Glycol , Methyl...",0,0,0,0,0
392,Cleanser,CLINIQUE,Rinse-Off Foaming Cleanser,21,4.6,"Water , Potassium Myristate , Glycerin , Potas...",0,0,0,0,0
702,Treatment,KORRES,"Black Pine 3D Sculpting, Firming & Lifting Fac...",75,4.8,"Water, Pentylene Glycol, Sinorhizobium Melilot...",1,1,1,1,1
283,Moisturizer,DIOR,Capture Youth Age-Delay Advanced Crème,95,3.9,Visit the Dior boutique,1,1,1,1,1


In [17]:
# List the available columns, then count how many products fall under each
# category stored in the "Label" column.
print(df.columns)
df["Label"].value_counts()  # Count occurrences of each label
# Note: category names such as 'Cleanser' or 'Face Mask' are values of the
# "Label" column, not column names, so they cannot be used as df[...] keys.



Index(['Label', 'Brand', 'Name', 'Price', 'Rank', 'Ingredients', 'Combination',
       'Dry', 'Normal', 'Oily', 'Sensitive'],
      dtype='object')


Label
Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: count, dtype: int64

In [18]:
#  Task 2: Focusing on One Product Category and Skin Type

# We focus only on **moisturizers for dry skin**.  
# To do this, we filter our dataset in two steps:
# 1. Select only rows whose "Label" column equals "Moisturizer"
# 2. Further filter for products whose "Dry" flag is 1 (marked as suitable for dry skin)

# We also reset the index to clean up the DataFrame.
 


# task 2
# Step 1: keep only the rows labelled as moisturizers.
is_moisturizer = df['Label'] == 'Moisturizer'
moisturizers = df[is_moisturizer]

# Step 2: keep the moisturizers flagged for dry skin and renumber the rows
# from 0 so later cells can address products by position.
suits_dry_skin = moisturizers['Dry'] == 1
moisturizers_dry = moisturizers[suits_dry_skin].reset_index(drop=True)

moisturizers_dry.head(10)  # Display the first 10 rows of moisturizers for dry skin


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1
5,Moisturizer,DRUNK ELEPHANT,Lala Retro™ Whipped Cream,60,4.2,"Water, Glycerin, Caprylic/ Capric Triglyceride...",1,1,1,1,0
6,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0
7,Moisturizer,KIEHL'S SINCE 1851,Ultra Facial Cream,29,4.4,"Water, Glycerin, Cyclohexasiloxane, Squalane, ...",1,1,1,1,1
8,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,Caprylic/Capric Triglyceride Dicaprylyl Carbon...,1,1,1,1,1
9,Moisturizer,SUNDAY RILEY,Luna Sleeping Night Oil,105,4.1,"Persea Gratissima (Extra Virgin, Cold Pressed ...",1,1,1,1,1


In [19]:
# Task 3: Tokenizing the Ingredients

# Each product lists its ingredients as a long string, separated by commas.  
# In this step, we split (tokenize) those strings into individual ingredients and convert them to lowercase.

# We create a **bag of words** by assigning a unique ID to every unique ingredient across all moisturizers for dry skin.


# task 3
# Map every distinct ingredient token to an integer ID, numbered in order of
# first appearance.
ingredient_ids = {}

for ingredient_text in moisturizers_dry['Ingredients']:
    # Lowercase the string, then split on the literal ", " separator
    # (tokens are used as-is; surrounding whitespace is not stripped).
    for token in ingredient_text.lower().split(', '):
        # setdefault only inserts unseen tokens, so the next free ID is always
        # the current size of the mapping.
        ingredient_ids.setdefault(token, len(ingredient_ids))

idx = len(ingredient_ids)  # next unused ID == number of distinct ingredients

list(ingredient_ids.items())[:10]  # Shows first 10 ingredient-ID pairs


[('algae (seaweed) extract', 0),
 ('mineral oil', 1),
 ('petrolatum', 2),
 ('glycerin', 3),
 ('isohexadecane', 4),
 ('microcrystalline wax', 5),
 ('lanolin alcohol', 6),
 ('citrus aurantifolia (lime) extract', 7),
 ('sesamum indicum (sesame) seed oil', 8),
 ('eucalyptus globulus (eucalyptus) leaf oil', 9)]

In [20]:
# Task 4: Creating the Document-Term Matrix

# We create a matrix where:
# - Each row represents a product
# - Each column represents an ingredient
# - A value of 1 means the product contains that ingredient

# This gives us a clean, numerical way to represent product ingredient information.


# #task 4
# Matrix dimensions: one row per product, one column per known ingredient.
n_products = len(moisturizers_dry)
n_ingredients = len(ingredient_ids)

# Start from an all-zero matrix and switch on the cells of ingredients present.
matrix = np.zeros((n_products, n_ingredients))

for row, ingredient_text in enumerate(moisturizers_dry['Ingredients']):
    # Column positions of this product's ingredients (unknown tokens are skipped).
    present_columns = [
        ingredient_ids[token]
        for token in ingredient_text.lower().split(', ')
        if token in ingredient_ids
    ]
    matrix[row, present_columns] = 1

# Label the columns with ingredient names for easier reading.
ingredient_matrix_df = pd.DataFrame(matrix, columns=list(ingredient_ids))
ingredient_matrix_df.head()


Unnamed: 0,algae (seaweed) extract,mineral oil,petrolatum,glycerin,isohexadecane,microcrystalline wax,lanolin alcohol,citrus aurantifolia (lime) extract,sesamum indicum (sesame) seed oil,eucalyptus globulus (eucalyptus) leaf oil,...,brazil nut oil,evening primrose oil,buriti oil,arnica oil,rosehip oil,calendula oil,kiwi fruit seed oil,totarol,rosemary leaf extract,natural fragrance (orange blossom & rose gardenia.)
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Task 5: Counting Ingredient Frequencies

# We count how often each ingredient appears across all moisturizers using Python’s `Counter`.

# This helps us understand which ingredients are most commonly used, which may be helpful later for filtering or feature selection.



# task 5
from collections import Counter

# Tally every ingredient token across the dry-skin moisturizers in one pass,
# using the same lowercase + ", " split as the earlier cells.
ingredient_counter = Counter(
    token
    for ingredient_text in moisturizers_dry['Ingredients']
    for token in ingredient_text.lower().split(', ')
)

ingredient_counter.most_common(10)


[('water', 129),
 ('glycerin', 120),
 ('phenoxyethanol', 100),
 ('butylene glycol', 88),
 ('dimethicone', 75),
 ('disodium edta', 64),
 ('caprylyl glycol', 63),
 ('sodium hyaluronate', 57),
 ('xanthan gum', 54),
 ('citric acid', 52)]

In [22]:
# Task 6: Finalizing the Cosmetic–Ingredient Matrix

# We rebuild the binary matrix from Task 4 under its final name — each row is a product and each column is an ingredient.

# This matrix is now ready for machine learning: we’ll use it in the next step to reduce dimensions.




# task 6
# Fresh zero matrix with the dimensions computed in task 4; `matrix` and
# `cosmetic_ingredient_matrix` refer to the same array.
cosmetic_ingredient_matrix = np.zeros((n_products, n_ingredients))
matrix = cosmetic_ingredient_matrix

for i in range(n_products):
    product_tokens = moisturizers_dry.loc[i, 'Ingredients'].lower().split(', ')
    for token in product_tokens:
        ingredient_index = ingredient_ids.get(token)
        # Tokens without an ID are ignored.
        if ingredient_index is not None:
            cosmetic_ingredient_matrix[i, ingredient_index] = 1

pd.DataFrame(cosmetic_ingredient_matrix).head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2223,2224,2225,2226,2227,2228,2229,2230,2231,2232
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Task 7: Dimensionality Reduction with t-SNE

# We use t-SNE, a machine learning algorithm, to reduce the matrix to 2 dimensions.

# This allows us to visually map how similar products are, based on their ingredients. Products with similar ingredients will be closer together in the plot.




# task 7
# Create a t-SNE model
# n_components=2 yields a two-column embedding (named 'x' and 'y' below);
# random_state is pinned, presumably so reruns give the same layout.
# NOTE(review): `max_iter` is the newer keyword name in scikit-learn; confirm
# the installed version accepts it.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
# Fit on the product-ingredient matrix; one embedded point per product row.
tsne_results = tsne.fit_transform(cosmetic_ingredient_matrix)

# Wrap the coordinates in a DataFrame so they can be referenced by name.
tsne_df = pd.DataFrame(tsne_results, columns=['x', 'y'])
tsne_df.head()


Unnamed: 0,x,y
0,84.786629,178.572891
1,10.324427,55.194786
2,-21.105343,-47.459324
3,151.035172,168.504623
4,189.258362,87.577316


In [24]:
# task 8
# Install Bokeh into the kernel's environment; it is used for the plots that follow.
# NOTE(review): the version is not pinned.
%pip install bokeh



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
# Task 8: Visualizing Products with Bokeh

# We create an interactive 2D scatter plot using Bokeh, based on the t-SNE output.

# Each dot represents a moisturizer. We can now visually explore product similarity using this plot.





#task 8
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
output_notebook()  # render Bokeh output inline in the notebook


# Adds a column to tsne_df in place; later cells read it when building tooltips.
tsne_df['index'] = tsne_df.index  # Create an index column for labeling

# Wrap the frame so glyphs can refer to its columns ('x', 'y', ...) by name.
source = ColumnDataSource(tsne_df)





In [26]:
# Base map of the t-SNE embedding: one marker per dry-skin moisturizer.
p = figure(title="Cosmetic Product Map (Moisturizers for Dry Skin)",
           x_axis_label='TSNE-X',
           y_axis_label='TSNE-Y',
           width=600,
           height=600)

# `scatter` draws circle markers by default; calling `circle` with a `size`
# argument is deprecated in recent Bokeh releases.
p.scatter('x', 'y', size=8, source=source, color='navy', alpha=0.6)
show(p)




In [27]:
#  Task 9: Making the Plot Interactive with Hover

# We add a hover tool to the Bokeh plot.  
# Now, when we move our mouse over a point, we can see the product’s name.

# This improves the usability of the visualization for real users.




#task 9
from bokeh.models import HoverTool

# Label each point with the product name, falling back to "Product <index>"
# for any row whose name is missing. (Previously the fallback label was
# assigned and then unconditionally overwritten, so it never had any effect.)
fallback_labels = "Product " + tsne_df['index'].astype(str)
tsne_df['tooltip'] = moisturizers_dry['Name'].fillna(fallback_labels)

# Rebuild the data source so it includes the new 'tooltip' column.
source = ColumnDataSource(tsne_df)

hover = HoverTool(tooltips=[("Product", "@tooltip")])
p = figure(title="Cosmetic Product Map with Hover",
           x_axis_label='TSNE-X',
           y_axis_label='TSNE-Y',
           width=800,
           height=600,
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'])

# `scatter` draws circle markers by default; calling `circle` with a `size`
# argument is deprecated in recent Bokeh releases.
p.scatter('x', 'y', size=8, source=source, color='teal', alpha=0.6)
show(p)
# Finalize the plot with hover tool



In [28]:
# Task 10: Color-Coding Products by Brand or Category

# We add color to the Bokeh plot using a feature like product brand or label.  
# This helps visually group products and understand ingredient trends across brands or categories.

# We also add a legend so users can click to hide/show brands on the plot.




# task 10
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10

# Colour the points by their real brand. Category10 provides at most 10
# colours, so keep the most frequent brands and pool all others (including
# missing values) under a single "Other" label.
MAX_NAMED_BRANDS = 9

brand_column = moisturizers_dry['Brand']
brands = brand_column.value_counts().index[:MAX_NAMED_BRANDS].tolist()
tsne_df['brand'] = brand_column.where(brand_column.isin(brands), 'Other')

unique_brands = tsne_df['brand'].unique().tolist()

# Rebuild the data source so it includes the 'brand' column.
source = ColumnDataSource(tsne_df)

# Slice the 10-colour palette instead of indexing Category10 by size, which
# only has entries for 3 to 10 colours.
brand_palette = Category10[10][:len(unique_brands)]
color_map = factor_cmap('brand', palette=brand_palette, factors=unique_brands)


In [29]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool

output_notebook()

# Show both the product label and its brand when hovering over a point.
hover = HoverTool(tooltips=[("Product", "@tooltip"), ("Brand", "@brand")])

p = figure(title="Moisturizers by Brand",
           x_axis_label='TSNE-X',
           y_axis_label='TSNE-Y',
           width=800,
           height=600,
           tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'])

# `scatter` draws circle markers by default; calling `circle` with a `size`
# argument is deprecated in recent Bokeh releases.
# legend_field groups the legend entries by the values of the 'brand' column.
p.scatter('x', 'y', size=10, source=source,
          color=color_map, legend_field='brand', alpha=0.6)

p.legend.location = "top_right"
p.legend.click_policy = "hide"  # clicking a legend entry hides that brand's points

show(p)




In [30]:
# Task 11: Comparing Two Moisturizers

# We select any two moisturizers and compare their ingredients:
# - Which ingredients are the same?
# - Which are unique to each?

# This is helpful for someone choosing between two products — especially if they have allergies or preferences.





# task 11
# Pick the two products to compare by their row position.
product1 = moisturizers_dry.loc[5]
product2 = moisturizers_dry.loc[12]

# Turn each ingredient string into a set of lowercase tokens.
ingredients1 = set(product1['Ingredients'].lower().split(', '))
ingredients2 = set(product2['Ingredients'].lower().split(', '))

# Shared ingredients, and those found in only one of the two products.
common_ingredients = ingredients1 & ingredients2
unique_to_product1 = ingredients1.difference(ingredients2)
unique_to_product2 = ingredients2.difference(ingredients1)

# Print each heading followed by its ingredient set.
report_sections = [
    ("🟢 Common Ingredients:", common_ingredients),
    ("\n🔵 Unique to Product 1:", unique_to_product1),
    ("\n🔴 Unique to Product 2:", unique_to_product2),
]
for heading, ingredient_set in report_sections:
    print(heading)
    print(ingredient_set)

print(f"\n🧴 Product 1: {product1['Name']}")
print(f"🧴 Product 2: {product2['Name']}")


🟢 Common Ingredients:
{'sodium hydroxide', 'caprylyl glycol', 'cetearyl alcohol', 'xanthan gum', 'glyceryl stearate se', 'glycerin', 'water', 'stearic acid'}

🔵 Unique to Product 1:
{'pentylene glycol', 'ceteareth-20', 'trisodium ethylenediamine disuccinate', 'citric acid', 'passiflora edulis seed oil', 'sodium hyaluronate crosspolymer', 'isopropyl isostearate', 'carbomer', 'schinziophyton rautanenii kernel oil', 'sclerocarya birrea seed oil', 'phenoxyethanol', 'polyglyceryl-6 ximenia americana seedate', 'citrullus lanatus (watermelon) seed oil', 'chlorphenesin', 'caprylic/ capric triglyceride', 'adansonia digitata seed oil', 'ethylhexylglycerin.', 'pseudozyma epicola/camellia sinensis seed oil/glucose/glycine soja meal/malt extract/yeast extract ferment filtrate (pseudozyma epicola/camellia sinensis seed oil/glucose/yeast extract ferment filtrate)', 'plantago lanceolata leaf extract'}

🔴 Unique to Product 2:
{'squalane', 'camellia sinensis leaf extract', 'butyrospermum parkii butter',

In [31]:
# task 12
#  Cosmetic Ingredient Recommender

# ##  Project Summary
# Choosing new skincare products can be hard — especially for people with sensitive or dry skin.  
# Many products contain chemical ingredients that are hard to understand without a science background.

# In this project, I built a **content-based recommendation system** using the ingredients of over 1400 cosmetics from Sephora. The goal is to **compare and visualize products** based on their chemical makeup.

# ---

# ## ✅ What I Did

# - 🔍 Filtered the data for moisturizers that suit dry skin
# - 🧠 Tokenized ingredients and created a bag-of-words model
# - 🧾 Built a product–ingredient matrix
# - 🌀 Applied t-SNE to reduce data to 2D
# - 📊 Created an interactive Bokeh plot to explore product similarity
# - 🧪 Added a hover tool and color-coding to explore brands or labels
# - 🔬 Compared any two products based on their ingredient overlap

# ---

# ##  Tools Used
# - Python (pandas, numpy)
# - scikit-learn (TSNE)
# - Bokeh (interactive plots)
