# Analysis & Visualization of Trending Skincare Products & Ingredients

In [1]:
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import Counter
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

In [2]:
# Load the product table; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv(
    filepath_or_buffer='skincare_products.csv',
    index_col=0,
)

In [3]:
# Preview the first rows of the loaded table (rendered as the cell output).
df.head()

Unnamed: 0,Name,Brand,Price,Link,Details,Ingredients
0,A-Passioni™ Retinol Cream,DRUNK ELEPHANT,$74.00,https://www.sephora.com/product/a-passioni-ret...,"What it is: A clean, cutting-edge formula with...",['\n- Retinol 1%: Improves the look and feel o...
1,Violet-C Brightening Serum 20% Vitamin C + 10%...,TATCHA,$88.00,https://www.sephora.com/product/violet-c-brigh...,What it is: A 20 percent vitamin C and 10 perc...,['\n- Unique 20% Vitamin C Blend: A fast-actin...
2,The Dewy Skin Cream,TATCHA,$68.00,https://www.sephora.com/product/the-dewy-skin-...,What it is: A rich cream that feeds skin with ...,['\n- Japanese Purple Rice: Packed with nutrie...
3,A+ High-Dose Retinol Serum,SUNDAY RILEY,$85.00,https://www.sephora.com/product/a-high-dose-re...,What it is: An advanced-strength retinoid seru...,['\n- Retinol Ester: Helps reduce the appearan...
4,Glow2OH™ Dark Spot Toner,OLEHENRIKSEN,$28.00,https://www.sephora.com/product/glow2oh-dark-s...,"What it is: A potent toner, supercharged with ...",['\n- High-potency AHAs (Glycolic Acid and Lac...


In [4]:
# Remove the Link and Details columns from df in place.
for unused_column in ('Link', 'Details'):
    del df[unused_column]

In [5]:
# Strip the dollar sign from every price. The pattern is written as a raw
# string so that `\$` reaches the regex engine as an escaped literal `$`;
# in a plain string literal `\$` is an invalid escape sequence, which newer
# Python versions warn about.
# NOTE(review): this only removes the symbol; the values are not converted to
# numbers here - confirm whether a numeric Price is needed downstream.
df['Price'] = df['Price'].replace({r'\$': ''}, regex=True)

#### Create ingredient list based on products' highlighted ingredients

In [6]:
# Ingredient names to search for in each product's Ingredients text.
# The entries are tested verbatim with `in` by the matching cells below, so
# spelling and capitalisation matter, and a short name can also be found
# inside a longer one (e.g. 'Honey' inside 'White Honey' / 'Manuka Honey',
# 'Apple' inside 'Pineapple').
# NOTE(review): a few spellings look unusual ('Kombutcha', 'Bilboa',
# 'Mary Thistle') - confirm they match the scraped text before changing them.
ingredient_list = ['Vitamin C', 'Vitamin A', 'Vitamin E', 'Argan Oil', 'Safflower Oil', 'Sesame Oil', 'Avocado',
              'Grape Seed Oil', 'Rose', 'Retinol', 'White Honey', 'Peptides', 'Vitamin F', 'AHA', 'Angelica Root',
              'Lotus', 'Peach', 'Witch Hazel', 'Lemon', 'Sugarcane', 'Licorice Root', 'Sandalwood', 'Chamomile',
              'Maqui', 'Squalene', 'Viniferine', 'Hyaluronic Acid', 'Butylene Glycol', 'Collagen', 'Ascorbic Acid',
              'Lactic Acid', 'Kojic Acid', 'Alguronic Acid', 'Ceramides', 'Mary Thistle', 'Linoleic Acid', 'Charcoal',
              'Acid Blend', 'Marine Plant Extract', 'Maracuja', 'Antioxidant Blend', 'Trimoist KMF', 'Glacial Glycoprotein',
              'Cerium', 'Algae', 'Kale', 'Spinach', 'Green Tea', 'Manuka Honey', 'PHA', 'Tiger Grass', 'Heather',
              'Eucalyptus', 'Cannabis Sativa', 'Arnica Montana', 'Oregano', 'Hemp Seed', 'Passionfruit', 'Black Tea',
              'Goji', 'Swiss Glacier Water', 'Oil Blend', 'Caffeine', 'Mudar', 'Watermelon', 'Purple Rice', 'Botanical Extract',
              'Bakuchiol', 'Seawater', 'Aloe Vera', 'Meadowfoam', 'Blackcurrant', 'Phytic Acid', 'Pumpkin', 'Papaya', 'Pineapple',
              'Honey', 'Oatmeal', 'Gardenia', 'Thyme', 'Salicylic Acid', 'Sulfur', 'Calamine', 'Marula', 'Matcha',
              'Kombutcha', 'Cactus', 'Niacinamide', 'Revitelix', 'SymRelief', 'Saponins', 'Mushroom', 'Coconut',
              'Caviar Lime', 'Hot Pepper', 'Eyebright', 'Lavender', 'Magnesium', 'Tara', 'Goat Milk', 'Kinetin', 'Lipids',
              'Palmitoyl', 'Irish Moor Mud', 'Cucumber', 'Hydroxyapatite', 'Orange', 'Glycolic Acid', 'Grapefruit',
              'Hydrocolloid', 'Peppermint', 'Apple', 'Gooseberry', 'Arbutin', 'Alfalfa', 'Adaptogen', 'Ascorbate', 'Bifidus',
              'Silk', 'Cocoa Butter', 'Almond', 'Marshmallow', 'White Tea', 'Shea Butter', 'Benzoyl Peroxide', 'Allantoin',
              'Ivory Palm Seed', 'Apricot', 'Jojoba Seed', 'Ginseng', 'Amino Acid', 'Chestnut', 'Beetroot', 'Tea Tree',
              'Zinc Oxide', 'Sunflower', 'Bearberry', 'Sophora', 'Hydroquinone', 'Sea Buckthorn Berry', 'Citric Acid', 'Pomegranate',
              'Diamond', 'Avobenzone', 'Bilboa', 'Elastin', 'Octocrylene', 'Octisalate']

#### Loop through product ingredients to find ingredients in ingredient_list and add them to a new list

In [7]:
# Ingredients entry of every product, in row order.
product_ingredients = df['Ingredients'].tolist()

# For each product, keep the names from ingredient_list contained in its entry.
# NOTE: `in` is case-sensitive; when the entry is a string it is a substring
# test, so a short name is also found inside a longer word.
found_ingredients = [
    [name for name in ingredient_list if name in entry]
    for entry in product_ingredients
]

# Flatten the per-product matches into a single list for counting.
found_ingredients_list = list(itertools.chain.from_iterable(found_ingredients))

#### Count frequency of ingredients

In [8]:
# Tally how many times each ingredient name appears in the flattened matches.
count = Counter()
count.update(found_ingredients_list)

#### Sort by most common ingredients and get top ten ingredients

In [9]:
# Turn the counter into a two-column table (Ingredients, Count), ordered from
# the most to the least frequent ingredient.
df2 = (
    pd.DataFrame.from_dict(count, orient='index')
    .reset_index()
    .rename(columns={'index': 'Ingredients', 0: 'Count'})
    .sort_values(['Count'], ascending=False)
)

# The first ten rows of the ranking.
top_ingredients = df2.head(10)

In [10]:
# Display the ten most frequent ingredients and their counts.
top_ingredients

Unnamed: 0,Ingredients,Count
5,Hyaluronic Acid,27
2,Vitamin C,14
29,Collagen,12
36,Salicylic Acid,10
118,Benzoyl Peroxide,9
17,Glycolic Acid,9
60,Green Tea,8
15,Chamomile,8
0,Retinol,7
73,Peptides,7


#### Repeat the process for brands

In [11]:
# Count how many products each brand has in the table.
count_brands = Counter(df['Brand'])

# Two-column table (Brand, Count), ordered from the most to the fewest products.
df3 = (
    pd.DataFrame.from_dict(count_brands, orient='index')
    .reset_index()
    .rename(columns={'index': 'Brand', 0: 'Count'})
    .sort_values(['Count'], ascending=False)
)

# The first ten rows of the ranking.
top_brands = df3.head(10)

In [12]:
# Display the ten brands with the most products and their counts.
top_brands

Unnamed: 0,Brand,Count
27,MARIO BADESCU,22
45,PROACTIV,20
14,PETER THOMAS ROTH,6
4,FRESH,6
16,SEPHORA COLLECTION,6
29,OBAGI CLINICAL,6
23,TARTE,5
0,DRUNK ELEPHANT,4
6,KIEHL'S SINCE 1851,4
12,GLAMGLOW,4


#### Get all products that contain at least one of the top ten ingredients

In [13]:
# Names of the ten most frequent ingredients, in ranking order. They are read
# from the top_ingredients table computed earlier rather than from a
# hand-copied literal, so the list cannot drift out of sync with the counts
# when the CSV or ingredient_list changes.
top_ten_ingredients = top_ingredients['Ingredients'].tolist()

In [14]:
# For each product (same order as product_ingredients), list the top-ten
# ingredient names contained in its Ingredients entry; the list is empty when
# none of them is present.
trending_ingredients = [
    [name for name in top_ten_ingredients if name in entry]
    for entry in product_ingredients
]

In [15]:
# One entry of trending matches per row of df, in row order.
trending_ingredients_list = pd.Series(trending_ingredients)

# Attach the matches to the products by row position *before* any rows are
# removed. Attaching them after dropna()/reset_index() would pair the matches
# with the renumbered rows, shifting them onto the wrong products whenever a
# row with a missing Name is dropped.
top_products = pd.DataFrame({
    'Name': df['Name'].to_numpy(),
    'Price': df['Price'].to_numpy(),
    'Trending Ingredients': trending_ingredients_list,
})

# Discard rows without a product name and renumber the remaining rows.
top_products = top_products.dropna(subset=['Name']).reset_index(drop=True)

# Keep only the products that matched at least one trending ingredient.
has_trending = top_products['Trending Ingredients'].map(len) > 0
top_products = top_products[has_trending]

In [16]:
# Preview the first products that contain a trending ingredient.
top_products.head()

Unnamed: 0,Name,Price,Trending Ingredients
0,A-Passioni™ Retinol Cream,74.0,[Retinol]
1,Violet-C Brightening Serum 20% Vitamin C + 10%...,88.0,[Vitamin C]
2,The Dewy Skin Cream,68.0,[Hyaluronic Acid]
3,A+ High-Dose Retinol Serum,85.0,[Retinol]
4,Glow2OH™ Dark Spot Toner,28.0,"[Glycolic Acid, Chamomile]"


## Data visualizations using plotly

In [17]:
# Set matplotlib's default figure size to [20, 10].
# NOTE(review): the charts in the following cells are built with plotly
# objects (go.Bar / py.iplot) - confirm this matplotlib setting is still needed.
plt.rcParams.update({'figure.figsize': [20, 10]})

#### Bar chart plotting trending skincare products against prices

In [18]:
# Chart title and y-axis label.
layout = go.Layout(
    title="Trending Skincare Products",
    yaxis={'title': 'Price'},
)

# One bar per trending product, with the price as bar height.
trace_products = go.Bar(
    x=top_products['Name'],
    y=top_products['Price'],
    marker={'color': '#7F11FB'},
)
data = [trace_products]

fig = go.Figure(data=data, layout=layout)

# NOTE(review): `py` is plotly.plotly; newer plotly releases moved that module
# to the separate chart_studio package - confirm against the installed version.
py.iplot(fig, filename='top_products')

#### Bar chart plotting top ten skincare ingredients against counts

In [19]:
# Chart title and axis labels.
layout = go.Layout(
    title="Top Ten Skincare Ingredients",
    xaxis={'title': 'Ingredients'},
    yaxis={'title': 'Count'},
)

# One bar per top-ten ingredient, with its count as bar height.
trace_ingredients = go.Bar(
    x=top_ingredients['Ingredients'],
    y=top_ingredients['Count'],
    name='Ingredients',
    marker={'color': '#06C45D'},
)
data = [trace_ingredients]

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='top_ingredients')

#### Bar chart plotting top ten skincare brands against counts

In [20]:
# Chart title and axis labels.
layout = go.Layout(
    title="Top Ten Skincare Brands",
    xaxis={'title': 'Brands'},
    yaxis={'title': 'Count'},
)

# One bar per top-ten brand, with its product count as bar height.
trace_brands = go.Bar(
    x=top_brands['Brand'],
    y=top_brands['Count'],
    name='Brands',
    marker={'color': '#F25207'},
)
data = [trace_brands]

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='top_brands')