# Understanding Your Skin Care Product Ingredients 

## Part I.  Import Modules 

In [1236]:
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException

In [2]:
import re
import statistics as stat

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

In [4]:
# Base directory that is later joined with '/chromedriver' to locate the driver binary.
# NOTE(review): this assumes sys.path[0] is the notebook's folder, which depends on
# how the kernel was launched — confirm, otherwise chromedriver will not be found.
root = sys.path[0]

Notice: Please download the right driver for your browser. [Click to see the guide on driver Installation](https://selenium-python.readthedocs.io/installation.html)

In [1246]:
# Path of the chromedriver binary, expected to sit directly under `root`.
chrome_path = root + '/chromedriver'

# Smoke test: start Chrome once to confirm the driver is installed correctly.
driver_test = webdriver.Chrome(executable_path = chrome_path)
# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() would only close the current window.
driver_test.quit()

## Part II.  Web Scraping 

#### 1. Brand Name, Product Name, Sub-url

In [3]:
def extractinfo(driver, brand, product, suburl):
    '''
    Collect the brand, product name and product-page link of every item on
    the page currently loaded in the webdriver.

    driver  : the webdriver showing a listing page
    brand   : list that receives the text of each "review-brand" anchor
    product : list that receives the text of each "review-product" anchor
    suburl  : list that receives the href of each "review-product" anchor

    The three lists are extended in place and returned as a tuple
    (brand, product, suburl).
    '''
    brand_anchors = driver.find_elements_by_xpath('//a[@class="review-brand"]')
    brand.extend(anchor.text for anchor in brand_anchors)

    product_anchors = driver.find_elements_by_xpath('//a[@class="review-product"]')
    for anchor in product_anchors:
        # name and link come from the same anchor, so both lists stay aligned
        product.append(anchor.text)
        suburl.append(anchor.get_attribute('href'))

    return brand, product, suburl

In [4]:
# This function scrolls the page down exactly `n_scroll` times.
def scrollDown(driver, n_scroll, pause=2):
    '''
    Press PAGE_DOWN on the page body `n_scroll` times.

    driver   : the webdriver whose current page is scrolled
    n_scroll : number of PAGE_DOWN key presses; 0 or a negative value scrolls nothing
    pause    : seconds to sleep after each key press (default 2)

    Returns the same driver that was passed in.
    '''
    body = driver.find_element_by_tag_name("body")
    # range() gives exactly n_scroll presses; the previous `while n_scroll >= 0`
    # loop pressed the key one extra time.
    for _ in range(n_scroll):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(pause)
    return driver

In [5]:
def extract_all_items( driver, tck):
    '''
    Walk through every listing page of one product category and collect the
    brand name, product name and product-page url of each item.

    driver : the webdriver used for browsing
    tck    : category path appended to the site's skin-care url
             (one of the entries of `ticker`)

    Returns three lists: brand names, product names, product-page urls.
    Prints the number of pages read and the total elapsed time.
    '''
    # open the first listing page, asking for 96 items per page
    url_open = 'https://www.beautypedia.com/skin-care/' + tck + '/?size=96'
    driver.get(url_open)
    brandname, productname, suburls = [], [], []

    # start the clock once so the final message reports the whole run,
    # not only the last page
    t0 = time.time()

    # collect information on the first page
    # NOTE(review): the first page is read without scrolling, unlike later pages —
    # confirm that all of its items are present before scrolling.
    brandname, productname, suburls = extractinfo(driver, brandname, productname, suburls)

    # the page selector has one option per page, so its length is the page count
    sel_num = Select( driver.find_element_by_xpath('//div[@class="page-number"]/select') )
    num_pages = len(sel_num.options)

    # read from the 2nd page to the last page
    click_next_xpath = '//div[@class="page-buttons"]/a[@class="page-button next-page"]'
    for _ in range(num_pages - 1):
        btn = driver.find_element_by_xpath(click_next_xpath)
        btn.click()
        time.sleep(2)  # wait for the page to load
        scrollDown(driver, 10)
        brandname, productname, suburls = extractinfo(driver, brandname, productname, suburls)
        time.sleep(2)
    print("Read {} Pages and Finished in {} seconds".format(num_pages,time.time() - t0))
    return brandname,productname,suburls

In [7]:
# Url fragments of the product categories to scrape:
# daytime moisturizers, nighttime moisturizers and eye creams.
ticker = [
    'moisturizer/daytime-moisturizer-reviews',
    'moisturizer/nighttime-moisturizer-reviews',
    'eyes/eye-cream-and-treatment-reviews',
]

In [8]:
# day moisture: scrape the listing of daytime moisturizers
browser = webdriver.Chrome(executable_path = chrome_path)
try:
    brand_names_1, product_names_1, sub_urls_1  = extract_all_items(browser, ticker[0])
finally:
    # always end the WebDriver session, even if scraping fails part-way,
    # so no Chrome / chromedriver process is left running
    browser.quit()

Start Page 1
Done Page 1
Start Page 2
Done Page 2
Start Page 3
Done Page 3
Start Page 4
Done Page 4
Start Page 5
Done Page 5


In [14]:
# night moisture: scrape the listing of nighttime moisturizers
browser = webdriver.Chrome(executable_path = chrome_path)
try:
    brand_names_2, product_names_2, sub_urls_2  = extract_all_items(browser, ticker[1])
finally:
    # always end the WebDriver session, even if scraping fails part-way,
    # so no Chrome / chromedriver process is left running
    browser.quit()

Start Page 1
Done Page 1
Start Page 2
Done Page 2
Start Page 3
Done Page 3
Start Page 4
Done Page 4
Start Page 5
Done Page 5
Start Page 6
Done Page 6
Start Page 7
Done Page 7
Start Page 8
Done Page 8
Start Page 9
Done Page 9
Start Page 10
Done Page 10
Start Page 11
Done Page 11
Start Page 12
Done Page 12
Start Page 13
Done Page 13


In [21]:
# eye cream: scrape the listing of eye creams and treatments
browser = webdriver.Chrome(executable_path = chrome_path)
try:
    brand_names_3, product_names_3, sub_urls_3  = extract_all_items(browser, ticker[2])
finally:
    # always end the WebDriver session, even if scraping fails part-way,
    # so no Chrome / chromedriver process is left running
    browser.quit()

Start Page 1
Done Page 1
Start Page 2
Done Page 2
Start Page 3
Done Page 3
Start Page 4
Done Page 4
Start Page 5
Done Page 5


In [22]:
# Save each category's listing (brand, product, product-page url) to its own csv.
listing_columns = ['brand_name', 'product_name', 'suburl']
listing_frames = []
for scraped_lists, csv_name in (
    ([brand_names_1, product_names_1, sub_urls_1], 'day_moistrue.csv'),
    ([brand_names_2, product_names_2, sub_urls_2], 'night_moistrue.csv'),
    ([brand_names_3, product_names_3, sub_urls_3], 'eye_cream.csv'),
):
    # one column per scraped list, one row per product
    frame = pd.DataFrame(np.column_stack(scraped_lists), columns=listing_columns)
    frame.to_csv(csv_name, index=False, encoding='utf-8')
    listing_frames.append(frame)

df_1, df_2, df_3 = listing_frames

#### 2. Price, Size, Rating, Ingredients

In [604]:
# Reload the three category listings saved as csv by the previous section.
day_moist, night_moist, eye_cream = (
    pd.read_csv(csv_name)
    for csv_name in ('day_moistrue.csv', 'night_moistrue.csv', 'eye_cream.csv')
)
# display(day_moist.sample(5)),display(night_moist.sample(5)),display(eye_cream.sample(5))

The function `append_new_col` visits the page of every product listed in a dataframe and extracts its details, i.e. price, rating, and ingredients.

In [26]:
def append_new_col(driver, df):
    '''
    Visit the page of every product in `df` and scrape its details.

    driver : the webdriver used for browsing
    df     : dataframe with a 'suburl' column holding the product-page urls

    Returns three lists aligned with the rows of `df` (in row order):
    price text, rating (last character of the rating element's class
    attribute) and ingredients text ('No Info' when the ingredients tab or
    its content is not found).

    A missing price or rating element raises NoSuchElementException.
    '''
    price, rating, ingredients = [], [], []

    price_xpath = '//div[@class="pricing-info"]/span[@class="price"]'
    rating_xpath = '//div[@class="expert-rating"]/span'
    ingredients_tab_xpath = '//h3[@class="tab-title ingredients"]'
    ingredients_text_xpath = '//div[@class="ingredients"]/div'

    # Iterate over the url values themselves instead of index labels, so the
    # function also works when the frame's index is not unique.
    for url in df['suburl']:
        driver.get(url)
        time.sleep(1)
        price.append(driver.find_element_by_xpath(price_xpath).text)
        # the rating is encoded as the last character of the element's class
        rating.append(driver.find_element_by_xpath(rating_xpath).get_attribute("class")[-1])
        try:
            # expand the ingredients tab with a JavaScript click, then read it
            btn = driver.find_element_by_xpath(ingredients_tab_xpath)
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(2)
            ingredients.append(driver.find_element_by_xpath(ingredients_text_xpath).text)
        except NoSuchElementException:
            ingredients.append('No Info')
    return  price, rating, ingredients

To save processing time, I decided not to scrape the details of every product in all three categories.

For the daytime moisturizers and the eye creams, I pass the whole dataset to the function `append_new_col`.
However, the running time is long (> 30 minutes),
and the nighttime moisturizer list has over 1000 entries, which is more than the daytime moisturizer and eye cream lists combined,
so I decided to randomly sample 250 products from the nighttime moisturizers.

In [10]:
# ! Notice: This cell and the following 2 cells take a long time to run
browser = webdriver.Chrome(executable_path = chrome_path)
try:
    day_moist_price, day_moist_rating, day_moist_ingredients = append_new_col(browser, day_moist)
finally:
    # always end the WebDriver session, even if scraping fails part-way
    browser.quit()

In [11]:
# Scrape price, rating and ingredients for every eye cream (slow).
browser = webdriver.Chrome(executable_path = chrome_path)
try:
    eye_cream_price, eye_cream_rating, eye_cream_ingredients = append_new_col(browser, eye_cream)
finally:
    # always end the WebDriver session, even if scraping fails part-way
    browser.quit()

In [27]:
# Scrape details for a reproducible random sample of 250 nighttime moisturizers (slow).
night_moist_part = night_moist.sample(n = 250, replace=False, random_state=1)
browser = webdriver.Chrome(executable_path = chrome_path)
try:
    night_moist_price, night_moist_rating, night_moist_ingredients = append_new_col(browser, night_moist_part)
finally:
    # always end the WebDriver session, even if scraping fails part-way
    browser.quit()

In [757]:
# Attach the scraped details to the daytime-moisturizer listing and tag the category.
detail_columns = ['price', 'rating', 'ingredients']
to_add = pd.DataFrame(
    np.column_stack([day_moist_price, day_moist_rating, day_moist_ingredients]),
    columns=detail_columns,
)
# DM: abbrev for Daytime Moisture
day_moist_2 = pd.concat([day_moist, to_add], axis=1).assign(label='DM')

In [758]:
# Attach the scraped details to the eye-cream listing and tag the category.
detail_columns = ['price', 'rating', 'ingredients']
to_add = pd.DataFrame(
    np.column_stack([eye_cream_price, eye_cream_rating, eye_cream_ingredients]),
    columns=detail_columns,
)
# EC: abbrev for Eye Cream
eye_cream_2 = pd.concat([eye_cream, to_add], axis=1).assign(label='EC')

In [759]:
# Attach the scraped details to the sampled nighttime-moisturizer listing and tag the category.
detail_columns = ['price', 'rating', 'ingredients']
to_add = pd.DataFrame(
    np.column_stack([night_moist_price, night_moist_rating, night_moist_ingredients]),
    columns=detail_columns,
)
# The sample keeps its original row labels; reset them so the rows line up
# positionally with `to_add`.
sampled_listing = night_moist_part.reset_index(drop=True)
# NM: abbrev for Nighttime Moisture
night_moist_2 = pd.concat([sampled_listing, to_add], axis=1).assign(label='NM')

In [760]:
# Stack the three labelled category tables (day, eye, night) and save the result.
category_tables = [day_moist_2, eye_cream_2, night_moist_2]
cosme = pd.concat(category_tables, axis=0)
cosme.to_csv('cosmetic.csv', index=False, encoding='utf-8')

## Part III. Preprocessing 

#### References 
https://www.datacamp.com/community/tutorials/introduction-t-sne 

In [1207]:
# Reload the combined product table from 'cosmetic.csv', so that preprocessing
# can start from the saved file.
cosm = pd.read_csv('cosmetic.csv')
#display(cosm.sample(10))

In [1208]:
# Drop the suburl column, remove products whose name occurs more than once,
# and drop products without ingredients information.
cosm = (
    cosm
    # errors='ignore' keeps the cell re-runnable after suburl is already gone
    .drop(columns=['suburl'], errors='ignore')
    # NOTE(review): keep=False discards every row of a repeated product name
    # (no copy is kept) — confirm this is intended rather than keep='first'.
    .drop_duplicates(subset='product_name', keep=False)
    .dropna(how='any', subset=['ingredients'])
)
# The scraper stores the placeholder text 'No Info' when a page has no
# ingredients list; that is not a missing value, so dropna() does not remove it.
cosm = cosm[cosm['ingredients'] != 'No Info']

In [1209]:
# Spot-check the cleaned table on a single brand.
cosm.loc[cosm['brand_name'] == 'BeautiControl']

Unnamed: 0,brand_name,product_name,price,rating,ingredients,label
230,BeautiControl,BC Spa Facial Defend & Restore Moisture Creme ...,34.0,2,"Active Ingredients: Homosalate, Oxybenzone, Oc...",DM
232,BeautiControl,BC Spa Facial Defend & Restore Moisture Lotion...,34.0,4,"Active Ingredient: Titanium Dioxide, Other Ing...",DM
575,BeautiControl,"Regeneration Tight, Firm & Fill Extreme Eye Co...",65.0,1,"Water, Glycerin, Magnesium Aluminum Silicate, ...",EC
671,BeautiControl,"Regeneration Tight, Firm & Fill Eye Firming Serum",45.0,5,"Water, Glycerin, Hexapeptide-10, Palmitoyl Tri...",EC
812,BeautiControl,BC Spa Solutions Under Eye Dark Circle & Puffi...,30.0,1,"Water, Butylene Glycol, Propylene Glycol, Glyc...",EC
814,BeautiControl,BC Spa Facial Restructuring Eye Creme,35.0,1,"Water, Caprylic/Capric Triglyceride, Butylene ...",EC
815,BeautiControl,Platinum Regeneration Rejuvenating Eye Treatment,42.0,1,"Water, Glycerin, Sodium Polyacrylate, Dipropyl...",EC
893,BeautiControl,BC Spa Facial Defend & Restore Night Creme,38.0,1,"Water, Caprylic/Capric Triglyceride, Glycerin,...",NM
1037,BeautiControl,Skinlogics Essentials Moisturizer,28.0,3,"Water, C12-15 Alkyl Benzoate, Caprylic/Capric ...",NM


For this project, I restrict my analysis to the products whose price is at or above the median price within their category.

In [820]:
# Median price per category label, as {label: median price}.
# pandas' median skips missing prices, whereas statistics.median sorts the raw
# values and gives unreliable results when NaN is present.
# sort=False keeps the labels in order of first appearance, like cosm.label.unique().
med_price_dict = cosm.groupby('label', sort=False)['price'].median().to_dict()

In [821]:
# Keep, for every category, only the products priced at or above that
# category's median price.
# The per-category slices are collected first and concatenated once: growing an
# initially empty DataFrame with concat in a loop is slow and can turn typed
# columns (e.g. price) into object dtype.
category_parts = [
    cosm[(cosm['label'] == label) & (cosm['price'] >= median_price)]
    for label, median_price in med_price_dict.items()
]
if category_parts:
    sub_cosm = pd.concat(category_parts, axis=0)
else:
    # no categories at all: keep an empty frame with the same columns
    sub_cosm = cosm.iloc[0:0]
sub_cosm = sub_cosm.reset_index(drop=True)
# sub_cosm.shape

In [1229]:
# Clean-up rules applied, in this order, to every raw ingredients string.
# Each entry is (compiled pattern, replacement text).
cleanup_steps = [
    (re.compile(r"\s*\([^\)]+\)"), ''),         # parentheses and the text inside them
    (re.compile(r"\w+\s*\w+:\s"), ''),          # labels such as "Active Ingredients: "
    (re.compile(r"\w+\.*\w*\s*%"), ''),         # percentages
    (re.compile(r";"), ', '),                   # semicolons become comma separators
    (re.compile(r"\.{1}\s+"), ', '),            # a period followed by whitespace becomes a separator
    (re.compile(r"\.$"), ''),                   # trailing period
    (re.compile(r"\\\s{1}|\/\s{1}"), ''),       # a slash or backslash followed by one whitespace
    (re.compile(r"\\|\/"), ' '),                # remaining slashes and backslashes become spaces
]

ingrednt = []
for raw_text in sub_cosm['ingredients']:
    cleaned = raw_text
    for pattern, replacement in cleanup_steps:
        cleaned = pattern.sub(replacement, cleaned)
    ingrednt.append(cleaned)

In [995]:
# ingredient_idx maps each distinct ingredient to a column index (its length is
# the number of different ingredients); corpus holds one token list per product.
ingredient_idx = {}
corpus = []
idx = 0

# Tokenize every cleaned ingredients string.
for ingredients in ingrednt:
    tokens = []
    for piece in ingredients.split(','):
        token = piece.lower().strip(' \t\n\r')
        # keep only tokens longer than one character
        if len(token) > 1:
            tokens.append(token)
    corpus.append(tokens)

    # give every ingredient seen for the first time the next free index
    for ingredient in tokens:
        if ingredient in ingredient_idx:
            continue
        ingredient_idx[ingredient] = idx
        idx += 1

In [1231]:
# Preview the start of the alphabetically sorted vocabulary; printing all of it
# floods the notebook output.
sorted(ingredient_idx)[:40]

['2 hexanediol',
 '2 hexanediol& sodium hyaluronate & aqua',
 '2- hexanediol',
 '2-0-ethyl ascorbic acid',
 '2-hexanediol',
 '2-oleamido-1',
 '2-oleamindo-1',
 '2-olemido-1',
 '3-aminopropane sulfonic acid',
 '3-hexenol',
 '3-mthyl-4--3-butne-2-one',
 '3-o-ethyl ascorbic acid',
 '3-octadecandiol',
 '3-octadecanediol',
 '3-octadedecanediol',
 '4-t-butylcyclohexanol',
 '6-naphthalate',
 '7-dehydrocholesterol',
 '7-dehydrocholestriol',
 '? titanium dioxide',
 '?cetearyl glucoside',
 '?glycine',
 '?pentylene glycol',
 '?xanthan gum',
 '[+ - mica',
 '[+ -: iron oxides]',
 'abies sibirica oil',
 'acacia dealbata flower wax',
 'acacia decurrens flower wax',
 'acacia decurrens jojoba sunflower seed wax polyglyceryl-3 esters',
 'acacia farnesiana flower wax',
 'acacia senegal',
 'acacia senegal gum',
 'acacia senegal gum extract',
 'acanthopanax senticosus root extract',
 'acer saccharinum extract',
 'acer saccharum extract',
 'acetic acid',
 'acetyl carnitine hcl',
 'acetyl citrull amido argin

In [1005]:
# M: number of cosmetic products, N: number of distinct ingredients
M, N = len(sub_cosm), len(ingredient_idx)

# Product-by-ingredient matrix, all zeros for now
A = np.zeros((M, N))

In [1007]:
# Define the oh_encoder function
def oh_encoder(tokens):
    '''
    Encode one product's ingredient tokens as a 0/1 vector of length N.

    tokens : iterable of ingredient names, each a key of ingredient_idx

    Returns a numpy array holding 1 at the index of every listed ingredient
    and 0 elsewhere.
    '''
    encoded = np.zeros(N)
    for position in (ingredient_idx[token] for token in tokens):
        encoded[position] = 1
    return encoded

In [1009]:
# Fill the document-term matrix: one row per cosmetic, one column per ingredient.
for row, tokens in enumerate(corpus):
    A[row, :] = oh_encoder(tokens)

In [1011]:
# Dimension reduction with t-SNE.
# t-SNE is stochastic, so a fixed random_state is set to make the embedding
# (and the plot built from it) reproducible between runs.
model = TSNE(n_components = 2, learning_rate = 200, random_state = 42)
tsne_features = model.fit_transform(A)
# Store the two embedding coordinates as columns X and Y
sub_cosm['X'] = tsne_features[:, 0]
sub_cosm['Y'] = tsne_features[:, 1]

In [1055]:
# Preview the first rows of the filtered product table.
sub_cosm.head(20)

Unnamed: 0,brand_name,product_name,price,rating,ingredients,label,X,Y
0,Supergoop!,Superscreen Daily Moisturizer Broad Spectrum S...,38.0,3,"Active: Avobenzone (2.5%), Homosalate (8%), Oc...",DM,3.455034,-1.201331
1,Bare Minerals (Bare Escentuals),Skinlongevity Vital Power Moisturizer Broad Sp...,36.0,4,Active Ingredients: Titanium Dioxide 3.28%; Zi...,DM,34.614559,-8.112528
2,Estee Lauder,Perfectionist Pro MultiDefense UV Fluid Broad ...,47.0,4,Active Ingredients: Titanium Dioxide 6.3%; Zin...,DM,27.416128,-6.382663
3,Clinique,For Men Super Energizer Anti-Fatigue Hydrating...,45.0,4,"Active: Avobenzone 3%, Octisalate 5%. Inactive...",DM,16.09358,13.575961
4,Vichy,Liftactiv Supreme Day,42.0,2,"Aqua/Water, Glycerin, Dimethicone, Rhamnose, I...",DM,-11.300848,10.473397
5,Biossance,Squalane + Mineral SPF 45,39.0,3,"Active Ingredients: Zinc Oxide 7%, Titanium Di...",DM,14.801427,-4.024587
6,Bare Minerals (Bare Escentuals),Complexion Rescue Defense Radiant Protective V...,39.0,4,"Active: Titanium Dioxide 3.28%, Zinc Oxide 12....",DM,34.639011,-8.111431
7,Clinique,Clinique Smart Broad Spectrum SPF 15 Custom-Re...,54.0,2,"Active Ingredients: Avobenzone 3.0%, Octisalat...",DM,2.150798,13.626324
8,Mary Kay,TimeWise Age Minimize 3D Day Cream SPF 30 Broa...,32.0,3,Active Ingredients: Avobenzone 3%; Homosalate ...,DM,-13.761141,16.39855
9,Mary Kay,TimeWise Age Minimize 3D Day Cream SPF 30 Broa...,32.0,3,Active Ingredients: Avobenzone 3%; Homosalate ...,DM,-13.763421,16.400072


## Part IV. Visualization 

In [1223]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
# Bokeh's Select is imported under an alias: the plain name `Select` is already
# selenium's Select, which extract_all_items looks up when it runs. Rebinding it
# here would break re-running the scraping cells.
from bokeh.models import ColumnDataSource, HoverTool, Select as BokehSelect
from ipywidgets import interact 

In [1224]:
# Configure Bokeh to display its output inside the notebook.
output_notebook()

In [1225]:
# Make the ColumnDataSource: source
source = ColumnDataSource( sub_cosm ) 

# Minimum and maximum of the first t-SNE coordinate: xmin, xmax
xmin, xmax = min(sub_cosm.X), max(sub_cosm.X)

# Minimum and maximum of the second t-SNE coordinate: ymin, ymax
ymin, ymax = min(sub_cosm.Y), max(sub_cosm.Y)

# Create the figure once, with axis labels, size and tools together.
# Previously a second figure(...) call replaced the first, so the labels and
# size were silently lost.
plot = figure(x_axis_label = 'T-SNE 1', y_axis_label = 'T-SNE 2',
              plot_height=400, plot_width=500,
              tools ='lasso_select, reset') # ,x_range=(xmin, xmax), y_range=(ymin, ymax))
# Add circle glyphs to the plot
plot.circle(x= 'X', y='Y', alpha = 0.8, source = source)
#plot.circle(x= 'X', y='Y',selection_color = 'red', nonselection_fill_alpha = 0.2, 
           # nonselection_fill_color = 'grey', source=source)

In [1226]:
# Tooltip rows shown when hovering over a point: (label, value template) pairs.
tooltip_rows = [
    ('brand', ' @brand_name'),
    ('name', ' @product_name'),
    ('price', ' $ @price'),
    ('rank', ' @rating'),
]
hover = HoverTool(tooltips=tooltip_rows)
plot.add_tools(hover)

In [1227]:
# Categories offered in the dropdown, and the callback that redraws the plot.
option_1 = ['Daytime Moisturizer','Nighttime Moisturizer','Eye Cream']
def update(Category = option_1[0]):
    '''
    Show only the products of the chosen category in the plot.

    Category : one of the entries of option_1; any value other than
               'Daytime Moisturizer' or 'Eye Cream' selects the 'NM' label.

    Replaces source.data with the matching rows of sub_cosm and pushes the
    change to the notebook.
    '''
    label_by_category = {'Daytime Moisturizer': 'DM', 'Eye Cream': 'EC'}
    opt = label_by_category.get(Category, 'NM')

    df = sub_cosm[sub_cosm['label'] == opt]
    shown_columns = ('X', 'Y', 'product_name', 'brand_name', 'price', 'rating')
    source.data = {column: df[column] for column in shown_columns}
    push_notebook()

# add a select 
#menu = Select(options = ['DM','NM','EC'], value='DM', title = 'Category')

#menu.on_change('value', callback)
#layout = column(menu, plot)
# Attach the callback to the 'value' property of slider
#slider.on_change('value', update_plot)

In [1228]:
# Show the plot first so a notebook handle exists, then attach the dropdown.
# interact() calls update() immediately, and update() calls push_notebook(),
# which needs a plot already shown with notebook_handle=True.
plot_handle = show(plot, notebook_handle = True)
interact(update,  Category = option_1);

interactive(children=(Dropdown(description='Category', options=('Daytime Moisturizer', 'Nighttime Moisturizer'…

In [582]:
#pattern4 = re.compile(r"\s*\(\w*\s*\w*\)\s*")
#pattern4.findall(string)
#string = re.sub(pattern4, '', string)

In [None]:
# https://www.paulaschoice.com/ingredient-dictionary?crefn1=name-first-letter&crefv1=B 
# ingredient glossary 
# //h2[@class="name ingredient-name"]