# Understanding Your Skin Care Product Ingredients 

## Part I.  Import Modules 

In [5]:
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException

In [6]:
import statistics  as stat
import pandas as pd
import numpy as np 
import re
from sklearn.manifold import TSNE

In [7]:
# Base directory used to locate the browser driver. sys.path[0] can be the
# empty string in interactive sessions, which would turn the driver path into
# '/chromedriver' (filesystem root); fall back to the current directory then.
root = sys.path[0] or '.'

Notice: Please download the correct driver for your browser. [Click to see the guide on driver installation](https://selenium-python.readthedocs.io/installation.html)

In [4]:
# Location of the ChromeDriver executable, expected directly under `root`.
chrome_path = ''.join((root, '/chromedriver'))
# To check that the webdriver is installed correctly, uncomment:
#driver_test = webdriver.Chrome(executable_path = chrome_path)

## Part II.  Web Scraping 

#### 1. Find information on each product's name, brand, price, rating, and ingredients

##### class scrape_by_category
- class scrape_by_category has instance variables: 
    - brandname, productname, price, rating, ingredient - information for each product 
    - suburl - the URL of each product's detail page 
- define methods 
    - `__init__`(self, driver, tck, sec, n_scroll)
        - 1st arg driver - the Selenium browser driver to use       
        - 2nd arg tck - specifies the category you want to scrape 
        - 3rd arg sec - specifies the number of seconds to pause the program between page actions; reducing it decreases 
          processing time 
        - 4th arg n_scroll - specifies how many times to scroll down each page; reducing it decreases 
          processing time 
        - brandname, productname, suburl, price, rating, ingredient are all initialized as empty lists 
    - `scrollDown`(self)
        - scroll down the page the number of times defined by the user
    - `extractinfo`(self)
        - for each category page, extract each product's general info (brand, name, detail-page URL)  
    - `extract_all_items`(self)
        - open the category listing and collect the general info from each listing page 
    - `product_details_extract`(self)
        - extract detailed information (price, rating, ingredients) from each product's own page 

In [21]:
class scrape_by_category:
    """Scrape one Beautypedia skin-care category with a Selenium driver.

    The listing pages provide brand names, product names and detail-page
    URLs; each detail page is then visited for price, rating and ingredients.
    All results accumulate in the instance's list attributes.
    """

    def __init__(self, driver, tck, sec, n_scroll, max_pages=3):
        """Store the scraping configuration and initialise empty result lists.

        Args:
            driver: Selenium WebDriver used for all page access.
            tck: category path inserted into the listing URL.
            sec: seconds to sleep after each scroll / page action.
            n_scroll: scroll setting for each listing page (see scrollDown).
            max_pages: maximum number of listing pages to read; ``None``
                reads every page offered by the page selector. Defaults to 3,
                the cap that was previously hard-coded.
        """
        self.driver = driver
        self.sec = sec
        self.tck = tck
        self.brandname, self.productname, self.suburl = [], [], []
        self.price, self.rating, self.ingredient = [], [], []
        self.n_scroll = n_scroll
        self.max_pages = max_pages
        self.url_open = 'https://www.beautypedia.com/skin-care/' + tck + '/?size=96'

    def scrollDown(self):
        """Send PAGE_DOWN to the page body, sleeping ``sec`` after each press.

        PAGE_DOWN is pressed ``n_scroll + 1`` times. A local counter is used
        so ``self.n_scroll`` is not consumed: previously the attribute was
        decremented in place, so only the first listing page was ever scrolled.
        """
        body = self.driver.find_element_by_tag_name("body")
        remaining = self.n_scroll
        while remaining >= 0:
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(self.sec)
            remaining -= 1

    def extractinfo(self):
        """Append brand, product name and detail URL of the current page.

        Returns:
            The accumulated (brandname, productname, suburl) lists.
        """
        for elem in self.driver.find_elements_by_xpath('//a[@class="review-brand"]'):
            self.brandname.append(elem.text)

        for elem in self.driver.find_elements_by_xpath('//a[@class="review-product"]'):
            self.productname.append(elem.text)
            self.suburl.append(elem.get_attribute('href'))

        return self.brandname, self.productname, self.suburl

    def extract_all_items(self):
        """Walk the category's listing pages and collect general product info."""
        # open the first page
        self.driver.get(self.url_open)
        self.scrollDown()
        self.extractinfo()

        # the page selector has one option per listing page
        sel_num = Select(self.driver.find_element_by_xpath('//div[@class="page-number"]/select'))
        num_pages = len(sel_num.options)

        # cap the number of pages to keep processing time manageable
        if self.max_pages is not None:
            num_pages = min(num_pages, self.max_pages)

        # from 2nd page to last page
        click_next_xpath = '//div[@class="page-buttons"]/a[@class="page-button next-page"]'
        for _ in range(num_pages - 1):
            self.driver.find_element_by_xpath(click_next_xpath).click()
            time.sleep(self.sec)
            self.scrollDown()
            self.extractinfo()
            time.sleep(self.sec)

    def product_details_extract(self):
        """Collect listing info, then visit every product page for details.

        Returns:
            (brandname, productname, price, rating, ingredient) lists.

        Raises:
            NoSuchElementException: if a product page has no price or rating
                element (only a missing ingredients section is tolerated).
        """
        # call the function to get suburls
        self.extract_all_items()

        # define xpaths
        price_xpath = '//div[@class="pricing-info"]/span[@class="price"]'
        rating_xpath = '//div[@class="expert-rating"]/span'
        expand_xpath = '//h3[@class="tab-title ingredients"]'
        ingredients_xpath = '//div[@class="ingredients"]/div'

        for url in self.suburl:
            # access the url
            self.driver.get(url)
            time.sleep(self.sec)
            # extract price info
            self.price.append(self.driver.find_element_by_xpath(price_xpath).text)
            # the rating is taken from the last character of the span's class attribute
            rating_class = self.driver.find_element_by_xpath(rating_xpath).get_attribute("class")
            self.rating.append(rating_class[-1])
            # find the expand button, click and extract ingredients info
            try:
                btn = self.driver.find_element_by_xpath(expand_xpath)
                # click through JavaScript rather than WebElement.click()
                self.driver.execute_script("arguments[0].click();", btn)
                time.sleep(self.sec)
                self.ingredient.append(self.driver.find_element_by_xpath(ingredients_xpath).text)
            except NoSuchElementException:
                self.ingredient.append('No Info')
        return self.brandname, self.productname, self.price, self.rating, self.ingredient

In [6]:
# URL path fragments of the skin-care categories to scrape:
# daytime moisturizers, nighttime moisturizers, and eye creams.
ticker = [
    'moisturizer/daytime-moisturizer-reviews',
    'moisturizer/nighttime-moisturizer-reviews',
    'eyes/eye-cream-and-treatment-reviews',
]

In [7]:
# Launch Chrome and scrape the first category (sec=2, n_scroll=8), timing the run.
browser = webdriver.Chrome(executable_path=chrome_path)
dm = scrape_by_category(driver=browser, tck=ticker[0], sec=2, n_scroll=8)
t0 = time.time()
(brand_names_1,
 product_names_1,
 price_1,
 rating_1,
 ingred_1) = dm.product_details_extract()
print("Finished in {} seconds".format(time.time() - t0))

Finished in 5108.163042068481 seconds


Notice: After running the above block, I noticed that it takes more than 1 hour to extract information for about 500 products; therefore, when dealing with nighttime moisturizers and eye creams, I used only 3*96 items and reduced the number of scrolls from 8 to 5 and then to 2 to further reduce processing time. 

In [26]:
# browser = webdriver.Chrome(executable_path = chrome_path)

In [22]:
# Scrape the second category with the already-open browser (sec=2, n_scroll=5).
nm = scrape_by_category(driver=browser, tck=ticker[1], sec=2, n_scroll=5)
t0 = time.time()
(brand_names_2,
 product_names_2,
 price_2,
 rating_2,
 ingred_2) = nm.product_details_extract()
print("Finished in {} seconds".format(time.time() - t0))

Finished in 3165.4116501808167 seconds


In [28]:
# Scrape the third category with the already-open browser (sec=2, n_scroll=2).
ec = scrape_by_category(driver=browser, tck=ticker[2], sec=2, n_scroll=2)
t0 = time.time()
(brand_names_3,
 product_names_3,
 price_3,
 rating_3,
 ingred_3) = ec.product_details_extract()
print("Finished in {} seconds".format(time.time() - t0))

Finished in 3001.537071943283 seconds


In [29]:
# quit() closes every window AND ends the WebDriver session; close() only
# closes the current window and can leave the chromedriver process running.
browser.quit()

In [30]:
# Number of products collected per category (daytime, nighttime, eye cream).
tuple(map(len, (brand_names_1, brand_names_2, brand_names_3)))

(411, 288, 288)

In [48]:
# Assemble one DataFrame per category, tag it with its category label,
# then stack the three frames and write the combined table to disk.
df_1 = pd.DataFrame(
    np.column_stack([brand_names_1, product_names_1, price_1, rating_1, ingred_1])
).assign(label='DM')

df_2 = pd.DataFrame(
    np.column_stack([brand_names_2, product_names_2, price_2, rating_2, ingred_2])
).assign(label='NM')

df_3 = pd.DataFrame(
    np.column_stack([brand_names_3, product_names_3, price_3, rating_3, ingred_3])
).assign(label='EC')

cosmetic = pd.concat([df_1, df_2, df_3], axis=0)
cosmetic.columns = ['brand_name', 'product_name', 'price', 'rating', 'ingredients', 'label']
cosmetic.to_csv('cosmetic.csv', index=False, encoding='utf-8')

## Part III. Preprocessing 

In [8]:
# Reload the saved scrape results and preview ten randomly sampled rows.
cosm = pd.read_csv(filepath_or_buffer='cosmetic.csv')
#cosm.shape
display(cosm.sample(n=10))

Unnamed: 0,brand_name,product_name,price,rating,ingredients,label
631,Laneige,Water Bank Gel Cream,32.0,2,"Water, Butylene Glycol, Betaine, Glycerin, Dim...",NM
696,Clarins,ClarinsMen Line-Control Cream,53.0,2,"Aqua/Water/Eau, Isononyl Isononanoate, Cyclome...",NM
657,Garnier Nutritioniste,Ultra-Lift Miracle Sleeping Cream Anti-Age + A...,17.99,1,"Water, Dimethicone, Glycerin, Simmondsia Chine...",NM
647,philosophy,renewed hope in a jar,47.0,2,"Aqua/Water/Eau, Cyclopentasiloxane, Stearic Ac...",NM
232,BeautiControl,BC Spa Facial Defend & Restore Moisture Lotion...,34.0,4,"Active Ingredient: Titanium Dioxide, Other Ing...",DM
785,Beautycounter,Any Time Eye Cream,32.0,2,"Water (Aqua), Caprylic/Capric Triglyceride, Al...",EC
668,Nuance Salma Hayek,Age Affirm Firming Day & Night Cream,21.99,1,"Water( Aqua), Glycerin, Dimethicone, Caprylic/...",NM
129,Kate Somerville,Daily Deflector Waterlight Broad Spectrum 50+ ...,48.0,5,"Active ingredients: Titanium Dioxide 8.3%, Zin...",DM
643,Estee Lauder,Enlighten Even Skintone Correcting Crème,55.0,2,"WaterAquaEau, Phenyl Trimethicone, Butylene Gl...",NM
744,Lancome,Energie de Vie Eye Illuminating & Cooling Anti...,39.0,1,"Water, Glycerin, Alcohol Denat., Niacinamide, ...",EC


In [9]:
# Drop products whose name occurs more than once. NOTE: keep=False removes
# every copy of a duplicated name, not just the extra occurrences.
cosm.drop_duplicates(subset="product_name", keep=False, inplace=True)
# Drop products with no ingredients info: both real missing values and the
# 'No Info' placeholder the scraper stores when a product page has no
# ingredients section (dropna alone does not catch that string).
cosm = cosm.dropna(how='any', subset=['ingredients'])
cosm = cosm[cosm['ingredients'] != 'No Info']

In [10]:
# Only focus analysis on products priced at or above their group's median price.
# Median price per label, in order of first appearance of each label.
med_price_dict = cosm.groupby('label', sort=False)['price'].median().to_dict()
# Build every per-label subset first and concatenate once: seeding pd.concat
# with an empty DataFrame turns the columns into object dtype (and warns on
# recent pandas), and re-concatenating inside a loop copies the data each time.
above_median = [
    cosm[(cosm.label == k) & (cosm.price >= med)]
    for k, med in med_price_dict.items()
]
# pd.concat raises on an empty list, so fall back to an empty slice of cosm.
sub_cosm = pd.concat(above_median, axis=0) if above_median else cosm.iloc[0:0]
sub_cosm = sub_cosm.reset_index(drop=True)

In [11]:
# (rows, columns) of the filtered product table
sub_cosm.shape

(503, 6)

In [12]:
# Clean the ingredients text. The substitutions below are applied to every
# ingredients string in the listed order; each entry is (pattern, replacement).
_cleanup_steps = [
    (re.compile(r"\s*\([^\)]+\)"), ''),        # parentheses and the text inside them
    (re.compile(r"\w+\s*\w+:\s"), ''),         # headings such as "Active Ingredients: "
    (re.compile(r"\w+\.*\w*\s*%"), ''),        # percentages
    (re.compile(r";"), ', '),                  # semicolon -> comma separator
    (re.compile(r"\.{1}\s+"), ', '),           # period between words -> comma separator
    (re.compile(r"\.$"), ''),                  # period at the end of the list
    (re.compile(r"\\\s{1}|\/\s{1}"), ''),      # slash followed by one whitespace
    (re.compile(r"\\|\/"), ' '),               # any remaining slash -> space
]

ingrednt = []
for raw_text in sub_cosm['ingredients']:
    cleaned = raw_text
    for regex, replacement in _cleanup_steps:
        cleaned = regex.sub(replacement, cleaned)
    ingrednt.append(cleaned)

In [13]:
# ingredient_idx maps each distinct ingredient to a column index (assigned in
# order of first appearance); corpus holds one token list per product.
ingredient_idx = {}
corpus = []

# Tokenize every cleaned ingredients string
for ingredients in ingrednt:
    # split on commas, lower-case and trim surrounding whitespace
    pieces = (piece.lower().strip(' \t\n\r') for piece in ingredients.split(','))
    # keep only tokens longer than one character
    tokens = [piece for piece in pieces if len(piece) > 1]
    corpus.append(tokens)

    for ingredient in tokens:
        # a new ingredient receives the next free index; known ones are untouched
        ingredient_idx.setdefault(ingredient, len(ingredient_idx))

# next free index, i.e. the number of distinct ingredients seen
idx = len(ingredient_idx)

#sorted (ingredient_idx)

In [14]:
# Dimensions of the product-by-ingredient matrix
M = sub_cosm.shape[0]      # M: number of cosmetics (rows)
N = len(ingredient_idx)    # N: number of distinct ingredients (columns)
# Start from an all-zero M x N matrix
A = np.zeros((M, N))

In [15]:
# Define the oh_encoder function
def oh_encoder(tokens):
    """Return a length-N vector with 1 at the index of every ingredient in tokens.

    Indices come from the module-level ``ingredient_idx`` mapping; a token
    missing from that mapping raises KeyError.
    """
    encoded = np.zeros(N)
    # Look up all column positions first, then set them in a single step
    positions = [ingredient_idx[name] for name in tokens]
    encoded[np.asarray(positions, dtype=int)] = 1
    return encoded

In [16]:
# Build the document-term matrix:
# one row per cosmetic, one column per ingredient
for row_number, tokens in enumerate(corpus):
    A[row_number, :] = oh_encoder(tokens)
# keep the row counter at the number of rows filled
i = len(corpus)

In [17]:
# Reduce the ingredient matrix to two dimensions with t-SNE
# (n_components = 2 so the result can be drawn as a 2D plot later)
model = TSNE(n_components=2, learning_rate=200)
tsne_features = model.fit_transform(A)
# Store the two embedding coordinates as the X and Y columns
sub_cosm['X'], sub_cosm['Y'] = tsne_features[:, 0], tsne_features[:, 1]

## Part IV. Visualization 

In [18]:
from bokeh.models import ColumnDataSource, Select, HoverTool
from bokeh.models.widgets import CheckboxGroup
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.layouts import layout, widgetbox, column, row
from bokeh.models import Panel
from bokeh.models.widgets import Tabs

In [19]:
# Configure Bokeh to render its output inline in the notebook
output_notebook()

In [20]:
def bkapp(doc):
    """Build the interactive t-SNE scatter plot and add it to ``doc``.

    The plot shows the products of ``sub_cosm`` at their X/Y coordinates,
    filtered by the category chosen in the dropdown and the ratings ticked in
    the checkbox group. The initial view is filtered with the widgets' default
    values so that the plot and the controls agree from the start.

    Args:
        doc: the Bokeh document the finished layout is added to.
    """
    # Dropdown text -> value of the `label` column
    category_labels = {
        'Daytime Moisturizer': 'DM',
        'Nighttime Moisturizer': 'NM',
        'Eye Cream': 'EC',
    }

    # Checkbox labels: the distinct ratings in ascending order, as strings
    rating_lst = [str(r) for r in sorted(sub_cosm.rating.unique())]

    cat_select = Select(options=list(category_labels),
                        value='Daytime Moisturizer',
                        title='Category')
    # Tick the first two ratings by default, without indexing past the labels
    checkbox_group = CheckboxGroup(labels=rating_lst,
                                   active=[i for i in (0, 1) if i < len(rating_lst)])

    def current_selection():
        """Return the rows of sub_cosm matching the current widget state."""
        # any value other than the daytime / eye-cream entries means 'NM'
        label = category_labels.get(cat_select.value, 'NM')
        selected_rating = [int(checkbox_group.labels[i])
                           for i in checkbox_group.active]
        return sub_cosm[(sub_cosm.label == label)
                        & (sub_cosm.rating.isin(selected_rating))]

    # Make the ColumnDataSource: start from the widgets' default selection
    source = ColumnDataSource(data=ColumnDataSource.from_df(current_selection()))
    plot = figure(x_axis_label='T-SNE 1', y_axis_label='T-SNE 2',
                  plot_height=400, plot_width=500)
    plot.circle(x='X', y='Y', alpha=0.8, source=source)

    # add extra information shown when hovering over a point
    hover = HoverTool(tooltips=[
        ('brand', ' @brand_name'),
        ('name', ' @product_name'),
        ('price', ' $ @price'),
        ('rank', ' @rating')])
    plot.add_tools(hover)

    def update(attr, old, new):
        """Widget callback: re-filter the data shown in the plot."""
        source.data = ColumnDataSource.from_df(current_selection())

    cat_select.on_change('value', update)
    checkbox_group.on_change('active', update)

    controls = widgetbox(cat_select, checkbox_group)

    # Create a row layout (named so it does not shadow the imported `layout`)
    page_layout = row(controls, plot)

    # Make a tab with the layout
    tab = Panel(child=page_layout, title='Select Product Type and Ratings')
    tabs = Tabs(tabs=[tab])
    doc.add_root(tabs)

In [22]:
show(bkapp, notebook_url="http://localhost:8890")  # if this raises an error, change notebook_url to your notebook server's address