Fetch product ingredient data from `incidecoder.com`

In [62]:
import requests
from bs4 import BeautifulSoup
import re
import csv

# url="https://incidecoder.com/"

def fetch_webpage(search_term):

    # website search page URL
    url="https://incidecoder.com/"
    url_search = url+"search" 

    # "Benton serum" needs to look like- 'search?query=Benton+serum'
    search_term = search_term.replace(" ", "+")  
    query = {'query': search_term} 

    response = requests.get(url_search, params=query) # GET request

    if response: #check if request was successful
        print("Initial request successful for search page.")   
    else:
        raise Exception(f"Non-success status code: {response.status_code}")
    
    soup = BeautifulSoup(response.text, "html.parser") # parse the html document
    search_results = soup.find('div', class_="paddingbl") # target class for search results in this website
    # print(search_results)
    if search_results:
        search_result = search_results.find_next('a', class_="klavika simpletextlistitem") # select the first search result

    # find the relevant url portion
    match = re.search(r'href="(.*?)"', str(search_result))
    if match:
        url_ext = match.group(1)
        # print(url_ext)

    else:
        print("No match found :(")
        return None

    return url + url_ext

In [63]:
def fetch_data(search_term):

    url = fetch_webpage(search_term)

    if not url: #check if previous GET request was successful
        return
    
    response = requests.get(url)

    if response: #check if request was successful
        print("Second request successful for product page.")
        # print(response.text)
    else:
        raise Exception(f"Non-success status code: {response.status_code}")
    
    soup = BeautifulSoup(response.text, "html.parser") # parse the html document

    search_results = soup.find_all('meta') # target class for search results in this website
    search_result = search_results[2]["content"].split("explained:") # collect the ingredients list
    ingredients = search_result[1].split(', ')
    product_name = [search_result[0].rsplit(" ", 2)[0]] # collect the product name
    print("Product name: ", product_name[0])

    return product_name + ingredients
    

Here we build the dataset of product names, ingredients list and whether a product is good or bad for my skin. 

In [None]:
product_list = []
product_data = []
good_ingredients = ()
bad_ingredients = ()

def save_product_data(product_name, label):
    product_data = fetch_data(search_term=product_name) #="celimax dual")
    
    if not product_data:
        print("Please try again.")
        return

    # check if label is in correct format
    label = str.lower(label)
    print("Label: ", label)

    if label not in ['good', 'bad']:
        print('Label must be "good" or "bad".')
        return
    
    product_name = [product_data[0]]
    stored_product_names = []

    # save product names in a csv file if they aren't already recorded
    try:
        with open("product_names.csv", newline='') as names: # newline='' prevents extra blank rows
            reader = csv.reader(names, delimiter=',')
            for row in reader:
                if row:
                    stored_product_names.append(row)
            names.close()
    except FileNotFoundError:
            pass
            
    if product_name not in stored_product_names:
        with open("product_names.csv", 'a', newline='') as names: 
            writer = csv.writer(names, delimiter=',')
            writer.writerow(product_name)
            names.close()

        #write all product data in a csv file with product name, ingredients list and label in each row
        with open("product_data.csv", 'a', newline='') as data:
            writer = csv.writer(data, delimiter=',')
            writer.writerow(product_data + [label])
            data.close()

product_name = input("Please input the product name: ")
label = input("Was this product good or bad for you?: ") 
save_product_data(product_name, label)    
        


In [None]:
# separate the good/bad classes
label = str.lower(label)
good_ingredients

'dsad'

In [56]:
string = "Beauty of Joseon Beauty Of Josen Green Plum ingredients"
string.rsplit(" ",1)[0]

'Beauty of Joseon Beauty Of Josen Green Plum'

In [10]:
# print(response.text) # take a look at the response content encoded in a string


<!DOCTYPE html>
<html lang="en">
<head>
    <meta content="text/html; charset=utf-8" http-equiv="content-type">
    <meta name="viewport" content="initial-scale=1.0, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
    <title>Search results - Benton+serum</title>
    <meta name="description" content="See the search results for your query - Benton+serum." />
    <link rel="preload" as="font" href="https://incidecoder-assets.storage.googleapis.com/assets/css/fonts/klavika-regular-webfont.woff" type="font/woff" crossorigin="anonymous">
<link rel="preload" as="font" href="https://incidecoder-assets.storage.googleapis.com/assets/css/fonts/klavika-bold-webfont.woff" type="font/woff" crossorigin="anonymous"><link rel="stylesheet" href="https://incidecoder-assets.storage.googleapis.com/assets/bundles/css/all-styles.2d9aab924b7f1fd18861.css"><style nonce="cB_4u2C1wTbdiNbfxqI7ywA4XDl7FwgMRH_a022jyyE">
    @font-face {
        font-family: 'klavika_regular';
        src: url('https://inc

In [None]:
# print(response.text)

Request successful.

<!DOCTYPE html>
<html lang="en">
<head>
    <meta content="text/html; charset=utf-8" http-equiv="content-type">
    <meta name="viewport" content="initial-scale=1.0, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
    <title>Benton Bakuchiol Serum ingredients (Explained)</title>
    <meta name="description" content="Benton Bakuchiol Serum ingredients explained: Aqua (Water), Glycerin, Caprylic/Capric Triglyceride, Bakuchiol, Niacinamide, 1,2-Hexanediol, Coco-Caprylate/Caprate, Behenyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Psoralea Corylifolia Fruit Extract, Carbomer, Arginine, Artemisia Vulgaris Oil, Xanthan Gum, Citrus Aurantium Amara (Bitter Orange) Flower Extract, Ethylhexylglycerin, Allantoin, Adenosine, Tocopherol, Lavandula Angustifolia (Lavender) Oil, Litsea Cubeba Fruit Oil, Ribes Nigrum (Black Currant) Leaf Extract">
    <link rel="preload" as="font" href="https://incidecoder-assets.storage.googleapis.com/assets/css/fonts/klavika-regular-we

In [None]:
# soup = BeautifulSoup(response.text, "html.parser") # parse the html document
# search_results = soup.find_all('meta') # class for search results in this website
# search_result = search_results[2]["content"].split("explained:")
# ingredients = list(search_result[1].split(', '))
# ingredients 

[' Aqua (Water)',
 'Glycerin',
 'Caprylic/Capric Triglyceride',
 'Bakuchiol',
 'Niacinamide',
 '1,2-Hexanediol',
 'Coco-Caprylate/Caprate',
 'Behenyl Alcohol',
 'Cetearyl Olivate',
 'Sorbitan Olivate',
 'Psoralea Corylifolia Fruit Extract',
 'Carbomer',
 'Arginine',
 'Artemisia Vulgaris Oil',
 'Xanthan Gum',
 'Citrus Aurantium Amara (Bitter Orange) Flower Extract',
 'Ethylhexylglycerin',
 'Allantoin',
 'Adenosine',
 'Tocopherol',
 'Lavandula Angustifolia (Lavender) Oil',
 'Litsea Cubeba Fruit Oil',
 'Ribes Nigrum (Black Currant) Leaf Extract']

# code run start here

Now we read and process the CSV data we stored

In [1]:
import csv

products = {}

with open("product_data.csv", 'r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile) #created a csv.reader object

    for row in csv_reader:
        if not row:
            continue
        products[row[0]] = row[1:]

In [2]:
products

{'Celimax Dual Barrier Wearable Cream': [' Water',
  'Glycerin',
  'Caprylic/Capric Triglyceride',
  'Butylene Glycol',
  'Propanediol',
  'Stearic Acid',
  'Cetearyl Alcohol',
  'Methyl Trimethicone',
  'Caprylyl Methicone',
  '1,2-Hexanediol',
  'C12-16 Alcohols',
  'Panthenol',
  'Olea Europaea (Olive) Fruit Oil',
  'Dipropylene Glycol',
  'Allantoin',
  'Pantolactone',
  'Malt Extract',
  'Ceramide EOP',
  'Ceramide NS',
  'Ceramide NP',
  'Ceramide AS',
  'Ceramide AP',
  'Sodium Hyaluronate',
  'Heptasodium Hexacarboxymethyl Dipeptide-12',
  'Phytosphingosine',
  'Arachidic Acid',
  'Oleic Acid',
  'Palmitic Acid',
  'Glyceryl Stearate SE',
  'Hydrogenated Lecithin',
  'Xanthan Gum',
  'Carbomer',
  'Glucose',
  'Cetearyl Glucoside',
  'Ethylhexylglycerin',
  'Cholesterol',
  'Tocopherol',
  'Tromethamine',
  'Disodium EDTA',
  'good'],
 'Anua Heartleaf 77% Soothing Toner': [' Houttuynia Cordata Extract (77%)',
  'Water',
  '1,2-Hexanediol',
  'Glycerin',
  'Betaine',
  'Pantheno

In [15]:
good_products = [] 
bad_products = []
all_good_ingredients = []
all_bad_ingredients = []
all_good_ingredients_set = set()
all_bad_ingredients_set = set()
    

for key,val in products.items():
    products[key] = [str.lower(item) for item in val] # make product name lowercase

for key,val in products.items():
    if val[-1] == "good":
        good_products.append([key, val[:-1]])
        all_good_ingredients.extend(val[:-1])
        all_good_ingredients_set.update(val[:-1])
        all_good_ingredients_set.discard("1") # remove without raising error if it doesn't exist
   
    else:
        bad_products.append([key, val[:-1]])
        all_bad_ingredients.extend(val[:-1])
        all_bad_ingredients_set.update(val[:-1])
        all_bad_ingredients_set.discard("1") # remove without raising error if it doesn't exist



print("Number of products in this dataset:", len(products))
print("Number of good products:", len(good_products))
print("Number of bad products:", len(bad_products))
print("Number of all good ingredients:", len(all_good_ingredients_set))
print("Number of all bad ingredients:", len(all_bad_ingredients_set))

neutral_ingredients_set = all_bad_ingredients_set & all_good_ingredients_set # intersection
all_ingredients_set = all_bad_ingredients_set | all_good_ingredients_set # union

print("Total number of ingredients:", len(all_ingredients_set))
print("Number of neutral ingredients:", len(neutral_ingredients_set))
print("Number of exclusively bad ingredients:", len(all_bad_ingredients_set - neutral_ingredients_set))
print("Number of exclusively good ingredients:", len(all_good_ingredients_set - neutral_ingredients_set))



Number of products in this dataset: 107
Number of good products: 46
Number of bad products: 61
Number of all good ingredients: 584
Number of all bad ingredients: 758
Total number of ingredients: 1050
Number of neutral ingredients: 292
Number of exclusively bad ingredients: 466
Number of exclusively good ingredients: 292


# Data Cleaning

Check for duplicate products in the products list (not included in the app because new entries are first checked in the records to ensure duplicates are not added.)

In [5]:
product_names = []

with open("product_names.csv", 'r', newline='') as f:
    csv_reader = csv.reader(f)
    for row in csv_reader:
        product_names.append(row[0])

duplicates = []
for item in product_names:
    if product_names.count(item) > 1 and item not in duplicates:
        duplicates.append(item)

print(duplicates)

[]


Check for line breaks in products data csv file. Sometimes there is a new line due to formatting.

In [6]:
import csv

last_checked_line = 102

with open("product_names.csv", 'r', newline='') as f:
    csv_reader = csv.reader(f)
    names_count = sum(1 for row in csv_reader)

with open("product_data.csv", 'r', newline='') as f:
    csv_reader = csv.reader(f)
    data_count = sum(1 for row in csv_reader)

if names_count == data_count:
    print("No new line breaks!")
else:
    print("Check for a line break after line", last_checked_line,
          "in the product_data.csv file.")


No new line breaks!


Clean up ingredients list for repeats/ multiple formats for the same ingredients etc. There is no universal naming convention for ingredients so especially the extracts can have different formats.

In [7]:
all_ingredients_list = sorted(all_ingredients_set)
# all_ingredients_list.remove("1") #line 51
all_ingredients_list

[' *aloe barbadensis leaf extract',
 ' aloe barbadensis leaf extract',
 ' aloe barbadensis leaf water',
 ' anthemis nobilis flower extract',
 ' aqua',
 ' aqua (water)',
 ' aqua(water)',
 ' ascorbic acid (active vitamin c)',
 ' bifida ferment extract (100%)',
 ' bifida ferment lysate',
 ' bifida ferment lysate (42%)',
 ' centella asiatica extract (50.47%)',
 ' cetyl ethylhexanoate',
 ' citrus unshiu fruit extract',
 ' ethylhexyl palmitate',
 ' ficus carica (fig) fruit extract',
 ' galactomyces ferment filtrate',
 ' galactomyces ferment filtrate (60%)',
 ' glycine soja (soybean) oil',
 ' green tea extract',
 ' hippophae rhamnoides (sea buckthorn) fruit extract (86.14%)',
 ' houttuynia cordata extract',
 ' houttuynia cordata extract (77%)',
 ' morinda citrifolia fruit extract (57%)',
 ' morinda citrifolia fruit extract (80.1%)',
 ' oryza sativa (rice) bran water',
 ' oryza sativa (rice) bran water(68.6 %)',
 ' panax ginseng root extract',
 ' panax ginseng root water',
 ' pollen extract',


In [8]:
# the previous output only dsiaplays 1000 rows
all_ingredients_list[1000:]

['tremella fuciformis extract',
 'trideceth-10',
 'trideceth-6',
 'triethanolamine',
 'triethylhexanoin',
 'tripeptide-1',
 'tripeptide-2',
 'tripeptide-3',
 'trisodium edta',
 'trisodium ethylenediamine disuccinate',
 'triticum vulgare (wheat) seed extract',
 'triticum vulgare sprout extract',
 'tromethamine',
 'tropolone',
 'tuber magnatum extract',
 'turmeric extract',
 'tyrosine',
 'ubiquinone',
 'ulmus davidiana root extract',
 'undaria pinnatifida extract',
 'usnea barbata (lichen) extract',
 'vaccinium angustifolium (blueberry) fruit extract',
 'vaccinium angustifolium (blueberry) fruit extract (21%)',
 'vaccinium macrocarpon (cranberry) fruit extract',
 'vaccinium myrtillus fruit extract',
 'valine',
 'vegetable oil',
 'vigna radiata seed extract',
 'vinyl dimethicone',
 'vitamin c tetraisopalmitate',
 'vitamin e',
 'vitex agnus castus extract',
 'vitis vinifera (grape) fruit extract',
 'vitis vinifera (grape) seed oil',
 'vp/eicosene copolymer',
 'water',
 'white willow bark e

In [9]:
# replace the repeated ingredients listed/spelled differently
# format {'ingredient to be replaced': 'ingredient to replace'}

replace_dict = {' *aloe barbadensis leaf extract': 'aloe barbadensis leaf extract',
                ' aloe barbadensis leaf extract': 'aloe barbadensis leaf extract',
                ' aqua (water)': 'water',
                ' water': 'water',
                ' aqua(water)': 'water',
                'aqua': ' water',
                ' water (aqua)': 'water',
                ' water(aqua/eau)': 'water',
                ' water/aqua/eau': 'water',
                ' tremella mushroom extract': 'tremella fuciformis (mushroom) extract',
                'tremella fuciformis extract': 'tremella fuciformis (mushroom) extract',
                ' tremella fuciformis (mushroom) extract': 'tremella fuciformis (mushroom) extract',
                'anona cherimolia fruit extract': 'anthemis nobilis (chamomile) flower oil',
                'aqua (water)': 'water',
                'aqua(water)': 'water',
                'aronia melanocarpa Fruit Extract': 'aronia melanocarpa (black chokeberry) fruit extract',
                'bees wax': 'beeswax',
                'brassica oleracea italica extract': 'brassica oleracea italica (broccoli) extract',
                'camellia sinensis leaf extract': 'camellia sinensis (green tea) leaf extract',
                '*camellia sinensis leaf ext': 'camellia sinensis (green tea) leaf extract',
                'cucumis sativus fruit extract': 'cucumis sativus (cucumber) fruit extract',
                'curcuma longa turmeric root extract': 'curcuma longa (turmeric) root extract',
                'euterpe oleracea fruit extract': 'euterpe oleracea (acai) fruit extract',
                'fragrance(parfum)': 'fragrance (parfum)',
                'fragrance': 'fragrance (parfum)',
                'fragrance/parfum': 'fragrance (parfum)',
                'ginkgo biloba leaf extract': 'ginkgo biloba (ginkgo) leaf extract',
                'glycyrrhiza glabra root extract': 'glycyrrhiza glabra (licorice) root extract',
                'honey / mel / miel': 'honey',
                'hydrogenated poly(c6-14 olefin)': 'hydrogenated poly (c6-14 olefin)',
                'morus alba fruit extract': 'morus alba (white mulberry) fruit extract',
                'olea europaea fruit oil': 'olea europaea (olive) fruit oil',
                'oryza sativa bran oil': 'oryza sativa (rice) bran oil',
                'pelargonium graveolens flower oil': 'pelargonium graveolens (geranium) flower oil',
                'polygonum cuspidatum root extract': 'polygonum cuspidatum (japanese knotweed) root extract',
                'rosa damascena flower oil': 'rosa damascena (rose) flower oil',
                'rosmarinus officinalis leaf extract': 'rosmarinus officinalis (rosemary) extract',
                'saccharum officinarum (sugar cane) extract': 'saccharum officinarum (sugarcane) extract',
                'sugar cane extract': 'saccharum officinarum (sugarcane) extract',
                'sambucus nigra fruit extract': 'sambucus nigra (elder) fruit extract',
                'schisandra chinensis fruit extract': 'schisandra chinensis (schizandra berry) fruit extract',
                'scutellaria baicalensis root extract': 'scutellaria baicalensis (baikal skullcap) root extract',
                'simmondsia chinensis seed oil': 'simmondsia chinensis (jojoba) seed oil',
                'sunflower seed oil': 'helianthus annuus (sunflower) seed oil',
                'strawberry fruit extract': 'fragaria chiloensis (strawberry) fruit extract',
                'tea tree leaf oil': 'melaleuca alternifolia (tea tree) leaf oil',
                'theobroma cacao(cocoa) extract': 'theobroma cacao (cocoa) extract',
                'tocopherol': 'tocopherol (vitamin e)',
                'vitamin e': 'tocopherol (vitamin e)',
                ' bifida ferment lysate': 'bifida ferment lysate',
                ' cetyl ethylhexanoate': 'cetyl ethylhexanoate',
                ' houttuynia cordata extract': 'houttuynia cordata extract',
                ' panax ginseng root extract': 'panax ginseng root extract',
                ' propolis extract': 'propolis extract',
                ' snail secretion filtrate': 'snail secretion filtrate',
                '[hyaluronic acid]': 'hyaluronic acid',
                }

def remove_duplicates_set(ingredients_set, replace_dict):

    temp = []
    for item in ingredients_set:
        if item in replace_dict.keys():
            temp.append(replace_dict[item])
            # print(item)
    
    # print("length before: ", len(ingredients_set))
    # print("temp length: ", len(temp))

    ingredients_set.update(temp)
    ingredients_set = ingredients_set - set(replace_dict.keys())
    # print("length after: ", len(ingredients_set))
    return ingredients_set
    

In [10]:
all_bad_ingredients_set = remove_duplicates_set(all_bad_ingredients_set, replace_dict)
all_good_ingredients_set = remove_duplicates_set(all_good_ingredients_set, replace_dict)
all_ingredients_set = remove_duplicates_set(all_ingredients_set, replace_dict)

print("Updated all bad ingredients number: ", len(all_bad_ingredients_set))
print("Updated all good ingredients number: ", len(all_good_ingredients_set))
print("Updated all ingredients number: ", len(all_ingredients_set))

Updated all bad ingredients number:  729
Updated all good ingredients number:  565
Updated all ingredients number:  996


In [11]:
all_bad_ingredients_set

{' anthemis nobilis flower extract',
 ' aqua',
 ' ascorbic acid (active vitamin c)',
 ' bifida ferment lysate (42%)',
 ' centella asiatica extract (50.47%)',
 ' citrus unshiu fruit extract',
 ' ethylhexyl palmitate',
 ' ficus carica (fig) fruit extract',
 ' galactomyces ferment filtrate',
 ' glycine soja (soybean) oil',
 ' green tea extract',
 ' hippophae rhamnoides (sea buckthorn) fruit extract (86.14%)',
 ' morinda citrifolia fruit extract (57%)',
 ' morinda citrifolia fruit extract (80.1%)',
 ' pollen extract',
 ' purified water',
 ' rice extract',
 ' rubus fruticosus (blackberry) fruit extract (21%)',
 ' saccharomyces/xylinum/black tea ferment (60%)',
 ' salix alba (willow) bark water',
 ' vaccinium vitis-idaea fruit extract',
 '*momordica charantia fruit ext',
 '*sambucus nigra flower ext',
 '1,2-hexanediol',
 '1,3-propanediol',
 '2,3-butanediol',
 '2-hexanediol',
 '3-0-ethyl ascorbic acid',
 '3-methyl-1,3-butanediol',
 '3-o-ethyl ascorbic acid',
 'abies sibirica oil',
 'absolute 

In [None]:
from collections import Counter

print("Total number of good ingredients: ", len(all_good_ingredients))
print("Total number of bad ingredients: ", len(all_bad_ingredients))

all_good_ingredients_new = [replace_dict[ing] if ing in replace_dict.keys() else ing for ing in all_good_ingredients]
all_bad_ingredients_new = [replace_dict[ing] if ing in replace_dict.keys() else ing for ing in all_bad_ingredients]

good_ingredients_frequency = Counter(all_good_ingredients_new)
bad_ingredients_frequency = Counter(all_bad_ingredients_new)

Total number of good ingredients:  1457
Total number of bad ingredients:  2269


In [21]:
all_good_ingredients[0:10], all_good_ingredients_new[0:10]

([' water',
  'glycerin',
  'caprylic/capric triglyceride',
  'butylene glycol',
  'propanediol',
  'stearic acid',
  'cetearyl alcohol',
  'methyl trimethicone',
  'caprylyl methicone',
  '1,2-hexanediol'],
 ['water',
  'glycerin',
  'caprylic/capric triglyceride',
  'butylene glycol',
  'propanediol',
  'stearic acid',
  'cetearyl alcohol',
  'methyl trimethicone',
  'caprylyl methicone',
  '1,2-hexanediol'])