In [62]:
import csv
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# url="https://incidecoder.com/"

def fetch_webpage(search_term):
    """Search incidecoder.com and return the URL of the first product hit.

    Args:
        search_term: Free-text product query, e.g. "Benton serum".

    Returns:
        The absolute URL of the first search result, or None when the results
        page contains no product link.

    Raises:
        Exception: If the search request returns a non-success status code.
    """
    # website search page URL
    url = "https://incidecoder.com/"
    url_search = urljoin(url, "search")

    # requests URL-encodes the params itself (a space becomes '+'), so the raw
    # term is passed through unchanged. Replacing spaces with '+' beforehand
    # would get the '+' percent-encoded and the site would search for the
    # literal text "Benton+serum".
    query = {'query': search_term}

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url_search, params=query, timeout=10)  # GET request

    if response:  # check if request was successful
        print("Initial request successful for search page.")
    else:
        raise Exception(f"Non-success status code: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")  # parse the html document
    search_results = soup.find('div', class_="paddingbl")  # target class for search results in this website

    # Default to None so a page without the results container falls through to
    # the "no match" branch instead of raising on an unbound variable.
    search_result = None
    if search_results is not None:
        # select the first search result
        search_result = search_results.find_next('a', class_="klavika simpletextlistitem")

    # Read the link target straight from the tag attribute.
    url_ext = search_result.get("href") if search_result is not None else None
    if not url_ext:
        print("No match found :(")
        return None

    # urljoin copes with both "/products/x" and "products/x" without doubling
    # the slash between host and path.
    return urljoin(url, url_ext)

In [63]:
def fetch_data(search_term):
    """Fetch a product's name and ingredients list from its incidecoder.com page.

    The product page is located with fetch_webpage(). Name and ingredients are
    parsed from the page's description meta tag, whose content has the form
    "<product name> ingredients explained: <ingredient>, <ingredient>, ...".

    Args:
        search_term: Free-text product query forwarded to fetch_webpage().

    Returns:
        A list whose first element is the product name and whose remaining
        elements are the ingredient names, or None when no product page was
        found or the description could not be parsed.

    Raises:
        Exception: If the product page request returns a non-success status code.
    """
    url = fetch_webpage(search_term)

    if not url:  # the search produced no product page
        return None

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)

    if response:  # check if request was successful
        print("Second request successful for product page.")
    else:
        raise Exception(f"Non-success status code: {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")  # parse the html document

    # Select the description by its name attribute rather than by its position
    # among all <meta> tags, so an extra tag in <head> cannot shift the index.
    description = soup.find('meta', attrs={'name': 'description'})
    content = description.get('content', '') if description is not None else ''

    head, marker, tail = content.partition("explained:")
    if not marker:
        print("Could not find an ingredients list on the product page.")
        return None

    # Strip each entry: the text after "explained:" starts with a space, which
    # would otherwise end up glued to the first ingredient.
    ingredients = [item.strip() for item in tail.split(', ') if item.strip()]

    # The text before the marker is "<product name> ingredients ".
    name = head.strip()
    suffix = "ingredients"
    if name.endswith(suffix):
        name = name[:-len(suffix)].rstrip()
    product_name = [name]  # collect the product name
    print("Product name: ", product_name[0])

    return product_name + ingredients
    

Here we build the dataset: for each product we record its name, its ingredients list, and a label saying whether it was good or bad for my skin.

In [None]:
# Module-level containers for this section; every one of them starts out empty.
product_list, product_data = [], []
good_ingredients = bad_ingredients = ()

def save_product_data(product_name, label):
    """Look up a product and append its labelled ingredients to the CSV dataset.

    The product name is appended to "product_names.csv" and the full row
    (name, ingredients..., label) to "product_data.csv". A product whose name
    is already listed in "product_names.csv" is not written again.

    Args:
        product_name: Free-text product query passed to fetch_data().
        label: "good" or "bad"; case and surrounding whitespace are ignored.

    Returns:
        None. Progress and problems are reported with print().
    """
    # Validate the label first: it is a purely local check, so a typo should
    # not cost two web requests before being rejected.
    label = label.strip().lower()
    print("Label: ", label)

    if label not in ('good', 'bad'):
        print('Label must be "good" or "bad".')
        return

    product_data = fetch_data(search_term=product_name)

    if not product_data:
        print("Please try again.")
        return

    # One-element row holding the name as reported by the product page.
    name_row = [product_data[0]]

    # Collect the names that are already recorded (blank rows are ignored).
    stored_product_names = []
    try:
        with open("product_names.csv", newline='') as names:  # newline='' prevents extra blank rows
            stored_product_names = [row for row in csv.reader(names, delimiter=',') if row]
    except FileNotFoundError:
        pass  # first run: nothing has been recorded yet

    if name_row in stored_product_names:
        print("Product already recorded; skipping.")
        return

    # The with-blocks close the files themselves; no explicit close() needed.
    with open("product_names.csv", 'a', newline='') as names:
        csv.writer(names, delimiter=',').writerow(name_row)

    # write all product data in a csv file with product name, ingredients list and label in each row
    with open("product_data.csv", 'a', newline='') as data:
        csv.writer(data, delimiter=',').writerow(product_data + [label])

# Ask for the product and how it worked out, then record the answer.
product_name = input("Please input the product name: ")
label = input("Was this product good or bad for you?: ")
save_product_data(product_name=product_name, label=label)
        


Initial request successful for search page.
Second request successful for product page.
Product name:  PAPA RECIPE Blemish Cream
Label:  good


In [None]:
# Separate the good/bad classes: normalise the label's case, then show the
# current contents of good_ingredients as the cell output.
label = label.lower()
good_ingredients

'dsad'

In [56]:
# Scratch check: cut the sample title at its last space to drop the final word.
string = "Beauty of Joseon Beauty Of Josen Green Plum ingredients"
string.rpartition(" ")[0]

'Beauty of Joseon Beauty Of Josen Green Plum'

In [10]:
# print(response.text) # take a look at the response content encoded in a string


<!DOCTYPE html>
<html lang="en">
<head>
    <meta content="text/html; charset=utf-8" http-equiv="content-type">
    <meta name="viewport" content="initial-scale=1.0, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
    <title>Search results - Benton+serum</title>
    <meta name="description" content="See the search results for your query - Benton+serum." />
    <link rel="preload" as="font" href="https://incidecoder-assets.storage.googleapis.com/assets/css/fonts/klavika-regular-webfont.woff" type="font/woff" crossorigin="anonymous">
<link rel="preload" as="font" href="https://incidecoder-assets.storage.googleapis.com/assets/css/fonts/klavika-bold-webfont.woff" type="font/woff" crossorigin="anonymous"><link rel="stylesheet" href="https://incidecoder-assets.storage.googleapis.com/assets/bundles/css/all-styles.2d9aab924b7f1fd18861.css"><style nonce="cB_4u2C1wTbdiNbfxqI7ywA4XDl7FwgMRH_a022jyyE">
    @font-face {
        font-family: 'klavika_regular';
        src: url('https://inc

In [None]:
# print(response.text)

Request successful.

<!DOCTYPE html>
<html lang="en">
<head>
    <meta content="text/html; charset=utf-8" http-equiv="content-type">
    <meta name="viewport" content="initial-scale=1.0, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
    <title>Benton Bakuchiol Serum ingredients (Explained)</title>
    <meta name="description" content="Benton Bakuchiol Serum ingredients explained: Aqua (Water), Glycerin, Caprylic/Capric Triglyceride, Bakuchiol, Niacinamide, 1,2-Hexanediol, Coco-Caprylate/Caprate, Behenyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Psoralea Corylifolia Fruit Extract, Carbomer, Arginine, Artemisia Vulgaris Oil, Xanthan Gum, Citrus Aurantium Amara (Bitter Orange) Flower Extract, Ethylhexylglycerin, Allantoin, Adenosine, Tocopherol, Lavandula Angustifolia (Lavender) Oil, Litsea Cubeba Fruit Oil, Ribes Nigrum (Black Currant) Leaf Extract">
    <link rel="preload" as="font" href="https://incidecoder-assets.storage.googleapis.com/assets/css/fonts/klavika-regular-we

In [None]:
# soup = BeautifulSoup(response.text, "html.parser") # parse the html document
# search_results = soup.find_all('meta') # class for search results in this website
# search_result = search_results[2]["content"].split("explained:")
# ingredients = list(search_result[1].split(', '))
# ingredients 

[' Aqua (Water)',
 'Glycerin',
 'Caprylic/Capric Triglyceride',
 'Bakuchiol',
 'Niacinamide',
 '1,2-Hexanediol',
 'Coco-Caprylate/Caprate',
 'Behenyl Alcohol',
 'Cetearyl Olivate',
 'Sorbitan Olivate',
 'Psoralea Corylifolia Fruit Extract',
 'Carbomer',
 'Arginine',
 'Artemisia Vulgaris Oil',
 'Xanthan Gum',
 'Citrus Aurantium Amara (Bitter Orange) Flower Extract',
 'Ethylhexylglycerin',
 'Allantoin',
 'Adenosine',
 'Tocopherol',
 'Lavandula Angustifolia (Lavender) Oil',
 'Litsea Cubeba Fruit Oil',
 'Ribes Nigrum (Black Currant) Leaf Extract']