## Installation Reqs (Linux Ubuntu)
1. <code> python3 -m pip install playwright
playwright install </code>

In [132]:
## WEB SCRAPING CELL: 
from playwright.async_api import async_playwright
import asyncio

async def process_locator(locator):
    if await locator.is_visible():
        return await locator.inner_text()
    else:
        return "NA"
    

async def main():
   async with async_playwright() as pw:
       browser = await pw.chromium.launch(
           ##We'll employ the use of chromium for this webscraper
           ##Using a proxy creates HTTP errors.
          headless=False
      )

       #Beginning page: 
       page = await browser.new_page()
       await page.goto('https://world.openfoodfacts.org/')
       await page.wait_for_timeout(5000)
       result = []
       food_urls = []
       food_list = await page.query_selector_all('.list_product_a')
       for food in food_list:
           food_urls.append(await food.get_attribute('href'))
           
       for food_url in food_urls:
            food_info = {}
            await page.goto(food_url)
            #Title: 
            title = page.locator(".title-1")
            food_info['title'] = await process_locator(title)
            #Common Name:
            common_name = page.locator("#field_generic_name_value")
            food_info['common_name'] = await process_locator(common_name)
            #Quantity:
            quantity = page.locator("#field_quantity_value")
            food_info['quantity'] = await process_locator(quantity)
            #Packaging: 
            packaging = page.locator("#field_packaging_value")
            food_info['packaging'] = await process_locator(packaging)
            #Brands:
            brand = page.locator("#field_brands_value")
            food_info['brand'] = await process_locator(brand)
            #Categories:
            categories = page.locator("#field_categories_value")
            food_info['categories'] = await process_locator(categories)
            #Certifications:
            certifications = page.locator("#field_labels_value")
            food_info['certifications'] = await process_locator(certifications)
            #Origin:
            origin = page.locator("#field_origin_value")
            food_info['origin'] = await process_locator(origin)
            #origin of ingredients:
            origin_of_ingredients = page.locator("#field_origins_value")
            food_info['origin_of_ingredients'] = await process_locator(origin_of_ingredients)
            #Places of manufacturing:
            places_manufactured = page.locator("#field_manufacturing_places_value")
            food_info['places_manufactured'] = await process_locator(places_manufactured)
            #Stores:
            stores = page.locator("#field_stores_value")
            food_info['stores'] = await process_locator(stores)
            #Countries where Sold:
            countries_sold = page.locator("#field_countries_value")
            food_info['countries_sold'] = await process_locator(countries_sold)
           
            #HEALTH SECTION
            #Notice, because of the increasing complexity of the DOM elements in this area the CSS selectors don't follow a similarly nice pattern
            #Ingredients: 
            ingredients = page.locator("#panel_ingredients_content .panel_text")
            food_info['ingredients'] = await process_locator(ingredients)
            #NOVA score:
            nova_score = page.locator("ul#panel_nova li.accordion-navigation h4")
            food_info['nova_score'] = await process_locator(nova_score)
            #Palm Status:
            palm_status = page.locator(".accordion-navigation active .content panel_content active .panel_text")
            food_info['palm_status'] = await process_locator(palm_status)
            #Vegan Status:
            vegan_status = page.locator("#panel_ingredients_analysis_en-vegan_content .panel_text")
            food_info['vegan_status'] = await process_locator(vegan_status)
            #Vegetarian Status:
            vegetarian_status = page.locator("#panel_ingredients_analysis_en-vegetarian_content .panel_text")
            food_info['vegetarian_status'] = await process_locator(vegetarian_status)
            #Nutrition grade:
            nutrition_grade = page.locator(".accordion-navigation .grade_a_title")
            food_info['nutrition_grade'] = await process_locator(nutrition_grade)

            #NUTITRION FACTS
            #
            table_rows = await page.query_selector_all("#panel_nutrition_facts_table_content")
            nutrition_facts = {}
            for row in table_rows: 
                columns = await row.query_selector_all('td')
                name = await process_locator(columns[0])
                value_per_100g = await process_locator(columns[1])
                nutrition_facts[name] = {
                    "100g/100ml": value_per_100g
                }
                    
                    
            food_info['nutrition_table'] = nutrition_facts
            result.append(food_info)
            break
       
       
       

       
           
           
           
       await browser.close()
       print(result)
if __name__ == '__main__':
   await main()

#Problems & Changes:
#

#CITATIONs: 
#Code cited from OxyLabs: https://github.com/oxylabs/playwright-web-scraping?tab=readme-ov-file
#,https://playwright.dev/python/docs/locators

[{'title': 'Eau de Source - Cristaline - 1,5\xa0L', 'common_name': 'Spring water', 'quantity': '1,5 L', 'packaging': 'Aluminium-can, HdpeFilm-packet, PpFilm-wrapper, Ldpe-film', 'brand': 'Cristaline', 'categories': 'Beverages, Waters, Spring waters', 'certifications': 'Triman\n', 'origin': 'Embouteillée à 24610 Saint-Martin de Gurson France', 'origin_of_ingredients': 'France, fr:Saint-Martin de Gurson', 'places_manufactured': 'Saint-Martin de Gurson, France, 24610', 'stores': 'Carrefour, Leclerc, Auchan, Intermarché, Super U, E.Leclerc', 'countries_sold': "Belgium, Côte d'Ivoire, France, Germany, Guadeloupe, Italy, Luxembourg, Mali, Martinique, New Caledonia, Switzerland, United Kingdom", 'ingredients': 'water', 'nova_score': 'Unprocessed or minimally processed foods', 'palm_status': 'NA', 'vegan_status': 'NA', 'vegetarian_status': 'NA', 'nutrition_grade': 'Very good nutritional quality', 'nutrition_table': {'Energy': {'100g/100ml': '0 kj\n(0 kcal)'}}}]
