## Notebook Content

In this notebook, we scrape [__Trader Joe's__](https://www.traderjoes.com/home), an online grocery website, and store its food data so that it can later be used to predict calories from nutritional content.

## Import Libraries

In [5]:
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np

## WebDriver Initialization

In [6]:
# Amjad's machine: local path to the chromedriver executable.
# Run only the path cell that matches your machine; a later assignment to `path` replaces this one.
path = "/Users/amjad/Downloads/chromedriver"

In [7]:
# Shahad's machine: local path to the chromedriver executable.
# NOTE: this rebinds the same `path` variable as the Amjad cell, so whichever cell ran last wins.
path = "/Users/shahadsulaiman/Downloads/chromedriver"

In [8]:
# Path to the chromedriver executable used by every webdriver.Chrome(...) call below.
# The CHROMEDRIVER_PATH environment variable, when set, takes precedence over the
# machine-specific `path` chosen above, so the notebook can run on another machine
# without editing the hard-coded paths.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", path)
os.environ["webdriver.chrome.driver"] = chromedriver

## Convert String to Float

In [12]:
def returnFloat(str_):
    """
    Convert a scraped text value to a number by keeping only its numeric characters.

    Digits and '.' are concatenated in their original order and every other
    character is dropped, so '2.5 g' becomes 2.5 and '1,000mg' becomes 1000.0.

    Parameters
    ----------
    str_ : str or None
        Raw text of a table cell, e.g. '12g' or '0.5 g.'.

    Returns
    -------
    float or int
        np.nan when ``str_`` is empty or None, 0 when it contains no usable
        number, otherwise the parsed float.
    """
    import re  # local import: only needed for the malformed-number fallback below

    if str_ is None or str_ == '':
        return np.nan
    res_text = ''.join(char for char in str_ if char.isdigit() or char == '.')
    if res_text == '':
        return 0
    try:
        return float(res_text)
    except ValueError:
        # Stray dots (e.g. '0.5 g.' -> '0.5.', 'approx.' -> '.') make the
        # concatenated text unparseable; fall back to the first well-formed
        # number in the original text, or 0 when there is none.
        match = re.search(r'\d+(?:\.\d+)?|\.\d+', str_)
        return float(match.group()) if match else 0
        

## Scraping Methods

In [10]:
def perPage(url):
    """
    Scrape the nutrition data of a single product page into a dictionary.

    The page is opened in a fresh Chrome session, and the function waits up to
    20 seconds for a nutrition table cell to appear before parsing the rendered
    HTML with BeautifulSoup.

    Parameters
    ----------
    url : str
        Full URL of the product page.

    Returns
    -------
    dict
        'URL' maps to the page url and 'CALORY' to the integer read from the
        second 'Item_characteristics__text__dcfEC' element. Every nutrition
        table row with more than two cells adds one key (first cell text,
        upper-cased, spaces replaced by underscores) mapped to a one-element
        list holding the amount parsed by returnFloat(). An empty dict is
        returned when the page times out, fails to load, or cannot be parsed.
    """
    timeout = 20  # seconds to wait for the nutrition table to render
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(url)
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'Item_table__cell__aUMvf'))
        WebDriverWait(driver, timeout).until(element_present)
        # Reported only after the wait succeeded, so the log never shows
        # "Page loaded" for a page that actually timed out.
        print("Page loaded")
        page = driver.page_source
    except TimeoutException:
        print("Timed out waiting for page to load")
        return {}
    except Exception as e:
        print("ERROR Is.   ==>", e)
        print("Error Occurred")
        return {}
    finally:
        # Single exit point for the browser: it is closed exactly once on
        # every path, including failures while opening the page.
        driver.quit()

    dic = {}
    try:
        soup = BeautifulSoup(page, 'html5lib')
        dic['URL'] = url
        dic['CALORY'] = int(soup.find_all('div', class_='Item_characteristics__text__dcfEC')[1].text.strip())
        for tr in soup.find_all('tr', class_='Item_table__row__3Wdx2'):
            tds = tr.find_all('td', class_='Item_table__cell__aUMvf')
            if len(tds) > 2:
                key = tds[0].text.strip().upper().replace(' ', '_')
                # Only the first row seen for a key is kept, so every list has
                # exactly one value and the dict converts to a one-row DataFrame.
                if key not in dic:
                    dic[key] = [returnFloat(tds[1].text)]
    except Exception as e:
        print("ERROR Is.   ==>", e)
        print("Error Occurred")
        return {}

    return dic


In [5]:
def allFood(num_pages=30, csv_path='TraderJoes_df.csv'):
    """
    Scrape every product listed under category 8 and collect the results in a DataFrame.

    For each listing page the product links are collected, perPage() is called
    for every link, and the resulting one-row frames are appended to the
    running DataFrame. The DataFrame is written to ``csv_path`` after every
    scraped product, so partial results survive an interrupted run.

    Parameters
    ----------
    num_pages : int, default 30
        Number of listing pages to visit, starting at page 1.
    csv_path : str, default 'TraderJoes_df.csv'
        File the accumulated DataFrame is saved to.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped product.
    """
    All_df = pd.DataFrame({})
    timeout = 20  # seconds to wait for the product cards to render
    for i in range(1, num_pages + 1):
        print('i.  ===> ',i)
        if i == 1 :
            url = 'https://www.traderjoes.com/home/products/category?categoryId=8'
        else:
            url = f'https://www.traderjoes.com/home/products/category?categoryId=8&filters=%7B%22page%22%3A{i}%7D'

        driver = webdriver.Chrome(chromedriver)
        try:
            driver.get(url)
            element_present = EC.presence_of_element_located((By.CLASS_NAME, 'ProductCard_card__title__301JH'))
            WebDriverWait(driver, timeout).until(element_present)
            # Reported only after the wait succeeded.
            print("Page loaded")
            page = driver.page_source
        except TimeoutException:
            print("Timed out waiting for page to load")
            continue
        except Exception as e:
            print("Error Occurred ==>", e)
            continue
        finally:
            # The browser is closed exactly once per listing page, on every path.
            driver.quit()

        try:
            soup = BeautifulSoup(page, 'html5lib')
            links = [a.get("href") for a in soup.find_all('a', class_="ProductCard_card__title__301JH")]
        except Exception as e:
            print("Error Occurred ==>", e)
            continue

        for link in links:
            if not link:
                # An anchor without an href cannot be turned into a product URL.
                continue
            # Errors are handled per product so one bad page does not abort the
            # rest of the listing; `Exception` (not a bare except) keeps
            # Ctrl-C / KeyboardInterrupt working.
            try:
                product = perPage('https://www.traderjoes.com' + link)
                if not product:
                    # perPage() returns {} on timeout or parse failure: nothing to add.
                    continue
                All_df = pd.concat([All_df, pd.DataFrame(product)])
                All_df.to_csv(csv_path)
            except Exception as e:
                print("Error Occurred ==>", e)

    return All_df

In [6]:
# Start the scrape: allFood() visits the listing pages, calls perPage() for each
# product link, saves the results to TraderJoes_df.csv and returns the DataFrame.
allFood()

i.  ===>  1
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Timed out waiting for page to load
Page loaded
Page loaded
Page loaded
i.  ===>  2
Page loaded
Page loaded
Page loaded
Page loaded
Timed out waiting for page to load
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
i.  ===>  3
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Timed out waiting for page to load
Page loaded
Timed out waiting for page to load
Page loaded
Timed out waiting for page to load
Page loaded
Timed out waiting for page to load
Page loaded
Timed out waiting for page to load
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
i.  ===>  4
Page loaded
Timed out waiting for page to load
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded
Page loaded


Unnamed: 0,URL,CALORY,TOTAL_FAT,SATURATED_FAT,TRANS_FAT,CHOLESTEROL,SODIUM,TOTAL_CARBOHYDRATE,DIETARY_FIBER,TOTAL_SUGARS,...,SUGARS_1G,SODUM,INCLUES,CALORIES_FROM_FAT,ADDED_SUGARSƗ,Ɨ_ONE__SERVING_ADDS_16G_OF_SUGAR_TO_YOUR_DIET_AND_REPRESENTS__32%_OF_THE_DAILY_VALUE_FOR_ADDED_SUGARS,SAT._FAT,TOTAL_CARB.,VIT._D,POTAS.
0,https://www.traderjoes.com/home/products/pdp/0...,160,5.0,0.0,0.0,20.0,370.0,24.0,0.0,5.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,120,2.0,0.0,0.0,0.0,420.0,20.0,0.0,2.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,70,4.5,2.5,0.0,10.0,600.0,8.0,0.0,1.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,170,10.0,7.0,0.0,20.0,220.0,18.0,1.0,1.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,120,8.0,4.5,0.0,25.0,150.0,11.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,https://www.traderjoes.com/home/products/pdp/0...,80,0.0,0.0,0.0,0.0,220.0,17.0,1.0,1.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,170,17.0,11.0,0.0,0.0,520.0,6.0,1.0,1.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,500,28.0,8.0,0.0,125.0,670.0,22.0,2.0,26.0,...,,,,,,,,,,
0,https://www.traderjoes.com/home/products/pdp/0...,120,7.0,0.5,0.0,0.0,25.0,11.0,1.0,5.0,...,,,,,,,,,,
