## Notebook Content

In this notebook, we scrape [__Sprouts__](https://shop.sprouts.com/shop), an online grocery website, to collect nutrition data for its food products. The stored data will later be used to predict calories from the other nutrition facts.

## Import Libraries

In [34]:
#import needed libraries
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np
from flask import Response
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

## WebDriver Initialization

In [22]:
#Amjad Path
# Default location of the chromedriver executable on Amjad's machine. Set the
# CHROMEDRIVER_PATH environment variable to use a different location without
# editing this cell.
path = os.environ.get("CHROMEDRIVER_PATH", "/Users/amjad/Downloads/chromedriver")

In [35]:
#Shahad Path
# Default location of the chromedriver executable on Shahad's machine. Set the
# CHROMEDRIVER_PATH environment variable to use a different location without
# editing this cell.
path = os.environ.get("CHROMEDRIVER_PATH", "/Users/shahadsulaiman/Downloads/chromedriver")

In [36]:
# `chromedriver` is the module-level name perPage() passes to webdriver.Chrome().
chromedriver = path # path to the chromedriver executable
# Also publish the location as an environment variable.
# NOTE(review): "webdriver.chrome.driver" looks like the Java system-property name;
# presumably Selenium's Python bindings do not read it - confirm before relying on it.
os.environ["webdriver.chrome.driver"] = chromedriver

## Convert String to Float

In [1]:
def returnFloat(str_):
    """
    Extract the first number found in ``str_`` and return it as a float.

    Only the first numeric token is used, so ``"10g5%"`` yields ``10.0`` instead
    of gluing the digits of separate numbers together, and inputs such as ``"."``
    or ``"1.2.3"`` no longer raise ``ValueError``.

    Parameters
    ----------
    str_ : str or None
        Raw text, e.g. ``"12g"``, ``"2.5mg"`` or ``"1,000mg"``.

    Returns
    -------
    float or int
        ``np.nan`` when ``str_`` is ``None`` or empty, ``0`` when it contains no
        number, otherwise the first number in the text as a ``float``.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if str_ is None or str_ == '':
        return np.nan
    # Commas are dropped first so a thousands separator ("1,000") does not end the token.
    match = re.search(r'[0-9]+(?:\.[0-9]+)?|\.[0-9]+', str_.replace(',', ''))
    if match is None:
        return 0
    return float(match.group())

## Store Product Info

In [2]:
def perProduct(table, productURL):
    """
    Collect the values of one product's nutrition table into a dictionary.

    Parameters
    ----------
    table : object or None
        The parsed nutrition container (a BeautifulSoup element in this
        notebook); it must provide ``find`` and ``find_all``.
    productURL : str
        Stored unchanged under the ``'URL'`` key.

    Returns
    -------
    dict
        ``{}`` when ``table`` is ``None``. Otherwise a dictionary holding
        ``'URL'``, ``'CALORY'`` (int) and, when the table contains at least one
        ``li.css-te1jiu`` row, one single-element list per nutrient. Nutrient
        keys are the label text upper-cased with spaces replaced by underscores.

    Raises
    ------
    AttributeError
        If the calorie element (``div.css-ozeacm``) is not found in ``table``.
    ValueError
        If the calorie text is not an integer.
    """
    if table is None:
        return {}

    dic = {}
    dic['URL'] = productURL
    dic["CALORY"] = int(table.find('div', class_='css-ozeacm').text.strip())

    # Nutrient values are only collected when the table has at least one row element.
    if not table.find_all('li', class_='css-te1jiu'):
        return dic

    # The spans are scanned once for the whole table; repeating the scan for every
    # row would rebuild exactly the same entries.
    spans = table.find_all('span')
    # From index 2 onward the spans are read as alternating label/value pairs.
    # Stopping at len(spans) - 1 skips a trailing label that has no value span.
    for i in range(2, len(spans) - 1, 2):
        key = spans[i].text.strip().replace(" ", '_').upper()
        dic[key] = [returnFloat(spans[i + 1].text.strip().replace(" ", ''))]
    return dic

## Scraping Methods

In [3]:
# Seconds to wait for the initial pop-up's close button after a listing page is opened.
_POPUP_TIMEOUT = 40
# Seconds to wait for every other element (product grid, nutrition tab, nutrition table).
_ELEMENT_TIMEOUT = 30


def _clickByXpath(driver, xpath):
    """Locate the first element matching `xpath` and click it."""
    driver.find_element(By.XPATH, xpath).click()


def _actAndWait(driver, action, locator, timeout):
    """Run `action` (if given), then wait until `locator` is present; return True on success."""
    try:
        if action is not None:
            action()
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
    except Exception:
        # `Exception` rather than a bare `except` so Ctrl-C can still stop a long scrape.
        print("Timed out waiting for page to load")
        return False
    return True


def _closeProductModal(driver):
    """Best-effort click on the product pop-up's close button; never raises."""
    try:
        _clickByXpath(driver, "//button[@class='close']")
    except Exception:
        print("ERROR OCCOURED")


def _scrapeProduct(driver, product, pageUrl, category, Sprouts_df):
    """Open one product, read its nutrition table and return the frame with its row appended."""
    opened = _actAndWait(driver, product.click, (By.ID, 'tab-nutrition'), _ELEMENT_TIMEOUT)
    if opened:
        opened = _actAndWait(
            driver,
            lambda: _clickByXpath(driver, "//button[@id='tab-nutrition']"),
            (By.CLASS_NAME, 'css-css4z'),
            _ELEMENT_TIMEOUT,
        )
    if not opened:
        _closeProductModal(driver)
        return Sprouts_df

    try:
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        table = soup.find('div', {"class": 'css-css4z'})
        Sprouts_df = pd.concat([Sprouts_df, pd.DataFrame(perProduct(table, pageUrl))])
        # The per-category CSV is rewritten after every product, so it always
        # holds everything scraped so far.
        Sprouts_df.to_csv(f'Sprouts_PerCategory_df_{category}.csv')
    except Exception:
        print("ERROR OCCOURED")
    finally:
        # Close the pop-up even when parsing failed, otherwise it stays open
        # while the next product is clicked.
        _closeProductModal(driver)
    return Sprouts_df


def _scrapeListingPage(driver, pageUrl, category, Sprouts_df):
    """Scrape every product cell of the listing page already loaded in `driver`."""
    popupId = 'shopping-selector-parent-process-modal-close-click'
    if not _actAndWait(driver, None, (By.ID, popupId), _POPUP_TIMEOUT):
        return Sprouts_df
    print("Page loaded")

    gridReady = _actAndWait(
        driver,
        lambda: _clickByXpath(driver, "//button[@id='shopping-selector-parent-process-modal-close-click']"),
        (By.CLASS_NAME, 'react-cell'),
        _ELEMENT_TIMEOUT,
    )
    if not gridReady:
        return Sprouts_df

    try:
        products = driver.find_elements(By.XPATH, "//div[@class='react-cell']")
    except Exception:
        print("ERROR OCCOURED")
        # Without this the loop below would hit an undefined (or stale) `products`.
        products = []

    for product in products:
        Sprouts_df = _scrapeProduct(driver, product, pageUrl, category, Sprouts_df)
    return Sprouts_df


def perPage(url, lastPage):
    """
    Scrape pages 1..lastPage of one category and return the collected rows.

    A new Chrome session is started for every page. Each product cell on the
    page is clicked, its nutrition tab is opened, and the nutrition table is
    handed to perProduct(). The running result is saved to
    ``Sprouts_PerCategory_df_<category>.csv`` after every product.

    Parameters
    ----------
    url : str
        Category listing URL ending in ``page=``; the page number is appended.
        Characters ``url[-9:-6]`` are used as the category label in the log
        output and in the CSV file name.
    lastPage : int
        Number of the last page to scrape (inclusive). Values below 1 scrape
        nothing.

    Returns
    -------
    pandas.DataFrame
        One row per successfully scraped product; empty when nothing was scraped.

    Notes
    -----
    Failures while waiting, clicking or parsing are reported with a printed
    message and the affected page or product is skipped. Errors from starting
    Chrome or loading the page URL propagate to the caller, after the browser
    has been closed.
    """
    Sprouts_df = pd.DataFrame({})
    category = url[-9:-6]
    for i in range(1, lastPage + 1):
        print('i.  ===> ',i)
        print('Category     ===> ', category)
        Myurl = f'{url}{i}'
        # NOTE(review): passing the driver path positionally is the Selenium 3 calling
        # convention; newer Selenium releases presumably need a Service object - confirm.
        driver = webdriver.Chrome(chromedriver)
        try:
            driver.get(Myurl)
            Sprouts_df = _scrapeListingPage(driver, Myurl, category, Sprouts_df)
        finally:
            # Always release the browser, even if something unexpected escaped above.
            driver.quit()
    return Sprouts_df

In [42]:
def perCategory():
    """
    Scrape every configured category and combine the results.

    perPage() is called once per category with the category's listing URL and
    its number of pages. The combined result is written to
    ``Sprouts_All_df.csv`` after each category, so the file always holds every
    category finished so far.

    Returns
    -------
    pandas.DataFrame
        The rows of all categories concatenated in the order they were scraped.
    """
    Sprouts_All_df = pd.DataFrame({})
    # (category listing URL, number of pages in that category)
    categories = [
        ('https://shop.sprouts.com/shop/categories/133?page=', 14),
        ('https://shop.sprouts.com/shop/categories/141?page=', 5),
        ('https://shop.sprouts.com/shop/categories/153?page=', 7),
    ]
    for categoryUrl, lastPage in categories:
        category_df = perPage(categoryUrl, lastPage)
        Sprouts_All_df = pd.concat([Sprouts_All_df, category_df])
        Sprouts_All_df.to_csv('Sprouts_All_df.csv')
    return Sprouts_All_df
        

In [43]:
# Run the full scrape: every page of every category listed in perCategory() is
# opened in Chrome, and the results are written to CSV files in the working directory.
perCategory()

i.  ===>  1
Category     ===>  133
Page loaded
i.  ===>  2
Category     ===>  133
Page loaded
i.  ===>  3
Category     ===>  133
Page loaded
Timed out waiting for page to load
i.  ===>  4
Category     ===>  133
Page loaded
i.  ===>  5
Category     ===>  133
Page loaded
Timed out waiting for page to load
Timed out waiting for page to load
i.  ===>  6
Category     ===>  133
Page loaded
i.  ===>  7
Category     ===>  133
Page loaded
Timed out waiting for page to load
Timed out waiting for page to load
Timed out waiting for page to load
i.  ===>  8
Category     ===>  133
Page loaded
Timed out waiting for page to load
Timed out waiting for page to load
Timed out waiting for page to load
i.  ===>  9
Category     ===>  133
Page loaded
Timed out waiting for page to load
i.  ===>  10
Category     ===>  133
Page loaded
i.  ===>  11
Category     ===>  133
Page loaded
Timed out waiting for page to load
i.  ===>  12
Category     ===>  133
Page loaded
Timed out waiting for page to load
Timed out wai