# Beer scraper - Zé Delivery

This project scrapes beer prices and other info from the delivery website "Zé Delivery", and returns the cheapest option possible, subject to constraints defined by the user.

# Imports

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from unidecode import unidecode

import pandas as pd
import numpy as np
import re

# Auxiliary functions

In [2]:
# Build the Zé Delivery listing URL for a particular brand
def get_url(brand):
    """Return the brand listing URL for *brand*.

    The brand name is lowercased, passed through ``unidecode``, stripped of
    apostrophes and has its spaces replaced by hyphens; the resulting slug is
    appended to the brand listing root URL.
    """
    slug = unidecode(brand.lower()).replace("'", "").replace(" ", "-")
    return 'https://www.ze.delivery/produtos/marca/' + slug

In [3]:
# This function takes the price element as it is found in the HTML and converts it to a float.
def handle_price(price):
    """Parse a price element into a float.

    Args:
        price: object exposing the raw price string through a ``.text``
            attribute (in this file, a BeautifulSoup tag), e.g. ``"R$ 2,69"``.

    Returns:
        The price as a float, e.g. ``2.69``.

    Raises:
        ValueError: if no number can be parsed from the text.
    """
    raw = price.text
    # Locate the numeric part instead of assuming a fixed 3-character currency
    # prefix, so "R$2,69" and "R$ 2,69" parse the same way.
    match = re.search(r"\d[\d.,]*", raw)
    if match is None:
        raise ValueError(f"No numeric price found in {raw!r}")
    number = match.group(0).rstrip(".,")
    if "," in number:
        # Brazilian formatting: '.' groups thousands and ',' marks decimals.
        number = number.replace(".", "").replace(",", ".")
    return float(number)

In [4]:
# This function extracts the volume of a product, in milliliters, from its name
def get_mls(product):
    """Return the volume in milliliters found in a product name.

    A milliliter amount (e.g. ``"473ml"``) is looked for first; if none is
    found, a liter amount (e.g. ``"1L"``, ``"1.5L"``, ``"1,5L"``) is converted
    to milliliters.

    Args:
        product: product name as scraped from the page.

    Returns:
        A float when the name carries a milliliter amount, an int when it
        carries a liter amount, and ``np.nan`` when no volume is found or
        *product* is not a string.
    """
    if not isinstance(product, str):
        return np.nan

    # Milliliters: tolerate "473 ml" and "473ML" as well as "473ml".
    ml_match = re.search(r"(\d+)\s*ml", product, flags=re.IGNORECASE)
    if ml_match:
        return float(ml_match.group(1))

    # Liters: the decimal separator may be '.' or ',' (Brazilian formatting).
    liter_match = re.search(r"(\d*[.,]?\d+)L", product)
    if liter_match:
        liters = float(liter_match.group(1).replace(",", "."))
        # round() instead of int() so float error in liters * 1000 cannot
        # truncate the result to one milliliter less.
        return int(round(liters * 1000))

    return np.nan

In [5]:
# Tell whether a product is returnable, judging only by its name
def is_returnable(product):
    """Return 'Yes' when the product name contains 'Apenas o ', else 'No'."""
    marker_position = product.find('Apenas o ')
    return 'Yes' if marker_position != -1 else 'No'

# Class

In [6]:
class BeerScraper:
    """Scrape beer listings from Zé Delivery and rank them by price per liter.

    Intended call order: ``build_driver()``, ``login()``,
    ``get_available_brands()``, ``scrape_data()``, ``create_df()``,
    ``set_filters(...)`` and ``apply_filters()``. The ranked table is stored in
    ``df`` and the filtered view in ``filtered_df``.
    """

    def __init__(self):
        self.driver = None
        self.email = None
        self.password = None
        self.available_brands = []
        # Parallel lists filled by scrape_data(): one entry per scraped product
        self.prices = []
        self.products = []
        self.brands = []
        self.df = None
        # Filters
        self.wanted_brands = []
        self.unwanted_brands = []
        self.returnable = ['Yes', 'No']
        self.max_mls = 99999
        self.filtered_df = None

    def build_driver(self):
        """Start a headless Chrome driver and store it in ``self.driver``."""
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)

    def close_driver(self):
        """Quit the browser started by ``build_driver()``, if there is one."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def login(self, email=None, password=None):
        """Sign in to the website through the login form.

        Args:
            email: account e-mail; falls back to the built-in account when
                omitted.
            password: account password; falls back to the built-in account
                when omitted.
        """
        login_url = 'https://www.ze.delivery/conta/entrar'
        # NOTE(security): the fallback credentials below are hard-coded in the
        # source. They are kept so that login() keeps working without
        # arguments, but they should be moved out of the code.
        self.email = email if email is not None else "brunoprates@poli.ufrj.br"
        self.password = password if password is not None else "ze123456"

        # Enter login details in form
        self.driver.get(login_url)
        self.driver.implicitly_wait(6)
        # find_element("xpath", ...) is used instead of find_element_by_xpath,
        # which no longer exists in recent Selenium releases.
        email_input = self.driver.find_element("xpath", '//*[@id="login-mail-input-email"]')
        email_input.send_keys(self.email)
        password_input = self.driver.find_element("xpath", '//*[@id="login-mail-input-password"]')
        password_input.send_keys(self.password)

        # Press sign in button (clicked through JavaScript)
        button = self.driver.find_element("xpath", '//*[@id="login-mail-button-sign-in"]')
        self.driver.execute_script("arguments[0].click();", button)
        time.sleep(3)  # Wait a couple seconds to complete the sign in

    def get_available_brands(self):
        """Read the brand names shown on the beer category page."""
        url_brands = 'https://www.ze.delivery/produtos/categoria/cervejas'
        self.driver.get(url_brands)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        # NOTE(review): the CSS class looks auto-generated and may change when
        # the site is redeployed - confirm if no brands are found.
        brand_titles = soup.find_all("h2", class_="css-l9heuk-shelfTitle")
        self.available_brands = [title.text for title in brand_titles]

    def scrape_data(self):
        """Visit every available brand page and collect product names and prices.

        The ``products``, ``prices`` and ``brands`` lists are rebuilt from
        scratch on every call, so running the method twice does not duplicate
        rows.
        """
        self.products, self.prices, self.brands = [], [], []
        for brand in self.available_brands:
            # Get page HTML
            self.driver.get(get_url(brand))
            soup = BeautifulSoup(self.driver.page_source, "html.parser")

            titles = soup.find_all("h3", class_="css-krg860-productTitle")
            price_tags = soup.find_all("div", class_="css-t89dhz-priceText")

            # Pair titles and prices page by page so the three lists always
            # grow together; a count mismatch on one page can no longer shift
            # the rows of every brand scraped after it.
            for title, price_tag in zip(titles, price_tags):
                self.products.append(title.text)
                self.prices.append(handle_price(price_tag))
                self.brands.append(brand)

    def create_df(self):
        """Build ``self.df`` from the scraped lists, sorted by price per liter."""
        rows = list(zip(self.products, self.prices, self.brands))
        self.df = pd.DataFrame(rows, columns=['Product', 'Price', 'Brand'])
        self.df['Mls'] = self.df['Product'].map(get_mls)
        self.df['Price Per Liter'] = self.df['Price'] / self.df['Mls'] * 1000
        self.df['Returnable'] = self.df['Product'].map(is_returnable)
        # Sort
        self.df = self.df.sort_values('Price Per Liter')

    def set_filters(self, wb=None, ub=None, r=None, mm=99999):
        """Store the filters used by ``apply_filters()``.

        Args:
            wb: brands to keep; empty or omitted means every brand.
            ub: brands to exclude; omitted means none.
            r: accepted values of the 'Returnable' column; omitted means
                both 'Yes' and 'No'.
            mm: maximum volume in milliliters.
        """
        # Fresh lists are created per call: list defaults in the signature
        # would be shared by every instance that relies on them.
        self.wanted_brands = [] if wb is None else wb
        self.unwanted_brands = [] if ub is None else ub
        self.returnable = ['Yes', 'No'] if r is None else r
        self.max_mls = mm

    def apply_filters(self):
        """Apply the stored filters to ``self.df`` and store ``filtered_df``.

        Rows whose volume is unknown (NaN 'Mls') never satisfy the maximum
        volume condition and are therefore dropped.
        """
        brand = self.df['Brand']
        keep = ~brand.isin(self.unwanted_brands)
        if len(self.wanted_brands) > 0:
            keep = keep & brand.isin(self.wanted_brands)
        keep = keep & self.df['Returnable'].isin(self.returnable)
        keep = keep & (self.df['Mls'] <= self.max_mls)
        # Apply condition
        self.filtered_df = self.df[keep]

# Getting data

In [7]:
beer_scraper = BeerScraper()

In [8]:
%%time
beer_scraper.build_driver()

CPU times: user 22.8 ms, sys: 1.38 ms, total: 24.2 ms
Wall time: 1.46 s


In [9]:
%%time
beer_scraper.login()

CPU times: user 23 ms, sys: 7.18 ms, total: 30.2 ms
Wall time: 7.69 s


In [10]:
%%time
beer_scraper.get_available_brands()

CPU times: user 302 ms, sys: 48.7 ms, total: 351 ms
Wall time: 13.1 s


In [11]:
%%time
beer_scraper.scrape_data()

CPU times: user 1.4 s, sys: 33.8 ms, total: 1.43 s
Wall time: 1min 47s


In [12]:
%%time
beer_scraper.create_df()
beer_scraper.set_filters()
beer_scraper.apply_filters()

CPU times: user 26.7 ms, sys: 2.2 ms, total: 28.9 ms
Wall time: 23.4 ms


In [13]:
beer_scraper.filtered_df

Unnamed: 0,Product,Price,Brand,Mls,Price Per Liter,Returnable
37,Antarctica Subzero 473ml,2.69,Antarctica,473.0,5.687104,No
34,Antarctica Pilsen 1L | Apenas o líquido,6.09,Antarctica,1000.0,6.090000,Yes
17,Skol 473ml,2.99,Skol,473.0,6.321353,No
22,Skol Puro Malte 350ml,2.29,Skol,350.0,6.542857,No
19,Skol 350ml,2.29,Skol,350.0,6.542857,No
...,...,...,...,...,...,...
74,Wäls Trippel 375ml,19.99,Wäls,375.0,53.306667,No
123,Overhop Imperial IPA 500ml,26.90,Overhop,500.0,53.800000,No
108,Hocus Pocus IPA com Abacaxi 500ml,26.90,Hocus Pocus,500.0,53.800000,No
107,Hocus Pocus IPA 500ml,27.90,Hocus Pocus,500.0,55.800000,No


# Improving scraping time

As we can see above, it takes around 2 minutes to scrape the website, get all the data, and create the dataframe. This is far too long and most users would not like to wait all this time to get an answer on what is the cheapest beer.

We can improve this time by investigating which step takes the longest. Using the '%%time' magic command, we can see that the 'scrape_data' method is responsible for almost all of that time (1min 47s). This is because it makes several GET requests, one for each available brand.

As the main goal of this program is to find cheap beers, we can assume that most users won't be interested in the high-end, premium beers. So we can filter out the most expensive brands by default, thus reducing the scrape time.

#### Filtering out most expensive brands

In order to filter out brands, it is necessary to define some subjective criteria. I chose to keep only the brands that have at least one beer whose Price per Liter is at most three times that of the overall cheapest beer. In other words, if the cheapest beer of a particular brand costs more than three times as much per liter as the cheapest beer you can possibly get, we can assume that this is an expensive premium brand and, by default, we will not include it in our dataframe.

In [14]:
# Work on the full dataframe built by create_df (not the filtered view)
df = beer_scraper.df

In [15]:
# Rank the brands by their cheapest beer (in Price Per Liter)
df_ranked = df[['Brand','Price Per Liter']].groupby(by=['Brand']).min().sort_values('Price Per Liter')
df_ranked

Unnamed: 0_level_0,Price Per Liter
Brand,Unnamed: 1_level_1
Antarctica,5.687104
Skol,6.321353
Brahma,7.09
Bohemia,7.685714
Budweiser,9.069767
Serramalte,9.114286
Original,9.114286
Caracu,10.828571
Stella Artois,11.115242
Beck's,12.828571


In [16]:
# Get the cheapest beer, in Price Per Liter
cheapest_beer_price = df_ranked.iloc[0]['Price Per Liter']
cheapest_beer_price

5.687103594080338

In [17]:
# Define the threshold for the max price of the brands cheapest beer. If it is above this, the brand is considered expensive
threshold = cheapest_beer_price*3
threshold

17.061310782241016

In [18]:
# Brands whose cheapest beer (in Price Per Liter) is above the threshold are labeled as expensive
expensive_brands = list(df_ranked[df_ranked['Price Per Liter']> threshold].index)

In [19]:
list(expensive_brands)

['Cervejaria Bohemia',
 'Hoegaarden',
 'Farra Bier',
 'Patagonia',
 'Noi',
 'Flamingo',
 'Motim',
 'Goose Island',
 'Kona',
 'Overhop',
 'Hocus Pocus',
 'Leffe',
 'Three Monkeys',
 'Franziskaner']

In [21]:
print(f'''In the time this analysis was made, from the {len(beer_scraper.available_brands)} brands available in the website, {len(expensive_brands)} were considered expensive.
This means we can expect the time spent in the "scrape_data" function to reduce by half (approx.) when
removing the expensive brands.
''')

In the time this analysis was made, from the 29 brands available in the website, 14 were considered expensive.
This means we can expect the time spent in the "scrape_data" function to reduce by half (approx.) when
removing the expensive brands.



#### Limitations

This analysis was based on a snapshot of the beers available on a fixed date (13/05/2021). Beer prices change over time, and there are occasional discounts, so it is possible that one of the brands labeled as "expensive" will eventually offer a beer that is within the defined threshold. However, when this analysis was repeated a few times on different days and at different hours, the results did not change significantly. Even if one of those brands eventually offers a cheaper beer, its price is unlikely to compete with the cheapest brands.

# Improved Class

Let's change the class and run the scraper again, to see how the performance changes.

In [22]:
class BeerScraperImproved:
    """Scrape Zé Delivery beer listings, skipping brands labeled as expensive.

    Works like the original scraper, except that ``scrape_data()`` does not
    visit the pages of the brands listed in ``expensive_brands``, which
    reduces the number of page loads.

    Intended call order: ``build_driver()``, ``login()``,
    ``get_available_brands()``, ``scrape_data()``, ``create_df()``,
    ``set_filters(...)`` and ``apply_filters()``.
    """

    # Brands skipped by default when scraping
    DEFAULT_EXPENSIVE_BRANDS = (
        'Cervejaria Bohemia', 'Hoegaarden', 'Farra Bier', 'Patagonia', 'Noi',
        'Flamingo', 'Wäls', 'Motim', 'Goose Island', 'Kona', 'Overhop',
        'Hocus Pocus', 'Leffe', 'Three Monkeys', 'Franziskaner',
    )

    def __init__(self, expensive_brands=None):
        """Create an empty scraper.

        Args:
            expensive_brands: brands to skip while scraping; when omitted,
                ``DEFAULT_EXPENSIVE_BRANDS`` is used.
        """
        self.driver = None
        self.email = None
        self.password = None
        self.available_brands = []
        # Parallel lists filled by scrape_data(): one entry per scraped product
        self.prices = []
        self.products = []
        self.brands = []
        self.df = None
        if expensive_brands is None:
            # Copy so that each instance owns its own list
            self.expensive_brands = list(self.DEFAULT_EXPENSIVE_BRANDS)
        else:
            self.expensive_brands = expensive_brands
        # Filters
        self.wanted_brands = []
        self.unwanted_brands = []
        self.returnable = ['Yes', 'No']
        self.max_mls = 99999
        self.filtered_df = None

    def build_driver(self):
        """Start a headless Chrome driver and store it in ``self.driver``."""
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)

    def close_driver(self):
        """Quit the browser started by ``build_driver()``, if there is one."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def login(self, email=None, password=None):
        """Sign in to the website through the login form.

        Args:
            email: account e-mail; falls back to the built-in account when
                omitted.
            password: account password; falls back to the built-in account
                when omitted.
        """
        login_url = 'https://www.ze.delivery/conta/entrar'
        # NOTE(security): the fallback credentials below are hard-coded in the
        # source. They are kept so that login() keeps working without
        # arguments, but they should be moved out of the code.
        self.email = email if email is not None else "brunoprates@poli.ufrj.br"
        self.password = password if password is not None else "ze123456"

        # Enter login details in form
        self.driver.get(login_url)
        self.driver.implicitly_wait(6)
        # find_element("xpath", ...) is used instead of find_element_by_xpath,
        # which no longer exists in recent Selenium releases.
        email_input = self.driver.find_element("xpath", '//*[@id="login-mail-input-email"]')
        email_input.send_keys(self.email)
        password_input = self.driver.find_element("xpath", '//*[@id="login-mail-input-password"]')
        password_input.send_keys(self.password)

        # Press sign in button (clicked through JavaScript)
        button = self.driver.find_element("xpath", '//*[@id="login-mail-button-sign-in"]')
        self.driver.execute_script("arguments[0].click();", button)
        time.sleep(3)  # Wait a couple seconds to complete the sign in

    def get_available_brands(self):
        """Read the brand names shown on the beer category page."""
        url_brands = 'https://www.ze.delivery/produtos/categoria/cervejas'
        self.driver.get(url_brands)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        # NOTE(review): the CSS class looks auto-generated and may change when
        # the site is redeployed - confirm if no brands are found.
        brand_titles = soup.find_all("h2", class_="css-l9heuk-shelfTitle")
        self.available_brands = [title.text for title in brand_titles]

    def scrape_data(self):
        """Collect product names and prices from every non-expensive brand page.

        The ``products``, ``prices`` and ``brands`` lists are rebuilt from
        scratch on every call, so running the method twice does not duplicate
        rows.
        """
        self.products, self.prices, self.brands = [], [], []
        for brand in self.available_brands:
            # Skipping a brand saves one full page load
            if brand in self.expensive_brands:
                continue

            # Get page HTML
            self.driver.get(get_url(brand))
            soup = BeautifulSoup(self.driver.page_source, "html.parser")

            titles = soup.find_all("h3", class_="css-krg860-productTitle")
            price_tags = soup.find_all("div", class_="css-t89dhz-priceText")

            # Pair titles and prices page by page so the three lists always
            # grow together; a count mismatch on one page can no longer shift
            # the rows of every brand scraped after it.
            for title, price_tag in zip(titles, price_tags):
                self.products.append(title.text)
                self.prices.append(handle_price(price_tag))
                self.brands.append(brand)

    def create_df(self):
        """Build ``self.df`` from the scraped lists, sorted by price per liter."""
        rows = list(zip(self.products, self.prices, self.brands))
        self.df = pd.DataFrame(rows, columns=['Product', 'Price', 'Brand'])
        self.df['Mls'] = self.df['Product'].map(get_mls)
        self.df['Price Per Liter'] = self.df['Price'] / self.df['Mls'] * 1000
        self.df['Returnable'] = self.df['Product'].map(is_returnable)
        # Sort
        self.df = self.df.sort_values('Price Per Liter')

    def set_filters(self, wb=None, ub=None, r=None, mm=99999):
        """Store the filters used by ``apply_filters()``.

        Args:
            wb: brands to keep; empty or omitted means every brand.
            ub: brands to exclude; omitted means none.
            r: accepted values of the 'Returnable' column; omitted means
                both 'Yes' and 'No'.
            mm: maximum volume in milliliters.
        """
        # Fresh lists are created per call: list defaults in the signature
        # would be shared by every instance that relies on them.
        self.wanted_brands = [] if wb is None else wb
        self.unwanted_brands = [] if ub is None else ub
        self.returnable = ['Yes', 'No'] if r is None else r
        self.max_mls = mm

    def apply_filters(self):
        """Apply the stored filters to ``self.df`` and store ``filtered_df``.

        Rows whose volume is unknown (NaN 'Mls') never satisfy the maximum
        volume condition and are therefore dropped.
        """
        brand = self.df['Brand']
        keep = ~brand.isin(self.unwanted_brands)
        if len(self.wanted_brands) > 0:
            keep = keep & brand.isin(self.wanted_brands)
        keep = keep & self.df['Returnable'].isin(self.returnable)
        keep = keep & (self.df['Mls'] <= self.max_mls)
        # Apply condition
        self.filtered_df = self.df[keep]

In [23]:
beer_scraper_2 = BeerScraperImproved()

In [24]:
%%time
beer_scraper_2.build_driver()

CPU times: user 0 ns, sys: 30.9 ms, total: 30.9 ms
Wall time: 228 ms


In [25]:
%%time
beer_scraper_2.login()

CPU times: user 27 ms, sys: 0 ns, total: 27 ms
Wall time: 6.43 s


In [26]:
%%time
beer_scraper_2.get_available_brands()

CPU times: user 222 ms, sys: 11.2 ms, total: 233 ms
Wall time: 13.3 s


In [27]:
%%time
beer_scraper_2.scrape_data()

CPU times: user 814 ms, sys: 10.3 ms, total: 825 ms
Wall time: 51.5 s


In [28]:
%%time
beer_scraper_2.create_df()
beer_scraper_2.set_filters()
beer_scraper_2.apply_filters()

CPU times: user 28.8 ms, sys: 2.38 ms, total: 31.2 ms
Wall time: 22.9 ms


As expected, the 'scrape_data' function ran in about half of the original time. Running everything and getting the final dataframe still takes a bit more than 1 minute, which is quite long, but acceptable.

# To-do

- Choose address
- Encrypt password