# Beer scraper - Zé Delivery

This project scrapes beer prices and other info from the delivery website "Zé Delivery", and returns the cheapest option possible, subject to constraints defined by the user.

# Imports

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from unidecode import unidecode

import pandas as pd
import numpy as np
import re

# Auxiliary functions

In [2]:
def get_url(brand):
    """Build the Zé Delivery brand-page URL for a brand name.

    The brand is lower-cased, transliterated to ASCII with ``unidecode``,
    stripped of apostrophes, and has its spaces turned into hyphens before
    being appended to the brand root URL.
    """
    slug = unidecode(brand.lower()).replace("'", "").replace(" ", "-")
    return 'https://www.ze.delivery/produtos/marca/' + slug

In [3]:
def handle_price(price):
    """Convert a scraped price element into a float.

    Parameters
    ----------
    price : object with a ``.text`` attribute
        The element holding the price text, e.g. ``"R$ 2,69"``.

    Returns
    -------
    float
        The numeric price.

    Raises
    ------
    ValueError
        If no number can be found in the element's text.
    """
    raw = price.text
    # Locate the numeric part instead of slicing off a fixed 3-character
    # prefix: the fixed slice silently mis-parses text such as "R$2,69"
    # (no space) or text with leading whitespace.
    match = re.search(r"\d[\d.,]*", raw)
    if match is None:
        raise ValueError(f"could not find a price in {raw!r}")
    number = match.group(0).rstrip('.,')
    if ',' in number:
        # Comma is the decimal mark, so any dots are thousands separators
        # ("1.234,56" -> "1234.56").
        number = number.replace('.', '').replace(',', '.')
    return float(number)

In [4]:
def get_mls(product):
    """Extract a product's volume in millilitres from its name.

    A ``<digits>ml`` volume is looked for first; if there is none, a
    ``<number>L`` volume (litres, with either ``.`` or ``,`` as the decimal
    mark) is converted to millilitres.

    Parameters
    ----------
    product : str
        Product name, e.g. ``"Skol 473ml"`` or ``"Antarctica Pilsen 1L"``.

    Returns
    -------
    float
        The volume in millilitres, or ``np.nan`` when the name contains no
        recognisable volume (or is not a string).
    """
    if not isinstance(product, str):
        # Keep the best-effort contract: anything unparsable yields NaN.
        return np.nan

    ml_match = re.search(r"(\d+)ml", product)
    if ml_match:
        return float(ml_match.group(1))

    # Accept a comma decimal mark too; without it "1,5L" would be read as
    # "5L" (5000 ml) instead of 1500 ml.
    liter_match = re.search(r"(\d*[.,]?\d+)L", product)
    if liter_match:
        liters = float(liter_match.group(1).replace(',', '.'))
        # round() rather than int() so float error cannot truncate a ml away.
        return float(round(liters * 1000))

    return np.nan

In [5]:
def is_returnable(product):
    """Return 'Yes' when the product name contains 'Apenas o ', else 'No'.

    The marker text in the name is the only signal used to flag a product
    as returnable.
    """
    return 'Yes' if 'Apenas o ' in product else 'No'

# Class

In [6]:
class BeerScraper:
    """Scrape beer products and prices from Zé Delivery and filter them.

    Intended call order: ``build_driver()`` -> ``login()`` ->
    ``get_available_brands()`` -> ``scrape_data()`` -> ``create_df()`` ->
    ``set_filters()`` -> ``apply_filters()``. Results end up in ``self.df``
    (all products) and ``self.filtered_df`` (products matching the filters).
    """

    def __init__(self):
        self.driver = None
        self.email = None
        self.password = None
        self.available_brands = []
        # Parallel lists filled by scrape_data(): one entry per product.
        self.prices = []
        self.products = []
        self.brands = []
        self.df = None
        # Filters
        self.wanted_brands = []
        self.unwanted_brands = []
        self.returnable = ['Yes', 'No']
        self.max_mls = 99999
        self.filtered_df = None

    def build_driver(self):
        """Create a headless Chrome WebDriver and store it in ``self.driver``."""
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)

    def login(self, email=None, password=None):
        """Sign in through the website's login form.

        Parameters
        ----------
        email, password : str, optional
            Credentials to use. When omitted, values already stored in
            ``self.email`` / ``self.password`` are used, and failing that the
            built-in fallback account below.
        """
        # Local import: find_element(By.XPATH, ...) replaces the
        # find_element_by_xpath helpers, which were removed in Selenium 4.
        from selenium.webdriver.common.by import By

        login_url = 'https://www.ze.delivery/conta/entrar'

        # SECURITY NOTE(review): the fallback credentials below are hard-coded
        # in source (the original author marked them "change later"). They are
        # kept only so existing no-argument calls keep working; pass
        # credentials explicitly and remove these literals.
        if email is not None:
            self.email = email
        elif self.email is None:
            self.email = "brunoprates@poli.ufrj.br"
        if password is not None:
            self.password = password
        elif self.password is None:
            self.password = "ze123456"

        # Enter login details in form
        self.driver.get(login_url)
        self.driver.implicitly_wait(6)
        self.driver.find_element(
            By.XPATH, """//*[@id="login-mail-input-email"]"""
        ).send_keys(self.email)
        self.driver.find_element(
            By.XPATH, """//*[@id="login-mail-input-password"]"""
        ).send_keys(self.password)

        # Press sign in button (clicked via JavaScript rather than .click())
        button = self.driver.find_element(
            By.XPATH, """//*[@id="login-mail-button-sign-in"]"""
        )
        self.driver.execute_script("arguments[0].click();", button)
        time.sleep(3)  # Wait a couple seconds to complete the sign in

    def get_available_brands(self):
        """Read the brand names shown on the beer category page.

        Stores the text of every matching ``<h2>`` shelf title in
        ``self.available_brands``.
        """
        url_brands = 'https://www.ze.delivery/produtos/categoria/cervejas'
        self.driver.get(url_brands)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        # NOTE(review): the CSS class looks auto-generated and may change
        # when the site is redeployed - confirm if no brands are found.
        shelf_titles = soup.find_all("h2", class_="css-l9heuk-shelfTitle")
        self.available_brands = [title.text for title in shelf_titles]

    def scrape_data(self):
        """Load each brand page and collect product names, prices and brands.

        Fills ``self.products``, ``self.prices`` and ``self.brands`` with one
        entry per product. The lists are reset first, so calling this method
        again re-scrapes instead of duplicating every row.
        """
        self.products, self.prices, self.brands = [], [], []

        for brand in self.available_brands:
            # Get page HTML
            self.driver.get(get_url(brand))
            soup = BeautifulSoup(self.driver.page_source, "html.parser")

            titles = soup.find_all("h3", class_="css-krg860-productTitle")
            price_tags = soup.find_all("div", class_="css-t89dhz-priceText")

            # Append product, price and brand together so the three lists
            # always have the same length; a page with unequal title/price
            # counts can no longer shift every later row out of alignment.
            for title, price_tag in zip(titles, price_tags):
                self.products.append(title.text)
                self.prices.append(handle_price(price_tag))
                self.brands.append(brand)

    def create_df(self):
        """Build ``self.df`` from the scraped lists, sorted by price per liter.

        Adds the derived columns 'Mls', 'Price Per Liter' and 'Returnable'.
        """
        rows = list(zip(self.products, self.prices, self.brands))
        self.df = pd.DataFrame(rows, columns=['Product', 'Price', 'Brand'])
        self.df['Mls'] = self.df['Product'].map(get_mls)
        self.df['Price Per Liter'] = self.df['Price'] / self.df['Mls'] * 1000
        self.df['Returnable'] = self.df['Product'].map(is_returnable)
        # Sort
        self.df = self.df.sort_values('Price Per Liter')

    def set_filters(self, wb=None, ub=None, r=None, mm=99999):
        """Set the filters used by ``apply_filters()``.

        Parameters
        ----------
        wb : list-like of str, optional
            Brands to keep; empty/omitted means "all brands".
        ub : list-like of str, optional
            Brands to exclude; omitted means none.
        r : list-like of str, optional
            Accepted 'Returnable' values; omitted means ``['Yes', 'No']``.
        mm : number, optional
            Maximum volume in millilitres (inclusive).
        """
        # None sentinels instead of mutable defaults: a default list would be
        # shared by every call and every instance.
        self.wanted_brands = [] if wb is None else wb
        self.unwanted_brands = [] if ub is None else ub
        self.returnable = ['Yes', 'No'] if r is None else r
        self.max_mls = mm

    def apply_filters(self):
        """Store in ``self.filtered_df`` the rows of ``self.df`` that pass the filters.

        Rows whose 'Mls' is NaN never satisfy the volume limit and are
        therefore dropped.
        """
        df = self.df
        mask = (
            ~df['Brand'].isin(self.unwanted_brands)
            & df['Returnable'].isin(self.returnable)
            & (df['Mls'] <= self.max_mls)
        )
        # An empty wanted list means "no restriction", not "match nothing".
        if len(self.wanted_brands) > 0:
            mask &= df['Brand'].isin(self.wanted_brands)
        # Apply condition
        self.filtered_df = df[mask]

# Getting data

In [7]:
# Create the scraper; no browser is started until build_driver() is called.
a = BeerScraper()

In [8]:
%%time
# Start the headless Chrome WebDriver used by all the following steps.
a.build_driver()

CPU times: user 3.87 ms, sys: 25.6 ms, total: 29.5 ms
Wall time: 1.29 s


In [9]:
%%time
# Sign in through the website's login form.
a.login()

CPU times: user 20.9 ms, sys: 10.4 ms, total: 31.3 ms
Wall time: 7.27 s


In [10]:
%%time
# Collect the brand names listed on the beer category page.
a.get_available_brands()

CPU times: user 245 ms, sys: 9.61 ms, total: 254 ms
Wall time: 14.3 s


In [11]:
%%time
# Load one page per available brand and collect product names and prices.
a.scrape_data()

CPU times: user 1.53 s, sys: 69.5 ms, total: 1.6 s
Wall time: 1min 48s


In [12]:
%%time
# Assemble the scraped lists into a.df, sorted by price per liter.
a.create_df()

CPU times: user 15 ms, sys: 11.2 ms, total: 26.2 ms
Wall time: 20.4 ms


In [13]:
%%time
# Use the default filters: no brand restrictions, returnable or not, up to 99999 ml.
a.set_filters()

CPU times: user 227 µs, sys: 46 µs, total: 273 µs
Wall time: 301 µs


In [14]:
%%time
# Keep only the rows of a.df that match the filters; the result goes to a.filtered_df.
a.apply_filters()

CPU times: user 0 ns, sys: 6.45 ms, total: 6.45 ms
Wall time: 5.76 ms


In [15]:
# Display the filtered products (a.df was sorted by price per liter, cheapest first).
a.filtered_df

Unnamed: 0,Product,Price,Brand,Mls,Price Per Liter,Returnable
35,Antarctica Subzero 473ml,2.69,Antarctica,473.0,5.687104,No
32,Antarctica Pilsen 1L | Apenas o líquido,6.09,Antarctica,1000.0,6.090000,Yes
16,Skol 473ml,2.99,Skol,473.0,6.321353,No
18,Skol 350ml,2.29,Skol,350.0,6.542857,No
20,Skol Puro Malte 350ml,2.29,Skol,350.0,6.542857,No
...,...,...,...,...,...,...
70,Wäls Trippel 375ml,19.99,Wäls,375.0,53.306667,No
118,Overhop Imperial IPA 500ml,26.90,Overhop,500.0,53.800000,No
103,Hocus Pocus IPA com Abacaxi 500ml,26.90,Hocus Pocus,500.0,53.800000,No
102,Hocus Pocus IPA 500ml,27.90,Hocus Pocus,500.0,55.800000,No


# Improving scraping time

As we can see above, it takes around 2 minutes to scrape the website, collect all the data, and create the dataframe. This is far too long: most users will not want to wait that long just to find out which beer is the cheapest.

We can improve this time by finding out which step takes the longest. Using the '%%time' magic command, we can see that the 'scrape_data' method is responsible for almost all of that time (1m48s). This is because it makes several GET requests, one for each available brand.

As the main goal of this program is to find cheap beers, we can assume that most users won't be interested in the high-end, premium beers. So we can filter out the most expensive brands by default, thus reducing the scrape time.

#### Filtering out most expensive brands

The criterion for filtering out a brand is based on its cheapest beer (in price per liter): brands are ranked by that value, and only the cheapest brands are kept (the cells below take the 7 cheapest).

In [16]:
# Work with the full, unfiltered DataFrame (this is a reference to a.df, not a copy).
df = a.df

In [17]:
# Rank the brands by their best (lowest) Price Per Liter, cheapest brand first.
# The result is indexed by brand name.
df_ranked = df[['Brand','Price Per Liter']].groupby(by=['Brand']).min().sort_values('Price Per Liter')
df_ranked

Unnamed: 0_level_0,Price Per Liter
Brand,Unnamed: 1_level_1
Antarctica,5.687104
Skol,6.321353
Brahma,7.09
Bohemia,7.114286
Budweiser,8.625793
Original,9.114286
Serramalte,9.114286
Stella Artois,10.408922
Caracu,10.828571
Beck's,12.828571


In [18]:
# Names of the 7 brands whose cheapest beer has the lowest price per liter.
df_ranked.index[:7]

Index(['Antarctica', 'Skol', 'Brahma', 'Bohemia', 'Budweiser', 'Original',
       'Serramalte'],
      dtype='object', name='Brand')

In [19]:
# NOTE(review): df2 is never defined in the visible notebook, so this cell
# raises NameError. Presumably it was meant to hold the rows of df restricted
# to the cheapest brands selected above - confirm and define it before use.
len(df2)

NameError: name 'df2' is not defined