# Beer scraper - Zé Delivery

This project scrapes beer prices and other info from the delivery website "Zé Delivery", and returns the cheapest option possible, subject to constraints defined by the user.

# Imports

In [1]:
import os
import re
import time
from getpass import getpass

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from unidecode import unidecode

# Data Collection

### Build Driver

In [2]:
# Configure Chrome to run headless (no visible window) with GPU acceleration
# disabled, then start the browser session shared by every scraping cell.
options = Options()
for chrome_flag in ('--headless', '--disable-gpu'):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

### Login to website

In [3]:
# URL of the Zé Delivery sign-in page; the login cell loads it and fills in the form
login_url = 'https://www.ze.delivery/conta/entrar'

In [4]:
# Login details
# Credentials must never be stored in the notebook itself: they are read from
# the ZE_DELIVERY_EMAIL / ZE_DELIVERY_PASSWORD environment variables, and the
# user is prompted interactively for whichever one is unset (the password
# prompt does not echo what is typed).
email = os.environ.get("ZE_DELIVERY_EMAIL") or input("Zé Delivery e-mail: ")
password = os.environ.get("ZE_DELIVERY_PASSWORD") or getpass("Zé Delivery password: ")

In [5]:
# Enter login details in form
driver.get(login_url)
driver.implicitly_wait(6)  # element lookups below poll for up to 6 s while the page renders
# find_element(By.ID, ...) replaces the find_element_by_xpath helpers, which
# were removed in Selenium 4.3; this form works on Selenium 3 and 4 alike.
driver.find_element(By.ID, "login-mail-input-email").send_keys(email)
driver.find_element(By.ID, "login-mail-input-password").send_keys(password)

# Press sign in button
button = driver.find_element(By.ID, "login-mail-button-sign-in")
# The click is dispatched through JavaScript instead of button.click()
# NOTE(review): presumably because a native click was intercepted - confirm
driver.execute_script("arguments[0].click();", button)
time.sleep(3) # Wait a couple seconds to complete the sign in

### Get available brands

In [6]:
def get_available_brands():
    """Return the brand names shown on the Zé Delivery beer category page.

    Loads the category page with the module-level ``driver``, parses the
    rendered HTML and collects the text of every ``<h2>`` element carrying
    the shelf-title CSS class.

    Returns:
        list[str]: One entry per shelf title found on the page.
    """
    driver.get('https://www.ze.delivery/produtos/categoria/cervejas')
    page = BeautifulSoup(driver.page_source, "html.parser")

    brand_names = []
    for shelf_title in page.find_all("h2", class_="css-l9heuk-shelfTitle"):
        brand_names.append(shelf_title.text)
    return brand_names

In [7]:
# Scrape the brand list once; the DataFrame-building cells further down reuse it
available_brands = get_available_brands()
available_brands

['Brahma',
 'Skol',
 'Budweiser',
 'Antarctica',
 'Original',
 'Stella Artois',
 "Beck's",
 'Corona',
 'Bohemia',
 'Retornáveis',
 'Serramalte',
 'Colorado',
 'Hoegaarden',
 'Wäls',
 'Leffe',
 'Patagonia',
 'Caracu',
 'Goose Island',
 'Franziskaner',
 'Kona',
 'Cervejaria Bohemia',
 'Farra Bier',
 'Flamingo',
 'Hocus Pocus',
 'Motim',
 'Noi',
 'Overhop',
 'Three Monkeys']

### Get URLs to be scraped

In [8]:
def get_url(brand):
    """Build the Zé Delivery brand-page URL for a brand name.

    The name is lower-cased, transliterated with ``unidecode``, stripped of
    apostrophes, and has its spaces turned into hyphens before being appended
    to the brand root URL.

    Args:
        brand (str): Brand name as shown on the site, e.g. ``"Beck's"``.

    Returns:
        str: The brand page URL, e.g. one ending in ``/produtos/marca/becks``.
    """
    slug = unidecode(brand.lower()).replace("'", "").replace(" ", "-")
    return 'https://www.ze.delivery/produtos/marca/' + slug

### Get beer details

In [9]:
def get_prices(prices_html):
    """Convert scraped price elements into floats.

    Each element's text is expected to start with a three-character currency
    prefix (presumably ``"R$ "``) followed by the amount. Amounts written in
    Brazilian notation, with a comma as the decimal mark and dots as thousands
    separators (``"1.299,90"``), are supported.

    Args:
        prices_html: Iterable of objects exposing a ``text`` attribute
            (for example BeautifulSoup tags).

    Returns:
        list[float]: Parsed prices, in the same order as ``prices_html``.

    Raises:
        ValueError: If the text after the prefix is not a number.
    """
    prices = []
    for price_tag in prices_html:
        amount = price_tag.text[3:].strip()  # drop the currency prefix
        if ',' in amount:
            # With a decimal comma present, any dot is a thousands separator;
            # remove it, otherwise "1.299,90" becomes "1.299.90" and float() fails.
            amount = amount.replace('.', '')
        prices.append(float(amount.replace(',', '.')))
    return prices

In [10]:
def get_products(products_html):
    """Extract the text of every scraped product-title element.

    Args:
        products_html: Iterable of objects exposing a ``text`` attribute
            (for example BeautifulSoup tags).

    Returns:
        list[str]: The ``text`` of each element, in input order.
    """
    names = []
    for title_tag in products_html:
        names.append(title_tag.text)
    return names

In [11]:
def OLD_get_mls(products):
    """Legacy volume parser: map a list of product names to millilitres.

    For each name the first ``<digits>ml`` occurrence is used; failing that,
    the first ``<digits>L`` occurrence multiplied by 1000; failing that, 0.

    Known flaw: the litre pattern only captures the digits directly before
    ``L``, so a fractional size such as ``1.5L`` is read as 5 litres (5000).

    Args:
        products: Iterable of product names.

    Returns:
        list[int]: One volume in millilitres per product (0 when not found).
    """
    millilitres_re = r"(\d+)ml"
    litres_re = r"(\d+)L"  # Gets it wrong when the size is 1.5L

    def _volume_of(name):
        # Try the millilitre spelling first, then whole litres, then give up.
        try:
            return int(re.findall(millilitres_re, name)[0])
        except:
            pass
        try:
            return int(re.findall(litres_re, name)[0]) * 1000
        except:
            return 0  # NaN was considered as the fallback value

    return [_volume_of(name) for name in products]

In [77]:
def get_mls(product):
    """Extract a product's volume in millilitres from its name.

    The first ``<digits>ml`` occurrence wins. Otherwise the first litre size
    (``1L``, ``1.5L`` or, with a decimal comma, ``1,5L``) is converted to
    millilitres. Matching is case-sensitive: only ``ml`` and ``L`` are
    recognised.

    Args:
        product: Product name. Non-string values are treated as "no volume".

    Returns:
        float | int: Millilitres as a float for ``ml`` sizes, as an int for
        litre sizes, or ``np.nan`` when no volume can be found.
    """
    if not isinstance(product, str):
        return np.nan

    ml_match = re.search(r"(\d+)ml", product)
    if ml_match:
        return float(ml_match.group(1))

    # Accept both "." and "," as the decimal mark; without the comma a name
    # like "1,5L" would be read as 5 litres.
    litre_match = re.search(r"(\d*[.,]?\d+)L", product)
    if litre_match:
        litres = float(litre_match.group(1).replace(",", "."))
        # round() rather than int(): float error must not truncate e.g.
        # 1004.9999999999999 down to 1004.
        return round(litres * 1000)

    return np.nan

In [13]:
def get_beers_details(brand):
    """Scrape the product names and prices listed on one brand's page.

    Args:
        brand (str): Brand name; it is turned into a URL by ``get_url``.

    Returns:
        tuple[list, list, list]: ``(products, prices, brand_list)`` where
        ``brand_list`` repeats ``brand`` once per scraped product.
    """
    driver.get(get_url(brand))
    page = BeautifulSoup(driver.page_source, "html.parser")

    title_tags = page.find_all("h3", class_="css-krg860-productTitle")
    products = get_products(title_tags)

    price_tags = page.find_all("div", class_="css-t89dhz-priceText")
    prices = get_prices(price_tags)

    # NOTE(review): titles and prices are collected independently, so the two
    # lists are only aligned if every product card has exactly one of each.
    return products, prices, [brand] * len(products)

### Create the Dataframe

In [14]:
def is_returnable(product):
    """Tell whether a product name carries the "Apenas o " marker.

    The check is a case-sensitive substring search; the marker presumably comes
    from "Apenas o líquido" listings for returnable bottles.

    Args:
        product (str): Product name.

    Returns:
        str: ``'Yes'`` when the marker is present, ``'No'`` otherwise.
    """
    marker_index = product.find('Apenas o ')
    return 'Yes' if marker_index != -1 else 'No'

In [15]:
def create_df(available_brands, num_brands=None):
    """Scrape the given brands and build a price-comparison DataFrame.

    Args:
        available_brands (list[str]): Brand names to scrape.
        num_brands (int | None): Only the first ``num_brands`` brands are
            scraped. ``None`` (the default) scrapes all of them. The default
            is resolved at call time from the list actually passed in, not
            from whatever global happened to exist when the function was
            defined. Values larger than the list are clamped to its length.

    Returns:
        pandas.DataFrame: One row per product with the columns ``Product``,
        ``Price``, ``Brand``, ``Mls``, ``Price Per Liter`` and ``Returnable``,
        sorted by ascending ``Price Per Liter``.
    """
    if num_brands is None:
        num_brands = len(available_brands)
    # max(..., 0) keeps a negative count meaning "no brands" instead of
    # letting the slice interpret it as "all but the last n".
    brands_to_scrape = available_brands[:max(num_brands, 0)]

    total_products = []
    total_prices = []
    total_brands = []
    for brand in brands_to_scrape:
        products, prices, brand_list = get_beers_details(brand)
        total_products.extend(products)
        total_prices.extend(prices)
        total_brands.extend(brand_list)

    df = pd.DataFrame(
        list(zip(total_products, total_prices, total_brands)),
        columns=['Product', 'Price', 'Brand'],
    )
    df['Mls'] = df['Product'].map(get_mls)
    # Price is per unit and Mls is in millilitres, hence the factor of 1000.
    df['Price Per Liter'] = df['Price'] / df['Mls'] * 1000
    df = df.sort_values('Price Per Liter')
    df['Returnable'] = df['Product'].map(is_returnable)

    return df

In [16]:
# Scrape every available brand (one page load per brand) and assemble the price table
df = create_df(available_brands)

In [17]:
# Show the scraped table; create_df sorted it by ascending price per liter
df

Unnamed: 0,Product,Price,Brand,Mls,Price Per Liter,Returnable
40,Antarctica Subzero 473ml,2.69,Antarctica,473,5.687104,No
35,Antarctica Pilsen 1L | Apenas o líquido,6.09,Antarctica,1000,6.090000,Yes
18,Skol 473ml,2.99,Skol,473,6.321353,No
20,Skol 350ml,2.29,Skol,350,6.542857,No
23,Skol Puro Malte 350ml,2.29,Skol,350,6.542857,No
...,...,...,...,...,...,...
77,Wäls Trippel 375ml,19.99,Wäls,375,53.306667,No
110,Hocus Pocus IPA com Abacaxi 500ml,26.90,Hocus Pocus,500,53.800000,No
129,Overhop Imperial IPA 500ml,26.90,Overhop,500,53.800000,No
109,Hocus Pocus IPA 500ml,27.90,Hocus Pocus,500,55.800000,No


### Define Filters

In [18]:
# User-defined filter settings.
# NOTE(review): none of these names is referenced by the code visible in this
# part of the notebook; the meanings below are inferred from the names - confirm.
wanted_brands = []    # presumably brands to keep
unwanted_brands = []  # presumably brands to exclude
returnable = []       # presumably accepted values of the 'Returnable' column
max_mls = None        # presumably an upper bound on 'Mls'; None looks like "no limit"

In [24]:
# Keep only the products flagged as returnable
df.loc[df['Returnable'] == 'Yes']

Unnamed: 0,Product,Price,Brand,Mls,Price Per Liter,Returnable
35,Antarctica Pilsen 1L | Apenas o líquido,6.09,Antarctica,1000,6.09,Yes
33,Antarctica Pilsen 300ml | Apenas o Líquido,1.99,Antarctica,300,6.633333,Yes
14,Brahma Chopp 1L | Apenas o Líquido - Unidade,7.09,Brahma,1000,7.09,Yes
0,Brahma Chopp 300ml | Apenas o Líquido,2.39,Brahma,300,7.966667,Yes
60,Bohemia 300ml | Apenas o Líquido,2.39,Bohemia,300,7.966667,Yes
36,Antarctica Pilsen 600ml | Apenas o Líquido,5.29,Antarctica,600,8.816667,Yes
22,Skol 600ml | Apenas o Líquido,5.69,Skol,600,9.483333,Yes
7,Brahma Chopp 600ml | Apenas o Líquido - Unidade,6.29,Brahma,600,10.483333,Yes
24,Skol Puro Malte 600ml | Apenas o Líquido,6.29,Skol,600,10.483333,Yes
58,Bohemia 600ml | Apenas o Líquido,6.59,Bohemia,600,10.983333,Yes


In [25]:
# Keep only the products scraped from the Brahma brand page
df.loc[df['Brand'] == 'Brahma']

Unnamed: 0,Product,Price,Brand,Mls,Price Per Liter,Returnable
10,Brahma Chopp 269ml,1.87,Brahma,269,6.951673,No
14,Brahma Chopp 1L | Apenas o Líquido - Unidade,7.09,Brahma,1000,7.09,Yes
9,Brahma Chopp 350ml,2.69,Brahma,350,7.685714,No
17,Brahma Malzbier 350ml,2.69,Brahma,350,7.685714,No
5,Brahma Chopp 473ml,3.69,Brahma,473,7.801268,No
0,Brahma Chopp 300ml | Apenas o Líquido,2.39,Brahma,300,7.966667,Yes
2,Brahma Duplo Malte 350ml,3.19,Brahma,350,9.114286,No
12,Brahma Chopp 355ml - Unidade,3.29,Brahma,355,9.267606,No
6,Brahma Malzbier 355ml,3.29,Brahma,355,9.267606,No
8,Brahma Zero 350ml - Unidade,3.39,Brahma,350,9.685714,No
