# Scraping reviews from Reclame Aqui website

## Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
#import matplotlib.pyplot as plt
#%matplotlib inline

## Create driver

In [2]:
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)

## Define Functions

In [3]:
def create_url(company, page):
    return f'https://www.reclameaqui.com.br/empresa/{company}/lista-reclamacoes/?pagina={page}'

In [29]:
def get_soup(driver,url):
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    print(soup)
    return soup

In [5]:
def get_complaints(soup):
    complaints = soup.find_all(class_="link-complain-id-complains")
    return complaints

In [6]:
def get_titles(complaints):
    title_list = []
    for i in range(len(complaints)):
        title_list.append(complaints[i].text.strip())
    return title_list

In [7]:
def get_links(complaints):
    link_list = []
    start_url = 'https://www.reclameaqui.com.br'
    for i in range(len(complaints)):
        link = (start_url + complaints[i].get('href'))
        link_list.append(link)
    return link_list

In [8]:
def get_content(soup):
    try:
        content = soup.find_all('p', class_="ng-binding")[9].text.strip()
        return content
    except:
        return 'NA'

In [9]:
def get_titles_and_links(company, num_pages, driver):
    titles = []
    links = []
    for i in range(1, num_pages+1):
        page_url = create_url(company, i)
        page_soup = get_soup(driver, page_url)
        page_complaints = get_complaints(page_soup)
        page_titles = get_titles(page_complaints)
        page_links = get_links(page_complaints)
        
        titles += page_titles
        links += page_links
        
    return titles, links

In [10]:
def get_content_list(driver, links):
    content_list = []
    for i in range(len(links)):
        url = links[i]
        soup = get_soup(driver,url)
        print(f'Retrieved {i+1} of {len(links)}')
        content = get_content(soup)
        content_list.append(content)
    return content_list

## Test Functions

In [30]:
titles, links = get_titles_and_links('upnid',2,driver)

<html class="ng-scope" lang="pt-BR" ng-app="rawebApp"><head><style type="text/css">[uib-typeahead-popup].dropdown-menu{display:block;}</style><style type="text/css">.uib-time input{width:50px;}</style><style type="text/css">[uib-tooltip-popup].tooltip.top-left > .tooltip-arrow,[uib-tooltip-popup].tooltip.top-right > .tooltip-arrow,[uib-tooltip-popup].tooltip.bottom-left > .tooltip-arrow,[uib-tooltip-popup].tooltip.bottom-right > .tooltip-arrow,[uib-tooltip-popup].tooltip.left-top > .tooltip-arrow,[uib-tooltip-popup].tooltip.left-bottom > .tooltip-arrow,[uib-tooltip-popup].tooltip.right-top > .tooltip-arrow,[uib-tooltip-popup].tooltip.right-bottom > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.top-left > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.top-right > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.bottom-left > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.bottom-right > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.left-top > .tooltip-arrow,[uib-tooltip-html-popup].

<html class="ng-scope" lang="pt-BR" ng-app="rawebApp"><head><style type="text/css">[uib-typeahead-popup].dropdown-menu{display:block;}</style><style type="text/css">.uib-time input{width:50px;}</style><style type="text/css">[uib-tooltip-popup].tooltip.top-left > .tooltip-arrow,[uib-tooltip-popup].tooltip.top-right > .tooltip-arrow,[uib-tooltip-popup].tooltip.bottom-left > .tooltip-arrow,[uib-tooltip-popup].tooltip.bottom-right > .tooltip-arrow,[uib-tooltip-popup].tooltip.left-top > .tooltip-arrow,[uib-tooltip-popup].tooltip.left-bottom > .tooltip-arrow,[uib-tooltip-popup].tooltip.right-top > .tooltip-arrow,[uib-tooltip-popup].tooltip.right-bottom > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.top-left > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.top-right > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.bottom-left > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.bottom-right > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.left-top > .tooltip-arrow,[uib-tooltip-html-popup].

In [12]:
titles

[]

In [13]:
links

[]

In [14]:
contents = get_content_list(driver,links)

In [15]:
contents

[]

## Create class

In [16]:
class ReclameAqui:
    
    def __init__(self, company, num_pages):
        self.company = company
        self.num_pages = num_pages
        self.driver = None
        self.titles = []
        self.links = []
        
    def build_driver(self):
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=options)
        
    def get_titles_and_links(self):
        
        if not self.driver:
            print('You need to specify a driver.')
            return

        for i in range(1, self.num_pages+1):
            page_url = create_url(self.company, i)
            page_soup = get_soup(self.driver, page_url)
            page_complaints = get_complaints(page_soup)
            page_titles = get_titles(page_complaints)
            page_links = get_links(page_complaints)

            self.titles += page_titles
            self.links += page_links

        return self.titles, self.links

In [17]:
ra = ReclameAqui('upnid',2)

In [18]:
ra.driver

In [19]:
ra.get_titles_and_links()

You need to specify a driver.


In [20]:
ra.build_driver()

In [21]:
ra.driver

<selenium.webdriver.chrome.webdriver.WebDriver (session="f29d57204693413ef7184a4ded788978")>

In [22]:
ra.get_titles_and_links()

([], [])

## Requests-HTML

In [23]:
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession

In [24]:
url = create_url('upnid',2)
url

'https://www.reclameaqui.com.br/empresa/upnid/lista-reclamacoes/?pagina=2'

In [25]:
session = AsyncHTMLSession()

r = await session.get(url)

In [26]:
await r.html.arender()

In [27]:
r.html.full_text

'[uib-typeahead-popup].dropdown-menu{display:block;}.uib-time input{width:50px;}[uib-tooltip-popup].tooltip.top-left > .tooltip-arrow,[uib-tooltip-popup].tooltip.top-right > .tooltip-arrow,[uib-tooltip-popup].tooltip.bottom-left > .tooltip-arrow,[uib-tooltip-popup].tooltip.bottom-right > .tooltip-arrow,[uib-tooltip-popup].tooltip.left-top > .tooltip-arrow,[uib-tooltip-popup].tooltip.left-bottom > .tooltip-arrow,[uib-tooltip-popup].tooltip.right-top > .tooltip-arrow,[uib-tooltip-popup].tooltip.right-bottom > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.top-left > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.top-right > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.bottom-left > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.bottom-right > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.left-top > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.left-bottom > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.right-top > .tooltip-arrow,[uib-tooltip-html-popup].tooltip.right-bottom >

In [28]:
r.html.find("link-complain-id-complains")

[]