In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup as bs
from time import sleep
import pandas as pd

In [5]:
def list_problem_qtdy(company):
    """
    Extract the problem-category filter options listed for a company.

    Opens the company's complaint list on reclameaqui.com.br (restricted to
    the mobile-services category), expands the problem filter and reads the
    available options from the rendered HTML.

    Parameters
    ----------
    company : str
        Company slug as it appears in the site URL (lower-cased before use).

    Returns
    -------
    list
        One ``[title, label_for]`` pair per option, where ``title`` is the
        ``title`` attribute of the option's div and ``label_for`` is the
        ``for`` attribute of its ``<label>``.
        NOTE(review): the title presumably looks like "Name (count)" -
        callers strip everything from " (" onwards; confirm against the site.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the expand button does not become visible within 30 seconds.
    IndexError
        If fewer than four filter groups are found in the page.
    """
    # Selenium configurations
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.page_load_strategy = 'eager'
    driver = webdriver.Chrome(options=options)

    try:
        # Filtering only mobile services categories
        category = '/?pagina=1&categoria=0000000000000067'

        base_url = ("https://www.reclameaqui.com.br/empresa/" + company.lower()
                    + "/lista-reclamacoes" + category)
        driver.get(base_url)

        # Expand problem options
        button = WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="filter-diderot"]/div[1]/div[4]/button')))
        button.click()

        # Snapshot the DOM while the browser is still alive.
        # NOTE(review): there is no explicit wait after the click; if the
        # options render asynchronously this snapshot may be taken too early.
        page_html = bs(driver.page_source, "html.parser")
    finally:
        # Always release the browser, even when the wait times out; without
        # this every call left a Chrome process running.
        driver.quit()

    # Get problem label: the problem options live in the fourth filter group.
    categories_html = page_html.find_all(class_='sc-1h9pg1g-5 hQqRTk')
    divs = categories_html[3].find_all('div', class_='sc-1h9pg1g-7 eGIjvG')

    return [[div['title'], div.find('label')['for']] for div in divs]

In [None]:
def extract_claim_list(company, problem_item, ini_page, end_page):
    """
    Extract complaint titles and links for one problem category of a company.

    Parameters
    ----------
    company : str
        Company slug as it appears in the site URL (lower-cased before use).
    problem_item : sequence
        ``[title, problem_id]`` pair; ``problem_id`` is sent as the
        ``problema`` query parameter and ``title`` (up to the first " (")
        is printed as a progress message.
    ini_page : int
        First listing page to fetch.
    end_page : int
        Last listing page to fetch (inclusive).

    Returns
    -------
    list
        One ``[title, href]`` pair per complaint card found, in page order.
    """
    # Selenium configurations
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.page_load_strategy = 'eager'
    driver = webdriver.Chrome(options=options)

    base_url = "https://www.reclameaqui.com.br/empresa/" + company.lower()
    category = '&categoria=0000000000000067'
    cat_problem = '&problema=' + str(problem_item[1])
    print(f'Problem category: {problem_item[0].split(" (")[0]}')
    title_link = []

    try:
        # Extract data
        for page_number in range(ini_page, end_page + 1):
            url = (base_url + "/lista-reclamacoes/?pagina=" + str(page_number)
                   + category + cat_problem)
            driver.get(url)
            # Fixed pause before reading the DOM (page load strategy is 'eager').
            sleep(1.5)
            page_html = bs(driver.page_source, "html.parser")
            claims_html = page_html.find_all(class_='sc-1pe7b5t-0 iQGzPh')

            for element in claims_html:
                title_link.append(
                    [element.find('h4')['title'], element.find('a')['href']])
    finally:
        # Close the browser even if a page load or parse step raises;
        # previously an exception left the Chrome process running.
        driver.quit()

    return title_link

In [4]:
def extract_company(company, problem_list, top_n, ini_page, end_page):
    """
    Gather complaint titles and links for the first ``top_n`` problems.

    For each of the first ``top_n`` entries of ``problem_list`` the listing
    pages ``ini_page``..``end_page`` are scraped via ``extract_claim_list``,
    the rows are tagged with the problem name, and the accumulated table is
    written to ``<company>_temp.csv`` after every problem.

    Returns the accumulated DataFrame with columns ``title``, ``link`` and
    ``problem`` (an empty, column-less DataFrame when nothing is processed).
    """
    checkpoint_path = company + '_temp.csv'
    collected = pd.DataFrame()

    for problem_item in problem_list[:top_n]:
        rows = extract_claim_list(company, problem_item, ini_page, end_page)

        problem_frame = pd.DataFrame(rows, columns=['title', 'link'])
        # Keep only the text before the first " (" of the option title.
        problem_frame['problem'] = problem_item[0].partition(" (")[0]

        collected = pd.concat([collected, problem_frame], ignore_index=True)

        # Checkpoint after each problem so a later failure keeps earlier rows.
        collected.to_csv(checkpoint_path, index=False)

    return collected

In [2]:
def add_description(filename, ini_line=0):
    """
    Visit every link in a CSV file and store each complaint's description.

    Reads ``filename`` (which must contain a ``link`` column of site-relative
    URLs), opens each link from row ``ini_line`` onwards, and writes the text
    of the ``<p data-testid="complaint-description">`` element into a
    ``description`` column. An empty string is stored when that element is
    not present in the page. The result is written to ``'full_' + filename``.

    Parameters
    ----------
    filename : str
        Path of the input CSV; also used to build the output name.
    ini_line : int or str, optional
        Row label to start from (converted with ``int``). Defaults to 0.

    Notes
    -----
    The browser is restarted and the file saved whenever the row label is a
    non-zero multiple of 50. The file is also saved, and the browser closed,
    if the run is interrupted by an exception; the exception still propagates.
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.page_load_strategy = 'eager'

    # Normalise once: the start row was cast for slicing but used uncast as
    # the running counter, which broke ``index % 50`` for string input.
    start = int(ini_line)

    # Read the input before starting Chrome so a missing or invalid file
    # does not leave a browser process behind.
    temp_dataframe = pd.read_csv(filename)
    temp_name = 'full_' + filename

    if 'description' not in temp_dataframe.columns:
        temp_dataframe['description'] = ""
    else:
        # An all-empty column is loaded as float NaN; make it object dtype so
        # string descriptions can be written into it.
        temp_dataframe['description'] = temp_dataframe['description'].astype(object)

    driver = webdriver.Chrome(options=options)
    try:
        for index, url in temp_dataframe.loc[start:, 'link'].items():

            # Reset browser and save file each 50 requests
            if index % 50 == 0 and index != 0:
                print(f"Sample: {index}")
                driver.quit()
                driver = webdriver.Chrome(options=options)
                temp_dataframe.to_csv(temp_name, index=False)

            full_url = "https://www.reclameaqui.com.br" + url
            driver.get(full_url)
            # Fixed pause before reading the DOM (page load strategy is 'eager').
            sleep(1.3)
            page_html = bs(driver.page_source, "html.parser")

            # Explicit "element missing" check instead of a bare ``except``.
            description_tag = page_html.find('p', {'data-testid': 'complaint-description'})
            description = description_tag.get_text() if description_tag is not None else ""

            temp_dataframe.at[index, 'description'] = description
    finally:
        # Persist whatever was collected first, then release the browser, so
        # a crash mid-run (e.g. the window being closed) does not lose work.
        print(f"Salvando dataframe: {temp_name}")
        temp_dataframe.to_csv(temp_name, index=False)
        driver.quit()

In [8]:
# Scrape the problem-category options for each mobile operator
# (each call opens its own browser session, in the order listed here).
problem_list_claro = list_problem_qtdy(company='claro')
problem_list_tim = list_problem_qtdy(company='tim-celular')
problem_list_vivo = list_problem_qtdy(company='vivo-celular-fixo-internet-tv')
problem_list_oi = list_problem_qtdy(company='oi-movel-fixo-tv')

In [None]:
# Sanity check: how many problem options were scraped for each operator
lists = [
    problem_list_claro,
    problem_list_tim,
    problem_list_vivo,
    problem_list_oi,
]

for operator_problems in lists:
    print(len(operator_problems))

In [18]:
# Scraping parameters shared by every operator run below
top_n = 25          # number of problem categories taken from the head of each list
ini, end = 1, 50    # first and last listing page (inclusive) fetched per problem

In [None]:
# Claro: collect title, link and problem category for the selected pages
claro_df = extract_company(
    company='claro',
    problem_list=problem_list_claro,
    top_n=top_n,
    ini_page=ini,
    end_page=end,
)
# Optional save: claro_df.to_csv('reclame_aqui_main_claro.csv', index=False)

In [None]:
# Tim: collect title, link and problem category for the selected pages
tim_df = extract_company(
    company='tim-celular',
    problem_list=problem_list_tim,
    top_n=top_n,
    ini_page=ini,
    end_page=end,
)
# Optional save: tim_df.to_csv('reclame_aqui_main_tim.csv', index=False)

In [None]:
# Vivo: collect title, link and problem category for the selected pages
vivo_df = extract_company(
    company='vivo-celular-fixo-internet-tv',
    problem_list=problem_list_vivo,
    top_n=top_n,
    ini_page=ini,
    end_page=end,
)
# Optional save: vivo_df.to_csv('reclame_aqui_main_vivo.csv', index = False)

In [None]:
# Oi: collect title, link and problem category for the selected pages
oi_df = extract_company(
    company='oi-movel-fixo-tv',
    problem_list=problem_list_oi,
    top_n=top_n,
    ini_page=ini,
    end_page=end,
)
# Optional save: oi_df.to_csv('reclame_aqui_main_oi.csv', index = False)

In [3]:
# Fetch the description of every complaint listed in the Vivo selection file
add_description(filename='selected_problems_vivo.csv')

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=118.0.5993.70)
Stacktrace:
	GetHandleVerifier [0x00007FF714068EF2+54786]
	(No symbol) [0x00007FF713FD5612]
	(No symbol) [0x00007FF713E8A64B]
	(No symbol) [0x00007FF713E6A9F5]
	(No symbol) [0x00007FF713EF0887]
	(No symbol) [0x00007FF713F043CF]
	(No symbol) [0x00007FF713EEBC43]
	(No symbol) [0x00007FF713EC0941]
	(No symbol) [0x00007FF713EC1B84]
	GetHandleVerifier [0x00007FF7143B7F52+3524194]
	GetHandleVerifier [0x00007FF71440D800+3874576]
	GetHandleVerifier [0x00007FF714405D7F+3843215]
	GetHandleVerifier [0x00007FF714105086+694166]
	(No symbol) [0x00007FF713FE0A88]
	(No symbol) [0x00007FF713FDCA94]
	(No symbol) [0x00007FF713FDCBC2]
	(No symbol) [0x00007FF713FCCC83]
	BaseThreadInitThunk [0x00007FFBE2737344+20]
	RtlUserThreadStart [0x00007FFBE40626B1+33]
