## Dependências

Instalação no jupyter:

```
import sys
!{sys.executable} -m pip install requests bs4 selenium
```

In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
import json

## Obtendo os nomes e endereços das cidades

Apenas é necessario o código de uma cidade para acessar a página de atrações desta. Mesmo estando a parte do "cont" errada, será feito um redirecionamento para a página correta com base no código.

In [5]:
base = "https://www.tripadvisor.com.br/Attractions-g"
cont = "-Activities-Campo_Grande_State_of_Mato_Grosso_do_Sul.html"

codes = []
desc = []
cities = []
for code in range(303450, 303460):
    req = get(base + str(code) + cont)
    soup = BeautifulSoup(req.content, 'html.parser')
    try:
        cities.append(soup.find("div", class_ = "attractions-attraction-overview-main-PoiHeader__title--3Mpd9").text.partition(":")[0])
        s = req.url.split("-")
        codes.append(s[1])
        desc.append(s[3].partition(".")[0])
    except:
        continue

In [6]:
try:
    df = pd.read_csv('links.csv', delimiter=",", encoding = 'cp1252') #abrindo o arquivo
except:
    df = None


In [7]:
links_df = pd.concat([df, pd.DataFrame({'City': cities, 'Code': codes, 'Description': desc})]).drop_duplicates() # evitar entradas duplicadas
links_df.to_csv('links.csv', index = False, encoding = 'cp1252')

links_df

Unnamed: 0,City,Code,Description
0,Uberlândia,g303400,Uberlandia_State_of_Minas_Gerais
1,Varginha,g303401,Varginha_State_of_Minas_Gerais
2,São Paulo,g303631,Sao_Paulo_State_of_Sao_Paulo
3,Belém,g303404,Belem_State_of_Para
4,Belo Horizonte,g303374,Belo_Horizonte_State_of_Minas_Gerais
5,Campinas,g303605,Campinas_State_of_Sao_Paulo
6,Diamantina,g303380,Diamantina_State_of_Minas_Gerais
7,Fortaleza,g303293,Fortaleza_State_of_Ceara
0,Limeira,g303617,Limeira_State_of_Sao_Paulo
1,Londrina,g303451,Londrina_State_of_Parana


## Obtenção de dados de atrações do TripAdvisor

In [4]:
def getAttractions(code, desc, ref, categories, reviews):
    driver = webdriver.Safari() # mude pro seu navegador favorito
    driver.get("https://www.tripadvisor.com.br/Attractions-" + str(code) + "-Activities-" + desc + ".html")
    time.sleep(1)
    
    # Obtendo as 30 primeiras atrações 
    # Apenas as 10 primeiras estao visiveis inicialmente
    # Se fez necessario simular o ato de "clique" para exibir o resto delas

    buttons = driver.find_elements_by_class_name("attractions-attraction-overview-main-TopPOIs__see_more--2Vsb-")
    for button in buttons:
        if button.is_displayed():
            driver.execute_script("arguments[0].click();", button)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    info = soup.find_all("div", class_ = "attractions-attraction-overview-pois-PoiInfo__info--239IR")

    for attraction in info:
        nl = attraction.find("a", class_ = "attractions-attraction-overview-pois-PoiInfo__name--SJ0a4")
        #names.append(nl.text)
        ref.append(nl.get("href"))
        categories.append(attraction.find("span", class_ = "_21qUqkJx").text)
        try:
            reviews.append(attraction.find("span", class_ = "reviewCount styleguide-bubble-rating-BubbleRatingWithReviewCount__reviewCount--37tMc").text.split()[0])
        except:
            reviews.append("N/A")

    driver.quit()
    
    # As próximas atrações estão em outra página que segue um formato diferente
    oa = 30
    if(len(names) == 30):
        while True:
            req = get("https://www.tripadvisor.com.br/Attractions-" + str(code) + "-Activities-oa" + str(oa) + "-" + desc + ".html")
            soup = BeautifulSoup(req.content, 'html.parser')
            info = soup.find_all("div", class_ = "flexible")
            for attraction in info:
                nl = attraction.find("div", class_ = "tracking_attraction_title listing_title").find("a")
                #names.append(nl.text)
                ref.append(nl.get("href"))
                try: # eu queria saber lancar excecoes decentemente
                    categories.append(attraction.find("span", class_ = "matchedTag noTagImg").text)
                except:
                    categories.append("N/A")
                try:
                    reviews.append(attraction.find("span", class_ = "more").text.split()[0])
                except:
                    reviews.append("N/A")
            if(len(info) < 30):
                break
            else:
                oa += 30
                

In [52]:
def getAttractionInfo(ref, names, address, ratings, exc, vg, avg, poor, ter, certified):    
    i = 1
    t = len(ref)
    for page in ref:
        d, r = False, False
        req = get("https://www.tripadvisor.com.br" + page)
        #if (i % 4) == 0:
            #print("Progress: " + str(i) + "/" + str(t))
        soup = BeautifulSoup(req.content, 'html.parser')

        try:
            names.append(soup.find("h1", class_ = "ui_header h1").text)
        except:
            names.append("N/A")
            
        try:
            address.append(soup.find("span", class_ = "ui_icon map-pin-fill attractions-contact-card-ContactCard__icon--8F8Q9").text)
        except:
            address.append("N/A")

        try:
            ratings.append(soup.find("span", class_ = "overallRating").text)
            r = True
            dist = soup.find_all("span", class_ = "row_count row_cell")
        except:
            ratings.append("N/A")
            if r is False:
                exc.append("N/A")
                vg.append("N/A")
                avg.append("N/A")
                poor.append("N/A")
                ter.append("N/A")

        if r is True:
            exc.append(dist[0].text)
            vg.append(dist[1].text)
            avg.append(dist[2].text)
            poor.append(dist[3].text)
            ter.append(dist[4].text)

        try:
            soup.find("div", class_ = "attractions-attraction-detail-about-card-Award__award_text--1OfPx").text
            certified.append(True)
        except:
            certified.append(False)

        i += 1
    print("Done")

In [5]:
links_df = pd.read_csv("links.csv", delimiter = ",", encoding = 'cp1252')

for index, row in links_df.iterrows():
    print("Current city: " + row['City'])
    # informacoes basicas
    names = []
    ref = []
    categories = []
    reviews = []

    # informacoes adicionais
    address = []
    days = []
    hours = []
    ratings = []

    # distribuicao das avaliacoes
    exc = []
    vg = []
    avg = []
    poor = []
    ter = []

    # certificação do TripAdvisor
    certified = []
    
    # geolocalizacao
    lat = []
    lon = []
    display_name = []
    
    print("Getting attractions...")
    getAttractions(row['Code'], row['Description'], names, ref, categories, reviews)
    print("Getting attractions informations...")
    getAttractionInfo(ref, address, days, hours, ratings, exc, vg, avg, poor, ter, certified)
    print("Getting geolocatization...")
    codeLocation(address, row['City'])
    # Cria o dataframe e o exporta (deixar o encoding como cp1252 para permitir acentos)
    df = pd.DataFrame({'Ref': ref, 'Name': names, 'Category': categories, "Address": address, "Days": days, "Hours": hours, 'Number of Reviews': reviews, 'Rating': ratings, 'Excellent': exc, 'Very Good': vg, 'Average': avg, 'Poor': poor, 'Terrible': ter, 'Certified': certified, 'Latitude': lat, 'Longitude': lon, 'display_name': display_name})
    df.to_csv("" + row['Description'] + ".csv", index = False, encoding = 'cp1252')
    print("Done!")

Current city: Uberlândia
Getting attractions...
.
.
.
.
.
.
.
.
.
0
1
2
3
4
5
6
7
8
9
0
1
2
3
4
5
6
7
8
9
0
Getting attractions informations...


NameError: name 'getAttractionInfo' is not defined

## Obtendo geolocalização

Para poder trabalhar com a distância é necessário converter o endereço de cada atração com sua respectiva geolocalização.

In [42]:
x = fowardGeocoding("Praia Manaira, João Pessoa").text

In [66]:
json.loads(x)[0]['lat']
#y[0]

'-7.1045772'