# TripAdvisor Scraping

### Librerías

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

### Obtención del HTML

In [8]:
# Open the restaurant page in Chrome, dismiss any overlays and snapshot the DOM.
from selenium.common.exceptions import WebDriverException

url = "https://www.tripadvisor.com/Restaurant_Review-g60795-d433599-Reviews-Ristorante_Pesto-Philadelphia_Pennsylvania.html"
# Headers for plain HTTP requests. The Selenium session below does not read
# them; its user agent comes from the --user-agent option instead.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36"}
options = Options()
options.add_argument("--headless=new")  # New headless mode
options.add_argument("--disable-gpu")  # Disable GPU usage
options.add_argument("--no-sandbox")  # Avoid problems on servers
options.add_argument("--disable-dev-shm-usage")  # Avoid errors when shared memory is scarce
options.add_argument("--blink-settings=imagesEnabled=false")  # Skip images to speed up loading
options.add_argument("--disable-extensions")  # Disable unneeded extensions
options.add_argument("--disable-infobars")  # Hide automation notices
options.add_argument("--mute-audio")  # Silence any sound
options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

service = Service()
# The options must be handed to the driver; otherwise none of the flags
# configured above take effect.
driver = webdriver.Chrome(service=service, options=options)
driver.get(url)


def _click_if_present(xpath, timeout=5):
    """Click the element matching *xpath* once it becomes clickable.

    Best effort: returns False instead of raising when the element never
    appears within *timeout* seconds or the click fails; True otherwise.
    """
    try:
        WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        ).click()
    except WebDriverException:
        return False
    return True


# Give the page a chance to render its header before touching it.
try:
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "h1"))
    )
except WebDriverException:
    pass  # Best effort: parse whatever has loaded.

# Cookie-consent banner, then the modal that may cover the page.
_click_if_present("//*[@id='onetrust-accept-btn-handler']")
_click_if_present("//*[@data-automation='closeModal']", timeout=3)

soup = bs(driver.page_source, "html.parser")



### Obtención de la info básica

In [9]:
# Extract the static restaurant details from the parsed page (`soup`).

# Name
name = soup.find("h1").text.strip()
# Address
address = soup.find("span", {"data-automation" : "restaurantsMapLinkOnName"}).text.strip()
# Average rating
rate = soup.find("div", {"data-automation" : "bubbleRatingValue"}).text.strip()
# Number of ratings
n_ratings = soup.find("div", {"data-automation" : "bubbleReviewCount"}).text.strip()
# Cuisine types and price range: read from the element that follows the
# first span containing "Restaurants in"; the last entry is the price.
search = [res for res in soup.find_all("span") if "Restaurants in" in res.text.strip()][0]
cuisines_price = [c.text.strip() for c in search.find_next_sibling()]
cuisines_price = [c for c in cuisines_price if c != ","]
cuisines = cuisines_price[:-1]
price = cuisines_price[-1]
# "About" section text. The heading lookup is guarded as well, so a page
# without that section yields None instead of raising IndexError.
about_heading = next(
    (div for div in soup.find_all("div") if div.text.strip() == "About"),
    None,
)
about_body = about_heading.find_next_sibling() if about_heading is not None else None
about = about_body.text.strip() if about_body is not None else None
# Reviews summary (None when the section or its expected layout is missing)
try:
    search = [res for res in soup.find_all("div") if res.text.strip() == "Reviews summary"][0]
    reviews_summary = search.find_parents()[1].find_next_siblings()[1].find().text.strip()
    reviews_summary = reviews_summary.replace("Jump to all reviews", "").strip()
except (IndexError, AttributeError):
    reviews_summary = None


### Obtención info dinámica

In [10]:
# Open the "See all features" dialog, read diets / cuisines / meal types
# from it, then close it again.


def _feature_value(divs, label):
    """Return the text of the element that follows the div labelled *label*.

    *divs* is the list of candidate div tags and *label* the lower-case
    heading to match against each div's stripped, lower-cased text.
    Returns None when no div matches or the match has no next sibling.
    """
    heading = next((d for d in divs if d.text.strip().lower() == label), None)
    if heading is None:
        return None
    value = heading.find_next_sibling()
    return value.text.strip() if value is not None else None


close_xpath = "//*[@aria-label='Close']"
driver.find_element(By.XPATH, "//*[text()='See all features']").click()
# Wait until the close button exists so the snapshot below is taken after
# the dialog has been added to the DOM rather than right after the click.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, close_xpath))
)
features_soup = bs(driver.page_source, "html.parser")
# Scan the document's divs once and reuse the list for every label.
feature_divs = features_soup.find_all("div")
diets = _feature_value(feature_divs, "special diets")
# Replaces any earlier value of `cuisines` with the dialog's text (or None).
cuisines = _feature_value(feature_divs, "cuisines")
meal_times = _feature_value(feature_divs, "meal types")
driver.find_element(By.XPATH, close_xpath).click()

#### Reseñas

In [11]:
# Scroll the window down in ten 500-pixel steps, then parse the resulting
# page source into `reviews_soup` for the review extraction.
scroll_script = "window.scrollBy(0, 500);"
scrolls_left = 10
while scrolls_left > 0:
    driver.execute_script(scroll_script)
    scrolls_left -= 1
reviews_html = driver.page_source
reviews_soup = bs(reviews_html, "html.parser")

In [12]:
# Build `reviews` (a list of dicts) and `reviews_json` from `reviews_soup`.
# Review cards tagged with data-automation="reviewCard" are preferred; when
# none are found, the cards are located positionally under the reviews tab.
# NOTE(review): the recorded notebook output shows a username-like value in
# "rate", so the first-descendant lookup may not point at the rating element;
# confirm against the live DOM.
reviews = []

review_boxes = reviews_soup.find_all("div", {"data-automation": "reviewCard"})
if review_boxes:

    for rev in review_boxes:
        review_rate_box = rev.find().find()
        review_rate = review_rate_box.text.strip().split(" ")[0]
        review_title_box = rev.find_all("div", {"data-test-target" : "review-title"})[0]
        review_title = review_title_box.text.strip()
        # The visit type is optional; fall back to the same placeholder the
        # positional branch uses instead of failing on a missing element.
        visit_type_row = review_title_box.find_next_sibling()
        visit_type_span = visit_type_row.find("span") if visit_type_row is not None else None
        if visit_type_span is None:
            review_visit_type = "Not specified"
        else:
            review_visit_type = visit_type_span.text.strip()
        review_text_box = rev.find_all("div", {"data-test-target" : "review-body"})[0]
        review_text = review_text_box.text.strip().replace("Read more", "").strip()

        reviews.append({
            "rate": review_rate,
            "title": review_title,
            "text": review_text,
            "visit_type": review_visit_type
        })

else:
    review_boxes_container = reviews_soup.find("div", {"data-test-target" : "reviews-tab"}).find_next().find_next_siblings()[2].find_next().find_next().find_next()
    # Every review card shares the class list of the container's first element.
    review_box_class = review_boxes_container.find_next()["class"]
    review_box_class = " ".join(review_box_class)
    review_boxes = review_boxes_container.find_all("div", class_=review_box_class)

    for revibox in review_boxes:
        rev = revibox.find_next().find_next_sibling().find_all()[0]
        review_rate_box = rev.find().find()
        review_rate = review_rate_box.text.strip().split(" ")[0]
        review_title_box = review_rate_box.find_next_sibling()
        review_title = review_title_box.text.strip()
        review_text_box = review_title_box.find_next_sibling()
        review_text = review_text_box.text.strip().replace("Read more", "").strip()
        # Keep the whole list of matches: indexing it before the emptiness
        # check raised IndexError for reviews without a "Trip type" label.
        trip_type_labels = [r for r in rev.find_all() if r.text.strip() == "Trip type"]
        if not trip_type_labels:
            review_visit_type = "Not specified"
        else:
            review_visit_type = trip_type_labels[0].find_next_sibling().text.strip()

        reviews.append({
            "rate": review_rate,
            "title": review_title,
            "text": review_text,
            "visit_type": review_visit_type
        })

reviews_json = json.dumps(reviews, indent=4, ensure_ascii=False)

IndexError: list index out of range

In [13]:
# Print every scraped field, one "label value" line per field.
summary_rows = (
    ("Nombre:", name),
    ("Dirección:", address),
    ("Valoración media:", rate),
    ("Número de valoraciones:", n_ratings),
    ("Tipos de cocina:", cuisines),
    ("Precio:", price),
    ("Dietas:", diets),
    ("Momentos de comida:", meal_times),
    ("Resumen de reseñas:", reviews_summary),
    ("15 reviews más recientes:", reviews_json),
)
for row_label, row_value in summary_rows:
    print(row_label, row_value)


Nombre: Ristorante Pesto
Dirección: 1915 S Broad St, Philadelphia, PA 19148-2216
Valoración media: 4.9
Número de valoraciones: (4,318 reviews)
Tipos de cocina: Italian
Precio: $$ - $$$
Dietas: Vegetarian friendly, Vegan options, Gluten free options
Momentos de comida: Dinner
Resumen de reseñas: Ristorante Pesto is lauded for its commendable value, where guests savor authentic Italian dishes with substantial portions, all served with meticulous care. The homemade pasta and sauces, celebrated for their freshness and richness, have become a highlight for many.

Warm, familial service is a hallmark here, with personal touches that elevate the dining experience. The cozy and homey ambiance of the restaurant is often described as inviting, making diners feel as though they are enjoying a meal in the comfort of a loved one's home.
15 reviews más recientes: [
    {
        "rate": "Freedom462946690425",
        "title": "Delicious!!!",
        "text": "Just like in Tijuana! Excellent shepherd 

In [14]:
# Shut down the WebDriver session now that scraping is finished.
driver.quit()

In [None]:
# Positional review parser: locates the review cards relative to the
# reviews tab in `reviews_soup` and builds `reviews` / `reviews_json`.
review_boxes_container = reviews_soup.find("div", {"data-test-target" : "reviews-tab"}).find_next().find_next_siblings()[2].find_next().find_next().find_next()
# Every review card shares the class list of the container's first element.
review_box_class = review_boxes_container.find_next()["class"]
review_box_class = " ".join(review_box_class)
review_boxes = review_boxes_container.find_all("div", class_=review_box_class)

reviews = []

for revibox in review_boxes:
    rev = revibox.find_next().find_next_sibling().find_all()[0]
    review_rate_box = rev.find().find()
    review_rate = review_rate_box.text.strip().split(" ")[0]
    review_title_box = review_rate_box.find_next_sibling()
    review_title = review_title_box.text.strip()
    review_text_box = review_title_box.find_next_sibling()
    review_text = review_text_box.text.strip().replace("Read more", "").strip()
    # Keep the whole list of matches: indexing it before the emptiness check
    # raised IndexError for reviews without a "Trip type" label.
    trip_type_labels = [r for r in rev.find_all() if r.text.strip() == "Trip type"]
    if not trip_type_labels:
        review_visit_type = "Not specified"
    else:
        review_visit_type = trip_type_labels[0].find_next_sibling().text.strip()

    reviews.append({
        "rate": review_rate,
        "title": review_title,
        "text": review_text,
        "visit_type": review_visit_type
    })

reviews_json = json.dumps(reviews, indent=4, ensure_ascii=False)