### Libraries used:

- Selenium
- BeautifulSoup
- Pandas
- Time
- Sys
- Random

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import sys
import random
from selenium.webdriver.chrome.options import Options

Code to set up the driver

In [3]:
# Landing page opened as soon as the browser starts.
url = 'https://www.tripadvisor.in'

# webdriver_manager's install() result is handed to the Chrome Service
# (the captured cell output shows it downloading and caching a chromedriver).
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)

# `driver` is the module-level browser session used by every function below.
driver = webdriver.Chrome(service=service)
driver.get(url)



Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/100.0.4896.60/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\siddh\.wdm\drivers\chromedriver\win32\100.0.4896.60]


### Functions

In [4]:
def start_scraping():
    """Run one interactive scraping session from the console.

    Asks how many search results to list, lets ``search_hotel`` perform the
    search, prints the returned menu, asks which entry to scrape and hands
    over to ``scrape_hotel``. Finishes by printing the CSV file name.
    """
    n_to_show = int(input("How many hotels to show? (max : 7)"))
    menu_text, hotel_name = search_hotel(n_to_show)

    # Flushed explicitly, presumably so the menu is on screen before the
    # next input() prompt shows up.
    print(menu_text)
    sys.stdout.flush()

    # Handle of the first browser window, read while the results are shown.
    results_window = driver.window_handles[0]
    chosen = int(input("Choose a hotel to scrape reviews of: "))
    scrape_hotel(hotel_name, chosen, results_window)

    saved_note = f"saved as {hotel_name}" + ".csv"
    print(saved_note)

In [5]:
def search_hotel(other_count):
    """Type a user-supplied hotel name into the site search and list the hits.

    Args:
        other_count: Number of results to include in the listing; passed
            unchanged to ``get_search_details``.

    Returns:
        A ``(option_string, hotel_name)`` tuple: the formatted listing built
        by ``get_search_details`` and the name the user typed.
    """
    search_box_locator = (By.CLASS_NAME, 'bmTdH')
    results_locator = (By.CLASS_NAME, 'result-title')

    # Do not prompt until at least one search-box element is in the DOM.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(search_box_locator)
    )
    hotel_name = input("Enter a hotel name")

    # The second element with this class is the one that gets clicked and
    # that contains the <input name="q"> field.
    candidates = driver.find_elements(*search_box_locator)
    active_box = candidates[1]
    active_box.click()

    query_field = active_box.find_element(By.NAME, 'q')
    query_field.send_keys(hotel_name)
    query_field.send_keys(Keys.ENTER)

    # Wait for the result titles before the page source is parsed.
    WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located(results_locator)
    )

    return get_search_details(other_count), hotel_name

In [6]:
def next_page(home_page, soup, rand_delay):
    """Move the browser to the next page of reviews when one exists.

    Args:
        home_page: Window handle supplied by the caller; not used by this
            function.
        soup: Parsed HTML of the page currently displayed; it is searched
            for an ``<a>`` tag carrying the class ``next``.
        rand_delay: Number of seconds to sleep after clicking the link.

    Returns:
        1 once the next page has been requested and its review elements are
        visible, 0 when no "next" link was found. In the latter case the
        current browser window is closed.
    """
    has_next_link = bool(soup.find_all('a', {"class": 'next'}))

    if not has_next_link:
        # Last page: close the current window. Window focus is not moved
        # to another window here.
        driver.close()
        return 0

    clickable_next = WebDriverWait(driver, 15).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'next'))
    )
    clickable_next.click()

    # Caller-chosen pause between page loads.
    time.sleep(rand_delay)

    # Block until the review text elements of the new page are visible.
    WebDriverWait(driver, 30).until(
        EC.visibility_of_all_elements_located((By.CLASS_NAME, 'XllAv'))
    )
    return 1

In [7]:
def get_hotel_names(soup):
    """Collect the hotel names shown on a search-results page.

    Args:
        soup: Parsed page; every ``<div class="result-title">`` is read.

    Returns:
        List with the stripped text of each title's ``<span>``, in page order.
    """
    return [
        title_div.span.text.strip()
        for title_div in soup.find_all('div', {"class": "result-title"})
    ]

In [8]:
def get_review_titles(soup):
    """Collect the review headlines found on a hotel page.

    Args:
        soup: Parsed page; ``<div>`` tags with the class string
            ``fpMxB MC _S b S6 H5 _a`` are read.

    Returns:
        List with the stripped text of ``div > a > span`` for each match.
    """
    title_class = 'fpMxB MC _S b S6 H5 _a'
    containers = soup.find_all('div', {"class": title_class})
    return [box.a.span.text.strip() for box in containers]

In [9]:
def get_full_reviews(soup):
    """Collect the review body texts found on a hotel page.

    Args:
        soup: Parsed page; ``<q>`` tags with the class string
            ``XllAv H4 _a`` are read.

    Returns:
        List with the stripped text of each match's ``<span>``.
    """
    quote_tags = soup.find_all('q', {"class": "XllAv H4 _a"})
    bodies = map(lambda quote: quote.span.text.strip(), quote_tags)
    return list(bodies)

In [10]:
def get_review_stars(soup):
    """Extract the star rating of every review on a hotel page.

    The rating is read from the second CSS class of the ``<span>`` inside
    each ``<div class="emWez">``: the text after the underscore is converted
    to an integer and divided by 10.

    The previous version re-parsed ``driver.page_source`` into ``soup`` after
    the tags had already been extracted. That result was never used, cost a
    full page parse per call, and made this helper depend on the global
    browser session. It has been removed, so the function now works on any
    parsed page passed in.

    Args:
        soup: Parsed page to search.

    Returns:
        List of floats, one per matching ``<div>``, in page order.
    """
    stars = []
    for tag in soup.find_all('div', {"class": "emWez"}):
        # Second class token has the form "<prefix>_<digits>".
        rating_token = tag.span['class'][1]
        stars.append(int(rating_token.split('_')[1]) / 10)

    return stars

In [11]:
def get_review_dates(soup):
    """Collect the date strings attached to the reviews on a hotel page.

    Args:
        soup: Parsed page; every ``<span class="euPKI">`` is read.

    Returns:
        List with the stripped text of each match, in page order.
    """
    return [
        date_span.text.strip()
        for date_span in soup.find_all('span', {"class": "euPKI"})
    ]

In [12]:
def get_hotel_addresses(soup):
    """Collect the address line of every hit on a search-results page.

    Args:
        soup: Parsed page; every ``<div class="address-text">`` is read.

    Returns:
        List with the stripped text of each match, in page order.
    """
    address_divs = soup.find_all('div', {"class": "address-text"})
    return [div.text.strip() for div in address_divs]

In [13]:
def get_search_details(other_count):
    """Build the numbered hotel listing shown to the user after a search.

    The page currently loaded in the browser is parsed and the names,
    addresses, ratings and review counts are scraped into four separate
    lists. The listing never contains more entries than the shortest of
    those lists. Previously a request for more entries than the page
    returned, or lists of unequal length, raised ``IndexError``.

    Args:
        other_count: Maximum number of hotels to include in the listing.

    Returns:
        One string with a block per hotel (index, name, address, rating,
        number of reviews), each followed by a dashed separator. Empty when
        nothing can be listed.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    titles = get_hotel_names(soup)
    addresses = get_hotel_addresses(soup)
    ratings = get_ratings(soup)
    n_reviews = get_n_reviews(soup)

    # Only positions present in all four lists can be printed safely.
    available = min(len(titles), len(addresses), len(ratings), len(n_reviews))
    shown = min(other_count, available)

    entries = []
    for i in range(shown):
        entries.append(f"""
            {i+1} Hotel name:  {titles[i]}
                  Address: {addresses[i]}
                  Rating: {ratings[i]}
                  Number of reviews: {n_reviews[i]}
            ----------------------------------------------------------------------""")

    return "".join(entries)

In [14]:
def get_ratings(soup):
    """Collect the overall rating of every hit on a search-results page.

    Each ``<span class="ui_bubble_rating">`` contributes its ``alt`` text,
    cut at the first letter ``b``, stripped, and with ``" of "`` replaced by
    ``"/"``.

    Args:
        soup: Parsed page to search.

    Returns:
        List of rating strings, in page order.
    """
    def _as_fraction(alt_text):
        # Keep only what precedes the first "b", then compact " of " to "/".
        leading_part = alt_text.split('b')[0]
        return leading_part.strip().replace(' of ', '/')

    bubble_spans = soup.find_all('span', {"class": "ui_bubble_rating"})
    return [_as_fraction(span['alt']) for span in bubble_spans]

In [15]:
def get_n_reviews(soup):
    """Collect the review-count label of every hit on a search-results page.

    Args:
        soup: Parsed page; every ``<a class="review_count">`` is read.

    Returns:
        List with the stripped text of each match, in page order.
    """
    count_links = soup.find_all('a', {"class": "review_count"})
    return list(map(lambda link: link.text.strip(), count_links))

Main function that drives the review scraping

In [16]:
def scrape_hotel(hotel_name, option, hotel_page):
    """Scrape all review pages of the chosen hotel and save them as a CSV.

    Clicks the review-count link of the selected search result, switches to
    the second browser window, then repeatedly parses the page and asks
    ``next_page`` to advance until it reports that there is no further page.
    The collected data is written to ``<hotel_name>.csv``.

    Changes from the previous version:

    * The window handle passed by the caller used to be overwritten and was
      never used, so the browser stayed focused on the window that
      ``next_page`` had just closed. The handle is now kept and focus is
      returned to it after the CSV has been written.
    * ``option`` values outside ``1..number of links`` now raise
      ``IndexError``. Previously zero or negative values silently selected
      a hotel from the end of the list through negative indexing.

    Args:
        hotel_name: Search text typed by the user; used as the CSV base name.
        option: 1-based position of the hotel among the search results.
        hotel_page: Window handle supplied by the caller. ``start_scraping``
            passes the handle of the first (search-results) window, so it is
            used as the window to return to when scraping is done.

    Raises:
        IndexError: If ``option`` does not refer to an available result.
    """
    # Remember the caller's window so focus can be restored at the end.
    return_handle = hotel_page

    link_tags = driver.find_elements(By.CLASS_NAME, 'review_count')
    if not 1 <= option <= len(link_tags):
        raise IndexError(
            f"option must be between 1 and {len(link_tags)}, got {option}"
        )
    link_tags[option - 1].click()

    # Fixed pause before the second window handle is looked up.
    time.sleep(5)

    review_tab = driver.window_handles[1]
    driver.switch_to.window(review_tab)

    review_titles = []
    full_reviews = []
    review_stars = []
    dates = []

    i = 0
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        review_titles += get_review_titles(soup)
        full_reviews += get_full_reviews(soup)
        review_stars += get_review_stars(soup)
        dates += get_review_dates(soup)

        # A new delay (2-5 s) is drawn on the first page and then every
        # 20 pages; pages in between reuse the last value.
        if i % 20 == 0:
            rand_delay = random.randint(2, 5)

        # next_page() returns 0 after closing the tab on the last page.
        if next_page(review_tab, soup, rand_delay) == 0:
            break

        i += 1

    review_dict = {
        "review_title": review_titles,
        "stars": review_stars,
        "date": dates,
        "full_review": full_reviews,
    }

    df = pd.DataFrame(review_dict)
    df.to_csv(hotel_name + '.csv', index=False)

    # The review tab is closed by now; hand focus back to the caller's
    # window (if it still exists) so the session remains usable.
    if return_handle in driver.window_handles:
        driver.switch_to.window(return_handle)

In [17]:
# Launch the interactive session: prompts for the number of results, the
# hotel name and the hotel to scrape, then writes "<hotel name>.csv".
start_scraping()

How many hotels to show? (max : 7)5
Enter a hotel nameTreebo

            1 Hotel name:  Treebo Trend Daksh Residency
                  Address: Indore, Indore District, Madhya Pradesh, India
                  Rating: 4.5/5
                  Number of reviews: 468 reviews
            ----------------------------------------------------------------------
            2 Hotel name:  Treebo Trend Shivani
                  Address: 83/2 Biyabani, Dhar Road, Indore, Indore District, Madhya Pradesh, India
                  Rating: 3.5/5
                  Number of reviews: 74 reviews
            ----------------------------------------------------------------------
            3 Hotel name:  Treebo Trend Cherry Tree
                  Address: 2, Manoramaganj, A.B. Road, Indore, Indore District, Madhya Pradesh, India
                  Rating: 4/5
                  Number of reviews: 71 reviews
            ----------------------------------------------------------------------
            4 Hote