In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def _review_page_url(start_url, page_index, page_size=10):
    """Return the URL of the zero-based ``page_index``-th review page.

    Page 0 is ``start_url`` itself; later pages insert an ``or<offset>``
    segment right after the ``-Reviews-`` marker.
    """
    if page_index == 0:
        return start_url
    # Rewrite only the first marker so a hotel slug that happens to contain
    # "-Reviews-" again is left intact.
    return start_url.replace(
        "-Reviews-", f"-Reviews-or{page_index * page_size}-", 1
    )


def _excel_file_name(hotel_name):
    """Build a filesystem-safe ``.xlsx`` file name from ``hotel_name``."""
    # An empty name (e.g. the first request failed) would otherwise yield a
    # hidden file literally called ".xlsx".
    base = hotel_name if hotel_name.strip() else "Unknown Hotel"
    base = base.replace(' ', '_')
    # Path separators and other characters that are invalid in file names on
    # common platforms are replaced rather than passed to the writer.
    base = "".join(
        "_" if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch for ch in base
    )
    return f"{base}.xlsx"


def scrape_tripadvisor_reviews(start_url, *, timeout=30, delay=2, max_pages=None):
    """Scrape all review pages of one hotel and save them to an Excel file.

    Pages are fetched one after another, starting at ``start_url``, until a
    request fails, a page contains no review containers, or ``max_pages`` is
    reached. The collected rows are written to ``<hotel name>.xlsx`` in the
    current working directory (spaces become underscores).

    NOTE(review): the CSS class names used as selectors below are hard-coded
    and look auto-generated; they presumably need updating whenever the site
    markup changes.

    Args:
        start_url: URL of the first review page. It must contain the
            ``-Reviews-`` marker for pagination to work; without it only the
            first page is scraped.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to sleep between two page requests.
        max_pages: Optional upper bound on the number of pages to fetch;
            ``None`` means no limit.

    Returns:
        None. Progress is reported with ``print`` and the result is written
        to disk.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/"
    }

    # Without the marker every "next page" URL equals start_url, which would
    # make the loop fetch the same page forever.
    can_paginate = "-Reviews-" in start_url

    current_page = 0
    all_reviews = []  # One row per review, in column order of the DataFrame
    hotel_name = ''
    hotel_location = ''

    while max_pages is None or current_page < max_pages:
        url = _review_page_url(start_url, current_page)
        print(f"Fetching page {current_page + 1}: {url}")

        # A timeout keeps a stalled connection from hanging the whole run;
        # network errors end the pagination but still save what was collected.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve the page. Error: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Hotel name and location are read once, from the first page.
        if current_page == 0:
            hotel_name_element = soup.find('h1', id='HEADING')
            hotel_name = hotel_name_element.text if hotel_name_element else "Unknown Hotel"

            hotel_location_element = soup.find('span', class_='fRIUK CdhWK _S')
            hotel_location = hotel_location_element.text.strip() if hotel_location_element else "Unknown Location"

        review_divs = soup.find_all('div', class_='kmMXA _T Gi')

        if not review_divs:
            print("No more reviews found. Stopping pagination.")
            break

        review_titles = [el.text for el in soup.find_all('span', class_='JbGkU Cj')]
        review_contents = [el.text for el in soup.find_all('span', class_='orRIx Ci _a C')]
        bylines = [
            el.text
            for el in soup.find_all('div', class_='tVWyV _Z o S4 H3 Ci')
            if 'wrote a review' in el.text
        ]

        # partition() uses the same marker as the filter above, so a byline
        # that ends right after "wrote a review" yields an empty date instead
        # of raising IndexError. Surrounding whitespace is dropped.
        reviewers = []
        review_dates = []
        for byline in bylines:
            reviewer, _, review_date = byline.partition('wrote a review')
            reviewers.append(reviewer.strip())
            review_dates.append(review_date.strip())

        # A container without a <title> element gets an empty rating rather
        # than aborting the whole scrape.
        ratings = []
        for review_div in review_divs:
            rating_element = review_div.find('title')
            ratings.append(rating_element.text[0:3] if rating_element else '')

        # The fields are collected page-wide in separate passes, so their
        # counts can differ when a review lacks one of them. zip() stops at
        # the shortest list instead of raising IndexError; the mismatch is
        # reported because rows may then be misaligned.
        fields = (reviewers, ratings, review_dates, review_titles, review_contents)
        if len({len(field) for field in fields}) > 1:
            print(
                f"Warning: field counts differ on page {current_page + 1} "
                f"({[len(field) for field in fields]}); extra items are skipped."
            )

        for reviewer, rating, review_date, title, content in zip(*fields):
            all_reviews.append([
                hotel_name,
                hotel_location,
                reviewer,
                rating,
                review_date,
                title,
                content
            ])

        if not can_paginate:
            print("URL has no '-Reviews-' marker. Stopping after the first page.")
            break

        current_page += 1

        # Delay to prevent overwhelming the server
        time.sleep(delay)

    df = pd.DataFrame(all_reviews, columns=[
        'Hotel Name', 'Hotel Location', 'Reviewer', 'Rating', 'Date of Review', 'Review Title', 'Complete Review'
    ])

    excel_file_name = _excel_file_name(hotel_name)
    df.to_excel(excel_file_name, index=False)
    print(f"Reviews saved to {excel_file_name}")

# Hotel review pages to process. Each entry is the first review page of one
# hotel; append further URLs of the same form as needed.
urls = [
    "https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html",
    "https://www.tripadvisor.in/Hotel_Review-g298561-d302394-Reviews-RIHGA_Royal_Hotel_Hiroshima-Hiroshima_Hiroshima_Prefecture_Chugoku.html",
    "https://www.tripadvisor.in/Hotel_Review-g298564-d1071189-Reviews-Hotel_Granvia_Kyoto-Kyoto_Kyoto_Prefecture_Kinki.html",
    "https://www.tripadvisor.in/Hotel_Review-g1066457-d658576-Reviews-Sotetsu_Fresa_Inn_Higashi_Shinjuku-Shinjuku_Tokyo_Tokyo_Prefecture_Kanto.html",
]

# Scrape the hotels one after another, in list order, and report after each
# one has been processed.
for url in urls:
    scrape_tripadvisor_reviews(url)
    print(f"Finished scraping reviews for {url}. Moving on to the next URL...")

Fetching page 1: https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html
Fetching page 2: https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-or10-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html
Fetching page 3: https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-or20-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html
Fetching page 4: https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-or30-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html
Fetching page 5: https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-or40-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html
Fetching page 6: https://www.tripadvisor.in/Hotel_Review-g298564-d10782735-Reviews-or50-Daiwa_Roynet_Hotel_Kyoto_Ekimae_PREMIER-Kyoto_Kyoto_Prefecture_Kinki.html
Fetching page 7: https://www.trip