# In [None]:
import os
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
import time
import re

# Function to clean and collect reviews from the HTML
def clean_reviews(html_text):
    """Extract review records from a parsed page's JSON-LD metadata.

    The reviews are read from the *second* ``<script type="application/ld+json">``
    tag of the document, under its ``review`` key (falling back to ``reviews``).

    Args:
        html_text: Parsed document exposing ``find_all`` (the caller passes a
            ``BeautifulSoup`` object, despite the parameter name).

    Returns:
        A list of dicts with the keys ``author``, ``review`` and ``rating``,
        one per well-formed review entry. The list is empty when the page has
        fewer than two JSON-LD script tags, when the second tag is empty or
        not valid JSON, or when it does not hold a usable review collection.
        Malformed individual entries are reported on stdout and skipped.
    """
    script_tags = html_text.find_all('script', type='application/ld+json')
    if len(script_tags) < 2:
        return []

    reviews_json = script_tags[1].string
    if not reviews_json:
        return []

    try:
        reviews_data = json.loads(reviews_json)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        return []

    # Valid JSON is not necessarily an object; a top-level list or scalar
    # has no ``.get`` and carries no review key to look up.
    if not isinstance(reviews_data, dict):
        return []

    reviews = reviews_data.get('review', reviews_data.get('reviews', []))
    if isinstance(reviews, dict):
        # A lone review object: wrap it so the loop below does not iterate
        # over its keys.
        reviews = [reviews]
    elif not isinstance(reviews, list):
        return []

    data = []
    for review in reviews:
        try:
            author = review['author']
            if isinstance(author, dict):
                author = author['name']
            data.append({
                'author': author,
                'review': review['description'],
                'rating': review['reviewRating']['ratingValue']
            })
        except KeyError as e:
            print(f"Key error: {e}")
        except TypeError as e:
            # Entry (or its reviewRating) is not a mapping, e.g. a bare string.
            print(f"Malformed review skipped: {e}")
    return data

# Function to save DataFrame to CSV
def save_df(file_name, df):
    """Write *df* to ``Reviews/<file_name>.csv`` without the index column.

    Args:
        file_name: Base name of the CSV file (no extension). The characters
            ``\\ / * ? : " < > |`` are removed before use.
        df: Object exposing ``to_csv`` (a pandas DataFrame in this file).

    The ``Reviews`` directory is created relative to the current working
    directory when it does not exist yet. If nothing usable remains of
    *file_name* after sanitizing, ``Restaurant_Reviews`` is used so the output
    is not written to the hidden file ``Reviews/.csv``.
    """
    # Sanitize file_name by removing invalid characters
    safe_name = re.sub(r'[\\/*?:"<>|]', "", file_name)
    if not safe_name.strip():
        safe_name = "Restaurant_Reviews"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("Reviews", exist_ok=True)
    df.to_csv(f"Reviews/{safe_name}.csv", index=False)

# Function to scrape reviews using requests
def get_reviews(url, max_reviews, sort='popular'):
    """Download paginated reviews for one restaurant and save them as CSV.

    Pages are requested as ``<url>?page=N&sort=..`` starting at page 1.
    Pagination stops when ``max_reviews`` reviews have been collected, when a
    page yields no reviews, when a page yields nothing that was not already
    seen, or when a page still cannot be fetched after three attempts.

    Args:
        url: Address of the restaurant's reviews page.
        max_reviews: Upper bound on the number of reviews returned and saved.
        sort: ``'popular'`` sends ``sort=rd``; any other value sends
            ``sort=dd``.

    Returns:
        A DataFrame with the columns ``author``, ``review`` and ``rating``
        holding at most ``max_reviews`` rows. The same frame is written via
        ``save_df`` under the restaurant name taken from the page title (or
        ``Restaurant_Reviews`` when no title was found). If the very first
        page cannot be fetched, an empty frame is returned and nothing is
        written.
    """
    columns = ['author', 'review', 'rating']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/83.0.4103.97 Safari/537.36'
    }

    sort_option = '&sort=rd' if sort == 'popular' else '&sort=dd'
    # Extend an existing query string instead of starting a second one.
    separator = '&' if '?' in url else '?'

    reviews = []
    seen_reviews = set()
    page_number = 1
    restaurant_name = ""

    retries = 3  # Number of retries
    timeout = 10  # Timeout in seconds

    while len(reviews) < max_reviews:
        page_url = f"{url}{separator}page={page_number}{sort_option}"

        response = None
        for attempt in range(retries):
            try:
                response = requests.get(page_url, headers=headers, timeout=timeout)
                response.raise_for_status()
                break  # If the request was successful, break out of the retry loop
            except requests.RequestException as e:
                response = None  # discard a response that carried an error status
                print(f"Request failed (attempt {attempt+1} of {retries}): {e}")
                if attempt < retries - 1:
                    time.sleep(5)  # Wait for 5 seconds before retrying

        if response is None:
            if not reviews:
                return pd.DataFrame(columns=columns)
            # Keep what earlier pages delivered: fall through so the partial
            # result is saved instead of being returned unsaved.
            break

        html_text = BeautifulSoup(response.text, 'html.parser')
        if not restaurant_name:
            # A document without <head> has html_text.head == None.
            head = html_text.head
            title_tag = head.find('title') if head is not None else None
            if title_tag:
                restaurant_name = title_tag.text.split("|")[0].strip()

        data = clean_reviews(html_text)
        if not data:
            break

        # Filter out duplicate reviews. The key includes the author so that
        # identical short texts from different people are kept; str() keeps
        # the key hashable whatever JSON type the fields had.
        unique_data = []
        for review in data:
            key = (str(review['author']), str(review['review']))
            if key in seen_reviews:
                continue
            seen_reviews.add(key)
            unique_data.append(review)
        if not unique_data:
            break  # Stop if no new unique reviews are found

        reviews.extend(unique_data)
        page_number += 1

    if not restaurant_name:
        restaurant_name = "Restaurant_Reviews"

    # Passing the columns keeps the header present even when nothing was found.
    review_df = pd.DataFrame(reviews[:max_reviews], columns=columns)

    save_df(restaurant_name, review_df)

    return review_df

# Main scraping function using Selenium
def scrape_zomato_reviews(url, output_file="restaurant_data_with_reviews_hyderabad.csv",
                          max_reviews=500):
    """Scrape a restaurant listing page and the reviews of each restaurant.

    The listing at *url* is opened in Chrome through Selenium and scrolled
    one screen height at a time until the bottom is reached. Every
    ``div.jumbo-tracker`` that contains an ``<h4>`` is treated as one
    restaurant: its name and rating text are recorded, its page is opened,
    and if that page has a link labelled ``Reviews`` the reviews are
    downloaded with ``get_reviews`` (which also writes one CSV per
    restaurant).

    Args:
        url: Address of the listing page.
        output_file: Path of the summary CSV with the columns ``names``,
            ``ratings`` and ``reviews``.
        max_reviews: Maximum number of reviews requested per restaurant.

    Returns:
        None. The summary is written to *output_file*.
    """
    # Initialize lists to store data
    all_rest_name = []
    all_ratings = []
    all_reviews = []
    base = "https://www.zomato.com"

    # Set up Selenium WebDriver
    driver = webdriver.Chrome()
    try:
        # Navigate to the restaurant link
        driver.get(url)
        time.sleep(2)

        scroll_pause_time = 1.8
        screen_height = driver.execute_script("return window.screen.height;")
        i = 1

        # Scroll step by step so content that loads on scroll gets rendered.
        while True:
            driver.execute_script("window.scrollTo(0, {0});".format(screen_height * i))
            i += 1
            time.sleep(scroll_pause_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            if screen_height * i > scroll_height:
                break

        # Create a soup object
        soup = BeautifulSoup(driver.page_source, "html.parser")
        divs = soup.find_all('div', class_='jumbo-tracker')

        # Loop through restaurant divs and extract data
        for parent in divs:
            name_tag = parent.find("h4")
            if name_tag is None:  # Not a restaurant card
                continue

            link_tag = parent.find("a")
            href = link_tag.get('href') if link_tag is not None else None

            try:
                rating = parent.div.a.next_sibling.div.div.div.div.div.div.div.text
            except AttributeError:
                # The card does not have the expected nesting; keep the
                # restaurant and leave its rating empty.
                rating = None

            reviews_df = None
            if href:
                # Navigate to the restaurant page
                driver.get(urljoin(base, href))
                time.sleep(2)

                # Create a soup object for the restaurant page
                rest_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Find the reviews link and scrape the reviews using requests
                review_link_tag = rest_soup.find('a', string='Reviews')
                review_href = review_link_tag.get('href') if review_link_tag else None
                if review_href:
                    reviews_df = get_reviews(urljoin(base, review_href), max_reviews, sort='new')

            # Append to all three lists together: the DataFrame below needs
            # columns of equal length, so a restaurant without reviews gets
            # a None placeholder.
            all_rest_name.append(name_tag.text)
            all_ratings.append(rating)
            all_reviews.append(reviews_df)
    finally:
        # quit() ends the whole browser session even when scraping failed;
        # close() would only close the current window.
        driver.quit()

    # Create a DataFrame
    # NOTE(review): the 'reviews' cells hold DataFrame objects, so the CSV
    # contains their text representation; the per-restaurant files written by
    # get_reviews hold the full data.
    df = pd.DataFrame({
        'names': all_rest_name,
        'ratings': all_ratings,
        'reviews': all_reviews
    })

    # Save restaurant data to a CSV file
    df.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Script entry point: scrape the Somajiguda (Hyderabad) restaurant listing.
    listing_url = "https://www.zomato.com/hyderabad/somajiguda-restaurants"
    scrape_zomato_reviews(listing_url)
