In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time

# Function to extract star rating from the review_div
def extract_star_rating_from_review(review_div):
    """
    Count the filled stars displayed for a single review.

    Looks up the first 'div' with class 'rating' inside review_div, takes the first
    'span' that is a direct child of it, and counts the 'i' elements matched by
    class_='icon-rating rated-star' inside that span.
    Returns the count as an integer, or 0 when the rating div or the span is missing.
    """
    rating_container = review_div.find('div', class_='rating')
    if not rating_container:
        return 0

    # Only a span that is an immediate child of the rating div is considered.
    stars_holder = rating_container.find('span', recursive=False)
    if not stars_holder:
        return 0

    return len(stars_holder.find_all('i', class_='icon-rating rated-star'))

# Function to extract review text from the review_div
def extract_review_text(review_div):
    """
    Pull the body text of a review out of review_div.

    The text is read from the 'p' element selected by "div.more.reviewdata > p",
    with surrounding whitespace stripped.
    Returns the text as a string, or "No review text available." when no such
    element is found.
    """
    paragraph = review_div.select_one("div.more.reviewdata > p")
    return paragraph.get_text(strip=True) if paragraph else "No review text available."

# Function to extract date and time from the review_div
def extract_review_datetime(review_div):
    """
    Read the posting date and time of a review from review_div.

    The value comes from the first 'span' whose id matches
    'rptreviews_ctl<digits>_lblDateTime' (for example 'rptreviews_ctl00_lblDateTime'),
    with surrounding whitespace stripped.
    Returns the text as a string, or "No date and time available." when no such
    span is found.
    """
    id_pattern = re.compile(r'^rptreviews_ctl\d+_lblDateTime$')
    stamp_span = review_div.find('span', id=id_pattern)

    if not stamp_span:
        return "No date and time available."

    return stamp_span.get_text(strip=True)

# Function to extract the number of likes from the review_div
def extract_review_likes(review_div):
    """
    This function takes in the review_div and extracts the number of likes from the link
    described by the selector '#rptreviews_ctl00_divlike > a' (the numeric part of the id
    varies per review).

    The id is looked up on any tag rather than only on 'a' elements: per the selector above
    the id sits on a wrapper element and the link is inside it, so searching for an 'a'
    carrying the id never matched and the function always returned 0.
    NOTE(review): the wrapper/link layout is taken from the selector in the original
    docstring - confirm against the live page markup.

    Returns the first run of digits in the link text as an integer, or 0 if the element,
    the link, or a number is not available.
    """
    id_pattern = re.compile(r'^rptreviews_ctl\d+_divlike$')
    like_container = review_div.find(id=id_pattern)
    if like_container is None:
        return 0

    # Still accept the case where the id is on the link itself.
    if like_container.name == 'a':
        likes_element = like_container
    else:
        likes_element = like_container.find('a')
    if likes_element is None:
        return 0

    # Search once and reuse the match instead of running the regex twice.
    digits = re.search(r'\d+', likes_element.get_text(strip=True))
    return int(digits.group()) if digits else 0

# Function to extract reviews from multiple pages
def extract(driver, max_reviews=9999999):
    """
    Print every review on the page currently loaded in `driver`, then keep following the
    pager's 'Next' link and printing reviews until a stop condition is hit.

    driver: a Selenium WebDriver that has already loaded a review listing page.
    max_reviews: stop once at least this many reviews have been printed. The limit is
        checked once per page, so the final total can exceed it by up to a page.
        At least one page is always processed.

    For each review the star rating, text and date/time are printed, followed by a
    separator line. Nothing is returned.

    The loop ends when the limit is reached, when the 'Next' link cannot be found within
    10 seconds, or when any other exception is raised while processing a page (the error
    is printed and the function returns normally).
    """
    # Review containers carry ids such as 'rptreviews_ctl00_lireviewdetails'.
    review_id_pattern = re.compile(r'^rptreviews_ctl\d+_lireviewdetails$')
    review_count = 0

    while True:
        try:
            # Parse a snapshot of whatever the browser currently shows (no wait happens here).
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            #----------------------------Extract----------------------------
            review_divs = soup.find_all('div', id=review_id_pattern)

            for review_div in review_divs:
                review_text = extract_review_text(review_div)
                star_rating = extract_star_rating_from_review(review_div)
                review_datetime = extract_review_datetime(review_div)

                # Extract the number of likes from the review_div
                #TODO: Not working
                # review_likes = extract_review_likes(review_div)

                print(f"Star Rating: {star_rating}")
                print(f"Review: {review_text}")
                print(f"Date and Time: {review_datetime}")
                # print(f"Likes: {review_likes}")
                print('-' * 80)

            review_count += len(review_divs)

            #----------------------------Exit Conditions----------------------------
            # Check the limit before looking for the 'Next' link, so reaching the limit on
            # the last page does not cost a 10 s timeout and a misleading error message.
            if review_count >= max_reviews:
                print("Desired number of reviews loaded. Exiting.")
                break

            # Raises (and so ends the loop via the handler below) when there is no 'Next' link.
            next_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#litPages > ul > li.next > a"))
            )

            #----------------------------Go to next page----------------------------
            ActionChains(driver).move_to_element(next_button).click(next_button).perform()
            print("Clicked on the 'Next' button.")

            # Fixed pause to let the next page of reviews load; adjust if necessary.
            time.sleep(2)

        except Exception as e:
            # Best-effort scraping: report the problem and stop instead of propagating.
            print("No more 'Next' button found or an error occurred. Exiting the function.")
            print(f"Error: {e}")
            break

        # Only reached after a successful move to the next page.
        print(f"Total reviews loaded: {review_count}")

# Script entry: start a Chrome session, open the review listing and print all reviews.
# `driver` and `url` are left as module-level names, so they remain available to later cells.
driver = webdriver.Chrome()  # Ensure ChromeDriver is in your PATH

# Load the first page of reviews; extract() pages through the rest from here
url = 'https://www.mouthshut.com/bikes/bajaj-byk-reviews-925040620'
driver.get(url)
extract(driver)

# driver.quit() is deliberately not called, so the browser window is not closed by this script
print("Finished navigating through the reviews and extracting them. The browser will remain open.")


Star Rating: 1
Review: This bike “Byk” was designed for comman man keeping in mind about the mileage, but they forgot about all the other major things which make this bike ineligible in aspects of appearance as well as specifications. even though it has a 100 cubic capacity engine ( 92.2 to be exact) this byk does not even run uproads. while you are with the pillion rider the engine sounds as if you have a bunch of people seated together and the engine is struggling to make the bike move.
Date and Time: Dec 21, 2017 05:30 AM
Likes: 0
--------------------------------------------------------------------------------
Star Rating: 5
Review: Think if you get a bad quality after paying a lots of money to buy any byk and after buying a big part of your earning is going  for the maintenance of that byk and with the poor average quality your days will gonna be stressful . so here is the solution for your query and that is bajaj byk with best average and reliable as far as you go with it . you ca