In [1]:
# Stop clicking "Load More Reviews" once the advertised total number of reviews has been loaded
import time
from bs4 import BeautifulSoup
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
import csv
import re

# Apply nest_asyncio so the asyncio event loop can be re-entered while one is
# already running, which is the situation inside a Jupyter kernel
nest_asyncio.apply()

async def get_total_review_count(page):
    """Return the total review count advertised on the page, or None.

    The rendered HTML of ``page`` is parsed with BeautifulSoup, the
    rating-summary element is located via a fixed CSS selector, and the
    number is pulled out of its "Based on X reviews" text.

    Args:
        page: Object exposing an awaitable ``content()`` that returns the
            page HTML as a string (a Playwright page in this file).

    Returns:
        The review count as an ``int``, or ``None`` when the element is
        missing, its text does not contain a count, or any error occurs
        (a diagnostic message is printed in each of those cases).
    """
    try:
        # Get the page content and parse it with BeautifulSoup
        page_source = await page.content()
        soup = BeautifulSoup(page_source, 'html.parser')

        # Use the provided CSS selector to extract the desired element
        element = soup.select_one('body > main > div > div > div.pull-left.bodyLeft > div.ur-mct.rcat > div.col-lg-4.col-md-4.col-sm-4.col-xs-12 > div > div > div.clr-bl.ur-rc > div.fnt-12.clr.clr-sry.pull-left')
        if element is None:
            print("Element not found.")
            return None

        text = element.get_text(strip=True)

        # get_text(strip=True) joins the stripped text pieces without a
        # separator, so whitespace around the number may be gone when the
        # number sits in its own tag; hence \s* instead of literal spaces.
        # Also accept the singular "review" and thousands separators.
        match = re.search(r'Based on\s*(\d[\d,]*)\s*reviews?', text, re.IGNORECASE)
        if match is None:
            print("Number of reviews not found in the text.")
            return None

        # Drop thousands separators before converting ("1,234" -> 1234)
        return int(match.group(1).replace(',', ''))
    except Exception as e:
        print(f"An error occurred while fetching total reviews: {e}")
        return None

# Modified function to click the 'Load More Reviews' button until all reviews are loaded
async def click_load_more_review_button(page, max_stalled_clicks=5):
    """Click the 'Load More Reviews' button until all reviews are displayed.

    The advertised total is read once via ``get_total_review_count``. The
    function then repeatedly counts the review blocks inside
    ``#userReviews8`` and clicks ``#loadMore8`` until one of these holds:

    * the displayed count reaches the advertised total,
    * the button is absent or not visible,
    * the displayed count has not grown for ``max_stalled_clicks``
      consecutive iterations (guards against looping forever when the
      advertised total can never be reached),
    * an exception is raised by the page (it is printed, not re-raised).

    Args:
        page: Playwright page that already has the reviews URL loaded.
        max_stalled_clicks: Number of consecutive iterations without any
            growth in the displayed review count after which to give up.

    Returns:
        None.
    """
    # Get the total number of reviews from the specified element
    total_reviews = await get_total_review_count(page)
    if total_reviews is None:
        print("Could not fetch the total number of reviews. Exiting.")
        return

    print(f"Total reviews to load: {total_reviews}")

    previous_count = -1  # below any real count, so the first pass counts as progress
    stalled_clicks = 0

    while True:
        try:
            # Get the current number of reviews displayed inside #userReviews8
            review_blocks = await page.query_selector_all('#userReviews8 [id^="overflow_hidden_"]')
            current_reviews_count = len(review_blocks)
            print(f"Current reviews displayed: {current_reviews_count}")

            # Stop if the current number of reviews equals or exceeds the total reviews
            if current_reviews_count >= total_reviews:
                print("All reviews are loaded. Exiting the function.")
                break

            # Track progress so a button that no longer loads anything
            # cannot keep this loop spinning indefinitely
            if current_reviews_count > previous_count:
                stalled_clicks = 0
            else:
                stalled_clicks += 1
                if stalled_clicks >= max_stalled_clicks:
                    print("Review count stopped increasing. Exiting the function.")
                    break
            previous_count = current_reviews_count

            # The button must both exist and be visible; clicking a hidden
            # element would only wait for Playwright's timeout and then fail
            load_more_button = await page.query_selector('#loadMore8')
            if load_more_button is None or not await load_more_button.is_visible():
                print("No more 'Load More Reviews' button found. Exiting the function.")
                break

            # Scroll to the button and click it
            await load_more_button.scroll_into_view_if_needed()
            await load_more_button.click()
            print("Clicked on the 'Load More Reviews' button.")

            # Wait for the content to load after clicking the button
            await asyncio.sleep(0.5)
        except Exception as e:
            print(f"An error occurred: {e}")
            break

async def get_reviews(page):
    """Extract every review currently rendered on the page.

    The page HTML is parsed with BeautifulSoup and each review block inside
    ``#userReviews8`` is turned into a dictionary with the keys
    ``Top_ZW_Voice``, ``Badge``, ``Title``, ``Review_Text``, ``Likes`` and
    ``Star_Rating``. A value is the stripped text of the first matching
    element, or ``None`` when no selector for that field matches.

    Args:
        page: Object exposing an awaitable ``content()`` returning the HTML.

    Returns:
        A list of dictionaries, one per review block, in document order.
    """
    html = await page.content()
    document = BeautifulSoup(html, 'html.parser')

    # Each field lists its candidate selectors in priority order; the page
    # uses more than one layout for some of them.
    field_selectors = (
        ('Top_ZW_Voice', (
            'div > div.col-sm-2.nc-ndc.remove-clr > div.ndc-mr > span.fnt-12.clr-sry',
            'div > div.col-sm-2.nc-ndc.remove-clr > div.ndc-mr.pt-10 > span.fnt-12.clr-sry',
        )),
        ('Badge', (
            'div > div.col-sm-2.nc-ndc.remove-clr > div.ndc-mr > div > span > span',
        )),
        ('Title', (
            'div > div.col-sm-10.col-xs-12 > div > div.f-rv-des.mb-10.clr-bl > div.row.clr > div.col-sm-10.col-xs-10 > p',
            'div > div.col-sm-10.col-xs-12 > div > div.row.clr > div.col-sm-10.col-xs-10 > p',
        )),
        ('Review_Text', (
            'div > div.col-sm-10.col-xs-12 > div > div.f-rv-des.mb-10.clr-bl > div.read-more.ht-4lines.rm > p',
            'div > div.col-sm-10.col-xs-12 > div > div.read-more > p',
        )),
        ('Likes', (
            '[id^="review_"]',
        )),
        ('Star_Rating', (
            'div > div.col-sm-10.col-xs-12 > div > div.f-rv-des.mb-10.clr-bl > div.row.clr > div.col-sm-2.col-xs-2.text-right.pl-0.pr-0 > span > span',
        )),
    )

    def first_match(node, selectors):
        # Return the first element found by any selector, else None
        for css in selectors:
            hit = node.select_one(css)
            if hit:
                return hit
        return None

    def text_or_none(node):
        # Stripped text of the element, or None when nothing was found
        return node.get_text(strip=True) if node else None

    return [
        {
            field: text_or_none(first_match(block, selectors))
            for field, selectors in field_selectors
        }
        for block in document.select('#userReviews8 [id^="overflow_hidden_"]')
    ]

# Function to save reviews to a CSV file
def save_reviews_to_csv(reviews, file_name):
    """Append review dictionaries to a CSV file.

    The header row is written only when the target file does not exist yet
    or is empty; otherwise the rows are appended below the existing content.

    Args:
        reviews: Iterable of dictionaries keyed by the column names
            ``Top_ZW_Voice``, ``Badge``, ``Title``, ``Review_Text``,
            ``Likes`` and ``Star_Rating``.
        file_name: Path of the CSV file to append to.
    """
    columns = ['Top_ZW_Voice', 'Badge', 'Title', 'Review_Text', 'Likes', 'Star_Rating']

    # A missing file and a zero-length file both require a header row
    try:
        with open(file_name, 'r', encoding='utf-8') as existing:
            needs_header = not existing.read(1)
    except FileNotFoundError:
        needs_header = True

    # Append so that earlier scraping runs are preserved
    with open(file_name, mode='a', newline='', encoding='utf-8') as out:
        csv_writer = csv.DictWriter(out, fieldnames=columns)
        if needs_header:
            csv_writer.writeheader()
        csv_writer.writerows(reviews)

# Main function to scrape reviews and save them to CSV
async def scrape_reviews(url, bike_name, file_name):
    """Scrape all user reviews from ``url`` and append them to a CSV file.

    A headless Chromium browser is launched, the page is opened, the
    'Load More Reviews' button is clicked until the reviews are loaded,
    and the extracted reviews are appended to ``file_name``.

    Args:
        url: Address of the user-reviews page to scrape.
        bike_name: Human-readable bike name; used only in the message
            printed when no reviews are found.
        file_name: Path of the CSV file the reviews are appended to.

    Returns:
        None.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Load the webpage
            await page.goto(url)

            # Click the 'Load More Reviews' button until all reviews are loaded
            await click_load_more_review_button(page)

            # Extract every review that is now rendered on the page
            all_reviews = await get_reviews(page)
        finally:
            # Close the browser even when navigation or scraping fails
            await browser.close()

    # Only report success when something was actually written
    if all_reviews:
        save_reviews_to_csv(all_reviews, file_name)
        print(f"Reviews saved to {file_name}.")
    else:
        print(f"No reviews found for {bike_name}; nothing written to {file_name}.")

# Define the URL, bike name, and CSV file name
url = 'https://www.zigwheels.com/user-reviews/bajaj/avenger-220-street'
bike_name = 'Bajaj Avenger 220 Street'
file_name = f"{bike_name.replace(' ', '_').lower()}_reviews.csv"

# Run the scraper. A bare top-level `await` is only valid inside IPython;
# asyncio.run() also works when this file is executed as a plain script,
# and nest_asyncio (applied at the top of this file) lets it run inside an
# already-running Jupyter event loop.
asyncio.run(scrape_reviews(url, bike_name, file_name))

The chromedriver version (114.0.5735.90) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (129.0.6668.70); currently, chromedriver 129.0.6668.70 is recommended for chrome 129.*, so it is advised to delete the driver in PATH and retry


WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
  (unknown error: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /usr/bin/google-chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x555a921864e3 <unknown>
#1 0x555a91eb5c76 <unknown>
#2 0x555a91eded78 <unknown>
#3 0x555a91edb029 <unknown>
#4 0x555a91f19ccc <unknown>
#5 0x555a91f1947f <unknown>
#6 0x555a91f10de3 <unknown>
#7 0x555a91ee62dd <unknown>
#8 0x555a91ee734e <unknown>
#9 0x555a921463e4 <unknown>
#10 0x555a9214a3d7 <unknown>
#11 0x555a92154b20 <unknown>
#12 0x555a9214b023 <unknown>
#13 0x555a921191aa <unknown>
#14 0x555a9216f6b8 <unknown>
#15 0x555a9216f847 <unknown>
#16 0x555a9217f243 <unknown>
#17 0x7f487e4e6609 start_thread
