In [5]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from IPython import get_ipython
import nest_asyncio
import re

# Patch asyncio (via nest_asyncio) so that asyncio.run() below can be called
# even when an event loop is already running, as is the case in a Jupyter cell.
nest_asyncio.apply()

# CSS selector matching every review <li> inside the active tab of the
# "ReadReview" section.
_REVIEW_ITEMS_SELECTOR = (
    "#rf01 > div.app-content > div > div:nth-child(1) > main > div > div.gsc_col-xs-12.gsc_col-sm-12.gsc_col-md-8.gsc_col-lg-9 > section.clearfix.ReadReview.shadow24.marginBottom20 > div > div.gsc-ta-active.gsc-ta-content > ul > li"
)

# CSS selector for the pagination element that is clicked as the 'Next' button.
_NEXT_BUTTON_SELECTOR = (
    "#rf01 > div.app-content > div > div:nth-child(1) > main > div > div.gsc_col-xs-12.gsc_col-sm-12.gsc_col-md-8.gsc_col-lg-9 > section.clearfix.ReadReview.shadow24.marginBottom20 > div > div.marginTop20 > div > div > div > ul > li:nth-child(9) > span"
)


def _print_review_fields(review):
    """Print the rating, content texts and author summary found in one review item."""
    # Rating text
    rating_element = review.select_one("div > div > div.authorInfo.authordetail > div.authorSummary > span > span.ratingStarNew")
    if rating_element:
        rating_text = rating_element.get_text(strip=True)
        print(f"Extracted rating: {rating_text}")

    # Text within 'contentspace > span'
    contentspace_span = review.select_one("div > div > div.contentspace > span")
    if contentspace_span:
        contentspace_span_text = contentspace_span.get_text(strip=True)
        print(f"Extracted contentspace span text: {contentspace_span_text}")

    # Text within 'contentspace > div'
    contentspace_div = review.select_one("div > div > div.contentspace > div")
    if contentspace_div:
        contentspace_div_text = contentspace_div.get_text(strip=True)
        print(f"Extracted contentspace div text: {contentspace_div_text}")

    # Full text of the extra authorSummary div. Narrowing this down to only
    # the date is not implemented; the whole text is printed as-is.
    author_summary_div = review.select_one("div > div > div.authorInfo.authordetail > div.authorSummary > div")
    if author_summary_div:
        author_summary_text = author_summary_div.get_text(strip=True)
        print(author_summary_text)


async def click_next_button(page, max_reviews: int = 9999999) -> None:
    """Page through the review list, printing each review's fields.

    On every iteration the current page HTML is parsed with BeautifulSoup,
    the fields of each review item are printed, and the 'Next' pagination
    element is clicked. The loop stops when any of the following happens:

    * at least ``max_reviews`` reviews have been counted,
    * the current page contains no review items,
    * the current page shows exactly the same reviews as the previous one
      (the click made no progress, e.g. on the last page),
    * the 'Next' element is not found,
    * any exception is raised (it is reported and the loop ends).

    Args:
        page: Playwright page object already navigated to the reviews page;
            it must provide ``content()``, ``query_selector()`` and
            ``wait_for_timeout()`` coroutines.
        max_reviews: Stop paging once this many reviews have been counted.
            The page that crosses the limit is still processed in full.

    Returns:
        None. All results are written to standard output.

    Note:
        ``extracted_links`` is never populated, so the final summary always
        reports 0 links; it is kept so the printed output is unchanged.
    """
    review_count = 0
    extracted_links = []
    previous_page_texts = None

    # Strict '<': once the limit is reached no further page is fetched
    # (with '<=' an extra page was loaded when the count hit the limit exactly).
    while review_count < max_reviews:
        try:
            # Get the page content and parse it with BeautifulSoup
            page_content = await page.content()
            soup = BeautifulSoup(page_content, 'html.parser')

            # Extract the reviews on the current page
            review_elements = soup.select(_REVIEW_ITEMS_SELECTOR)
            if not review_elements:
                # Nothing to count: continuing could loop forever because
                # review_count would never grow.
                print("No reviews found on the current page. Exiting the function.")
                break

            # If the page is identical to the previous one, the last click did
            # not advance; stop instead of re-counting the same reviews forever.
            current_page_texts = [item.get_text(strip=True) for item in review_elements]
            if current_page_texts == previous_page_texts:
                print("The 'Next' button did not load new reviews. Exiting the function.")
                break
            previous_page_texts = current_page_texts

            review_count += len(review_elements)
            print(f"Total reviews loaded: {review_count}")

            # Extract specific text from each review
            for review in review_elements:
                _print_review_fields(review)

            # Check if the 'Next' button is available and click it
            next_button = await page.query_selector(_NEXT_BUTTON_SELECTOR)
            if not next_button:
                print("No more 'Next' button found. Exiting the function.")
                break

            await next_button.click()
            print("Clicked on the 'Next' button.")
            # Wait for the content to load after clicking the button
            await page.wait_for_timeout(2000)  # Adjust the wait time if necessary

        except Exception as e:
            # Best-effort scraper: report the problem and stop paging rather
            # than propagate, so the caller's summary/cleanup still runs.
            print(f"An error occurred: {e}. Exiting the function.")
            break

    print(f"Extracted {len(extracted_links)} links:")
    for link in extracted_links:
        print(link)

async def main():
    """Launch headless Chromium, open the reviews page and scrape it.

    Starts Playwright, navigates to the hard-coded review URL, delegates the
    paging and printing to ``click_next_button`` and then waits 60 seconds
    before shutting down. The browser is closed in a ``finally`` clause so it
    is released even if navigation or scraping raises.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Load the webpage
            url = 'https://www.bikedekho.com/bajaj/pulsar-ns-125/reviews'
            await page.goto(url)

            # Click the 'Next' button until the desired number of reviews are loaded
            await click_next_button(page)

            # Keep the browser open after the script is done
            print("Finished navigating through the reviews and extracting them. The browser will remain open.")
            await page.wait_for_timeout(60000)  # Keeps the browser open for 60 seconds
        finally:
            # Explicit cleanup: do not rely solely on the Playwright context
            # manager to tear the browser down when an error occurs.
            await browser.close()

# Script entry point: run the scraper to completion with asyncio.run().
# NOTE: no IPython/Jupyter detection is performed here; running inside an
# already-active event loop (e.g. a notebook cell) depends on the
# nest_asyncio.apply() call made earlier in this file.
if __name__ == "__main__":
    asyncio.run(main())


Total reviews loaded: 30
Extracted rating: 4.7
Extracted contentspace span text: This bike is Valuable amount
Extracted contentspace div text: I have this bike really Super fantastic bike ❤️ 
And this bike is for middle class family.
For us this is a super bike
Extracted rating: 4.2
Extracted contentspace span text: My favourite bike
Extracted contentspace div text: My favourite bike is ns 125. In this bike has superbike look and sames as power and milage also is very high
Extracted rating: 4.3
Extracted contentspace span text: Overview of ns 160 build quality is good
Extracted contentspace div text: Excellent comfort and good build quality riding smoothness good control stability very happy with this product safe and good engine
Extracted rating: 5.0
Extracted contentspace span text: Fantastic rider
Extracted contentspace div text: This bike is fantastic riding bike is so good his speed that are looking very good seeing gorgeous look and good reviews
Extracted rating: 4.7
Extracted co

KeyboardInterrupt: 