In [5]:



import re
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Function to extract star rating from the review_div
def extract_star_rating_from_review(review_div):
    """Count the filled-in star icons shown for a single review.

    Looks up the ``div`` with class ``rating`` inside ``review_div``, then a
    ``span`` that is a direct child of that div, and counts the ``i``
    elements in the span whose class is ``icon-rating rated-star``.

    Args:
        review_div: The parsed element holding one review.

    Returns:
        The number of matching star icons, or 0 when either the rating
        container or its direct-child span is missing.
    """
    rating_container = review_div.find('div', class_='rating')
    if not rating_container:
        return 0

    # Only a span sitting directly under the rating container is considered.
    stars_holder = rating_container.find('span', recursive=False)
    if not stars_holder:
        return 0

    return len(stars_holder.find_all('i', class_='icon-rating rated-star'))

# Function to extract review text from the review_div
def extract_review_text(review_div):
    """Return the body text of a single review.

    The text is taken from the element matched by the CSS selector
    ``div.more.reviewdata > p`` inside ``review_div``, with surrounding
    whitespace stripped.

    Args:
        review_div: The parsed element holding one review.

    Returns:
        The stripped review text, or the placeholder string
        ``"No review text available."`` when no matching element exists.
    """
    paragraph = review_div.select_one("div.more.reviewdata > p")
    return paragraph.get_text(strip=True) if paragraph else "No review text available."

# Function to extract date and time from the review_div
def extract_review_datetime(review_div):
    """Return the date/time label attached to a single review.

    The label is the ``span`` whose ``id`` has the form
    ``rptreviews_ctl<digits>_lblDateTime``; its text is returned with
    surrounding whitespace stripped and is not parsed any further.

    Args:
        review_div: The parsed element holding one review.

    Returns:
        The label text as a string, or the placeholder string
        ``"No date and time available."`` when no such span exists.
    """
    timestamp_id = re.compile(r'^rptreviews_ctl\d+_lblDateTime$')
    timestamp_span = review_div.find('span', id=timestamp_id)
    if not timestamp_span:
        return "No date and time available."
    return timestamp_span.get_text(strip=True)

# Function to extract the domain name (website) from a URL
def extract_website(url):
    """Return the network-location part (host, plus port if given) of ``url``.

    Args:
        url: The URL string to inspect.

    Returns:
        The ``netloc`` component produced by ``urllib.parse.urlparse``; this
        is an empty string when the URL carries no ``//host`` section.
    """
    return urlparse(url).netloc

# Main extraction function with improved "Next" button handling
async def extract_reviews(page, max_reviews=9999999, page_wait_ms=2000):
    """Scrape reviews from the page's current document and all following pages.

    On each iteration the page's HTML is parsed, every ``div`` whose ``id``
    has the form ``rptreviews_ctl<digits>_lireviewdetails`` is turned into a
    record, and the pager's "Next" link is clicked to move on. The loop ends
    when the review budget is reached, the "Next" link is missing, or the
    link's class marks it as disabled.

    Args:
        page: The Playwright page to scrape. The caller is expected to have
            navigated it to the first page of reviews already.
        max_reviews: Stop paging once at least this many reviews have been
            collected. The check runs after a whole page is processed, so the
            result may contain more than ``max_reviews`` entries.
        page_wait_ms: Milliseconds to wait after clicking "Next" before the
            page content is read again.

    Returns:
        A list of dicts with the keys ``'Rating'``, ``'Review Text'`` and
        ``'Date and Time'``, in the order the reviews were encountered.
    """
    # Compiled once up front; the pattern does not change between pages.
    review_id_pattern = re.compile(r'^rptreviews_ctl\d+_lireviewdetails$')

    review_count = 0
    page_number = 1
    all_reviews = []

    while True:
        print(f"Scraping page {page_number}...")  # Print the current page number

        soup = BeautifulSoup(await page.content(), 'html.parser')
        review_divs = soup.find_all('div', id=review_id_pattern)

        # Extract data from each review
        for review_div in review_divs:
            all_reviews.append({
                'Rating'       : extract_star_rating_from_review(review_div),
                'Review Text'  : extract_review_text(review_div),
                'Date and Time': extract_review_datetime(review_div)
            })

        review_count += len(review_divs)

        # Stop once the review budget has been reached
        if review_count >= max_reviews:
            print("Desired number of reviews loaded. Exiting.")
            break

        next_button = await page.query_selector("#litPages > ul > li.next > a")
        if not next_button:
            print("Next button not found, or no more pages. Exiting.")
            break

        # get_attribute yields None when the link has no class attribute at
        # all; fall back to '' so the membership test cannot raise TypeError.
        next_button_classes = await next_button.get_attribute('class') or ''
        if 'disabled' in next_button_classes:  # Adjust this if the website uses another marker
            print("Next button is disabled. No more pages to scrape.")
            break

        await next_button.click()
        await page.wait_for_timeout(page_wait_ms)  # Give the next page time to render
        page_number += 1

    return all_reviews

# Function to handle scraping logic
async def run_scraper():
    """Scrape reviews for every link in ``extracted_links.csv`` and save them.

    Reads the input CSV (columns ``Link`` and ``Text`` are used), visits each
    link in a headless Chromium page, collects its reviews via
    ``extract_reviews``, tags each review with the row's ``Text`` value as
    ``'Product'`` and the link's host as ``'Website'``, and finally writes all
    records to ``extracted_reviews.csv`` without the index column.

    Any exception raised while reading the CSV, navigating, or extracting
    propagates to the caller; in that case no output file is written, but the
    browser is still closed.
    """
    # Load the links before launching anything, so a missing or unreadable
    # CSV fails without spawning a browser process.
    file_path = 'extracted_links.csv'
    extracted_links_df = pd.read_csv(file_path)
    print(extracted_links_df)

    all_data = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            for _, row in extracted_links_df.iterrows():
                url = row['Link']
                await page.goto(url)

                # Extract reviews
                extracted_data = await extract_reviews(page)

                # Extract the website from the URL
                website = extract_website(url)

                # Add the product name and website to each review
                for data in extracted_data:
                    data['Product'] = row['Text']
                    data['Website'] = website

                all_data.extend(extracted_data)
        finally:
            # Close the browser even when a navigation or extraction step fails.
            await browser.close()

    # Convert to DataFrame and save the reviews to a CSV file
    reviews_df = pd.DataFrame(all_data)
    reviews_df.to_csv('extracted_reviews.csv', index=False)

    print("Reviews extraction complete. Data saved to 'extracted_reviews.csv'.")

# For Jupyter or environments with a running event loop: the coroutine is
# awaited directly at the top level of the cell. A plain Python script cannot
# use a top-level ``await`` and would need ``asyncio.run(run_scraper())``.
await run_scraper()

            Text                                               Link
0  bajaj_avenger  https://www.mouthshut.com/bikes/bajaj-avenger-...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Next button not found, or no more pages. Exiting.
Reviews extraction complete. Data saved to 'extracted_reviews.csv'.
