In [2]:
import re
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Function to extract star rating from the review_div
def extract_star_rating_from_review(review_div):
    """Count the filled star icons in a review's rating container.

    Looks up ``div.rating`` inside *review_div*, then that div's direct-child
    ``span``, and counts the ``i`` elements whose class attribute is
    ``icon-rating rated-star``.

    Returns:
        The number of rated-star icons, or 0 when the rating div or its
        span is missing.
    """
    rating_container = review_div.find('div', class_='rating')
    if not rating_container:
        return 0

    # Only a span that is an immediate child of the rating div is considered.
    stars_holder = rating_container.find('span', recursive=False)
    if not stars_holder:
        return 0

    return len(stars_holder.find_all('i', class_='icon-rating rated-star'))

# Function to extract review text from the review_div
def extract_review_text(review_div):
    """Return the whitespace-stripped review body from *review_div*.

    The body is the ``p`` that is a direct child of ``div.more.reviewdata``.
    When no such element exists, the placeholder string
    "No review text available." is returned instead.
    """
    paragraph = review_div.select_one("div.more.reviewdata > p")
    return paragraph.get_text(strip=True) if paragraph else "No review text available."

# Function to extract date and time from the review_div
def extract_review_datetime(review_div):
    """Return the review's date/time label text from *review_div*.

    The label is the ``span`` whose id has the form
    ``rptreviews_ctl<digits>_lblDateTime``. When no such span exists, the
    placeholder string "No date and time available." is returned.
    """
    stamp_id = re.compile(r'^rptreviews_ctl\d+_lblDateTime$')
    stamp = review_div.find('span', id=stamp_id)
    if not stamp:
        return "No date and time available."
    return stamp.get_text(strip=True)

# Function to extract location from the review_div
def extract_location(review_div):
    """Return the reviewer's location text from *review_div*.

    The location is the ``div.usr-addr-text`` that is a direct child of the
    element whose id has the form ``rptreviews_ctl<digits>_divProfile``.

    The id is matched by pattern rather than hard-coded to ``ctl00``: review
    elements carry a per-review index in their ids, so a fixed ``ctl00``
    selector could only ever match the first review on a page.

    Returns:
        The whitespace-stripped location, or "No location available." when
        the profile container or the address div is missing.
    """
    profile = review_div.find(id=re.compile(r'^rptreviews_ctl\d+_divProfile$'))
    # recursive=False keeps the original "direct child" (CSS `>`) semantics.
    address = (
        profile.find('div', class_='usr-addr-text', recursive=False)
        if profile
        else None
    )
    if not address:
        return "No location available."
    return address.get_text(strip=True)

# Function to extract likes from the review_div
def extract_likes(review_div):
    """Return the like count shown for the review in *review_div*.

    The count is the first run of digits in the text of the ``a`` that is a
    direct child of the element whose id has the form
    ``rptreviews_ctl<digits>_divlike``.

    The id is matched by pattern rather than hard-coded to ``ctl00``: review
    elements carry a per-review index in their ids, so a fixed ``ctl00``
    selector could only ever match the first review on a page.

    Returns:
        The like count as an int; 0 when the container, the link, or a
        number in the link text is missing.
    """
    like_box = review_div.find(id=re.compile(r'^rptreviews_ctl\d+_divlike$'))
    # recursive=False keeps the original "direct child" (CSS `>`) semantics.
    like_link = like_box.find('a', recursive=False) if like_box else None
    if not like_link:
        return 0

    count = re.search(r'\d+', like_link.get_text(strip=True))
    return int(count.group()) if count else 0

# Function to extract comments from the review_div
def extract_comments(review_div):
    """Return the comment count shown for the review in *review_div*.

    The count is the first run of digits in the text of the element whose id
    has the form ``rptreviews_ctl<digits>_commentspan``.

    The id is matched by pattern rather than hard-coded to ``ctl02``: review
    elements carry a per-review index in their ids, so a fixed ``ctl02``
    selector could only ever match one review on a page.

    Returns:
        The comment count as an int; 0 when the element or a number in its
        text is missing.
    """
    comment_span = review_div.find(id=re.compile(r'^rptreviews_ctl\d+_commentspan$'))
    if not comment_span:
        return 0

    count = re.search(r'\d+', comment_span.get_text(strip=True))
    return int(count.group()) if count else 0

# Function to extract fake status from the review_div
def extract_fake_status(review_div):
    """Return the text of the review's ``commentspan`` element.

    The element is the one whose id has the form
    ``rptreviews_ctl<digits>_commentspan``. The id is matched by pattern
    rather than hard-coded to ``ctl00``: review elements carry a per-review
    index in their ids, so a fixed ``ctl00`` selector could only ever match
    the first review on a page.

    Returns:
        The whitespace-stripped element text, or
        "No fake status available." when the element is missing.
    """
    # NOTE(review): this targets the same `commentspan` element that the
    # comment counter reads, so the value is the comment label rather than a
    # dedicated "fake" flag. Confirm the correct element against the page
    # markup before relying on this column.
    status_span = review_div.find(id=re.compile(r'^rptreviews_ctl\d+_commentspan$'))
    if not status_span:
        return "No fake status available."
    return status_span.get_text(strip=True)

# Function to extract the domain name (website) from a URL
def extract_website(url):
    """Return the network-location component (host, plus port if any) of *url*."""
    return urlparse(url).netloc

# Function to read the last page number
async def get_last_page_number(page):
    """Return the highest page number shown in the ``#spnPaging`` pager.

    Every ``li > a`` link under ``#spnPaging`` is inspected and the largest
    purely numeric label wins. Scanning all links, instead of reading a fixed
    ``li:nth-child(12)``, keeps this working when the pager has fewer than
    twelve entries or when the entry at that position is not a number.

    Args:
        page: A Playwright page that has already navigated to a review list.

    Returns:
        The largest numeric pager label as an int, or 1 when the pager has
        no numeric links or the lookup fails.
    """
    try:
        pager_links = await page.query_selector_all("#spnPaging > li > a")
        page_numbers = []
        for link in pager_links:
            label = (await link.inner_text()).strip()
            # Skip non-numeric entries; only decimal labels are page numbers.
            if label.isdecimal():
                page_numbers.append(int(label))
        return max(page_numbers, default=1)  # Fallback to 1 if no page links
    except Exception as e:
        # Best-effort: a failed lookup degrades to scraping a single page.
        print(f"Error finding last page number: {e}")
        return 1

# Main extraction function without relying on the "Next" button
async def extract_reviews(page, base_url, last_page):
    """Scrape reviews from pages 1..last_page of one product's review list.

    Each page is loaded from ``f"{base_url}-page-{n}"`` rather than by
    clicking a "Next" button. The rendered HTML is parsed with BeautifulSoup,
    and every ``div`` whose id has the form
    ``rptreviews_ctl<digits>_lireviewdetails`` is turned into one record.

    Args:
        page: Playwright page used for navigation.
        base_url: Review-list URL without the ``-page-<n>`` suffix.
        last_page: Number of the final page to visit (inclusive).

    Returns:
        A list of dicts with the keys 'Rating', 'Review Text',
        'Date and Time', 'Location', 'Likes', 'Comments' and 'Fake Status'.
        If any page fails, the error is printed, scraping stops, and the
        reviews collected so far are returned.
    """
    # Loop-invariant, so compiled once instead of once per page.
    review_id_pattern = re.compile(r'^rptreviews_ctl\d+_lireviewdetails$')
    all_reviews = []

    for page_number in range(1, last_page + 1):
        try:
            # Construct the URL for the current page
            current_url = f"{base_url}-page-{page_number}"
            print(f"Scraping page {page_number} from URL: {current_url}...")

            await page.goto(current_url)
            soup = BeautifulSoup(await page.content(), 'html.parser')

            # Extract data from each review container on this page
            for review_div in soup.find_all('div', id=review_id_pattern):
                all_reviews.append({
                    'Rating': extract_star_rating_from_review(review_div),
                    'Review Text': extract_review_text(review_div),
                    'Date and Time': extract_review_datetime(review_div),
                    'Location': extract_location(review_div),
                    'Likes': extract_likes(review_div),
                    'Comments': extract_comments(review_div),
                    'Fake Status': extract_fake_status(review_div),
                })

            # The range ends here anyway, so no explicit break is needed.
            if page_number == last_page:
                print(f"Reached the last page: {last_page}. Exiting.")

        except Exception as e:
            # Stop at the first failing page but keep what was collected.
            print(f"Error occurred during review extraction: {e}")
            break

    return all_reviews

# Updated run_scraper to pass the current URL
async def run_scraper():
    """Scrape reviews for every link in 'extracted_links.csv' and save them.

    Reads the CSV (columns 'Link' and 'Text' are used), visits each link in
    a headless Chromium page, determines the last page number, scrapes every
    review page, tags each review with 'Product' (the row's 'Text') and
    'Website' (the link's host), and writes all reviews to
    'extracted_reviews.csv'.

    If the run is interrupted or fails part-way, the reviews of the products
    already finished are still written out and the browser is still closed
    before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        all_data = []
        finished = False
        try:
            page = await browser.new_page()

            # Load the CSV file with the links
            file_path = 'extracted_links.csv'
            extracted_links_df = pd.read_csv(file_path)
            print(extracted_links_df)

            for _, row in extracted_links_df.iterrows():
                url = row['Link']
                await page.goto(url)

                # Get the last page number
                last_page = await get_last_page_number(page)
                print(f"Last page number: {last_page}")

                # Drop everything after the final '-' so that
                # extract_reviews can append its own "-page-<n>" suffix.
                base_url = url.rsplit('-', 1)[0]

                # Extract reviews until the last page
                extracted_data = await extract_reviews(page, base_url, last_page)

                # Add the product name and website to each review
                website = extract_website(url)
                for data in extracted_data:
                    data['Product'] = row['Text']
                    data['Website'] = website

                all_data.extend(extracted_data)

            finished = True
        finally:
            # A clean run always writes the file (even if empty, as before).
            # An interrupted run writes only when there is something to keep,
            # so a previous output is not replaced by an empty one.
            if finished or all_data:
                pd.DataFrame(all_data).to_csv('extracted_reviews.csv', index=False)
            await browser.close()

        print("Reviews extraction complete. Data saved to 'extracted_reviews.csv'.")

# Top-level `await` only works where an event loop is already running
# (e.g. a Jupyter/IPython cell). In a plain script, use
# `asyncio.run(run_scraper())` instead.
await run_scraper()

            Text                                               Link
0  bajaj_avenger  https://www.mouthshut.com/bikes/bajaj-avenger-...
Last page number: 148
Scraping page 1 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-1...
Scraping page 2 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-2...
Scraping page 3 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-3...
Scraping page 4 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-4...
Scraping page 5 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-5...
Scraping page 6 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-6...
Scraping page 7 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-7...
Scraping page 8 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-8...
Scraping page 9 from URL: https://www.mouthshut.com/bikes/bajaj-avenger-reviews-page-9...


CancelledError: 