In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import re
import time
import pandas as pd  # Import pandas for DataFrame
from urllib.parse import urlparse

# Function to extract star rating from the review_div
def extract_star_rating_from_review(review_div):
    """
    Count the rated stars shown for a single review.

    Looks up the 'div' with class 'rating' inside review_div, takes its first direct-child
    'span', and counts the 'i' elements matched by class_='icon-rating rated-star'.
    Returns that count as an integer, or 0 when the rating div or the span is missing.
    """
    rating_container = review_div.find('div', class_='rating')
    if not rating_container:
        return 0

    # Only a span that is an immediate child of the rating div is considered.
    stars_holder = rating_container.find('span', recursive=False)
    if not stars_holder:
        return 0

    return len(stars_holder.find_all('i', class_='icon-rating rated-star'))

# Function to extract review text from the review_div
def extract_review_text(review_div):
    """
    Pull the body text of a single review.

    Selects the 'p' that is a direct child of 'div.more.reviewdata' inside review_div and
    returns its text with surrounding whitespace stripped. When no such paragraph exists,
    the placeholder string "No review text available." is returned instead.
    """
    paragraph = review_div.select_one("div.more.reviewdata > p")
    return paragraph.get_text(strip=True) if paragraph else "No review text available."

# Function to extract date and time from the review_div
def extract_review_datetime(review_div):
    """
    Read the posting date/time label of a single review.

    Searches review_div for a 'span' whose id has the form 'rptreviews_ctl<digits>_lblDateTime'
    (for example 'rptreviews_ctl00_lblDateTime') and returns its stripped text.
    Returns the placeholder "No date and time available." when no such span is found.
    """
    id_pattern = re.compile(r'^rptreviews_ctl\d+_lblDateTime$')
    stamp = review_div.find('span', id=id_pattern)

    if not stamp:
        return "No date and time available."
    return stamp.get_text(strip=True)

# Function to extract the number of likes from the review_div
def extract_review_likes(review_div):
    """
    Return the like count shown on a single review.

    Locates the element whose id has the form 'rptreviews_ctl<digits>_divlike' and reads
    the like count from its link. The id may sit on the 'a' itself or on a wrapper whose
    direct child is the 'a' (the '#rptreviews_ctl00_divlike > a' layout); both are handled.
    Returns the first run of digits in the link text as an integer, or 0 when the element,
    the link, or the digits are missing.
    """
    # Match on id only: requiring the tag to be 'a' here would miss the wrapper layout
    # and silently report 0 likes for every review.
    like_holder = review_div.find(id=re.compile(r'^rptreviews_ctl\d+_divlike$'))
    if like_holder is None:
        return 0

    if like_holder.name == 'a':
        likes_element = like_holder
    else:
        likes_element = like_holder.find('a', recursive=False)
    if likes_element is None:
        return 0

    # Search once and reuse the match instead of running the regex twice.
    digits = re.search(r'\d+', likes_element.get_text(strip=True))
    return int(digits.group()) if digits else 0

# Main extraction function
def extract(driver, max_reviews=9999999, page_load_delay=2):
    """
    Collect reviews from the page currently loaded in driver and from every following page.

    Each page's HTML is parsed with BeautifulSoup; every 'div' whose id has the form
    'rptreviews_ctl<digits>_lireviewdetails' is turned into a dict with the keys
    'Rating', 'Review Text' and 'Date and Time'. After a page is processed the
    '#litPages > ul > li.next > a' link is clicked to move to the next page.

    Parameters:
        driver: Selenium WebDriver already pointed at a review page.
        max_reviews: stop paging once at least this many reviews were collected. The last
            page is kept whole, so the result may contain more than max_reviews entries.
        page_load_delay: seconds to sleep after clicking 'Next' before reading the page.

    Returns:
        A list of review dicts in the order they were encountered.

    The loop ends when max_reviews is reached, when no 'Next' link is present, or when the
    page read after a click yields exactly the same reviews as the previous one (the click
    did not advance, so continuing would only duplicate data or loop forever).
    """
    # Compiled once; the pattern does not change between pages.
    review_id_pattern = re.compile(r'^rptreviews_ctl\d+_lireviewdetails$')
    all_reviews = []
    previous_page_reviews = None

    while True:
        # No explicit wait here: the caller's driver.get() or the sleep after the
        # previous click is what gives the page time to load.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_reviews = [
            {
                'Rating': extract_star_rating_from_review(review_div),
                'Review Text': extract_review_text(review_div),
                'Date and Time': extract_review_datetime(review_div),
            }
            for review_div in soup.find_all('div', id=review_id_pattern)
        ]

        # Element ids repeat on every page, so a stalled pager is detected by comparing
        # the extracted content rather than the ids.
        if page_reviews == previous_page_reviews:
            print("Page did not change after clicking 'Next'. Exiting.")
            break

        all_reviews.extend(page_reviews)
        previous_page_reviews = page_reviews

        #----------------------------Exit Conditions----------------------------
        if len(all_reviews) >= max_reviews:
            print("Desired number of reviews loaded. Exiting.")
            break

        next_button_elements = driver.find_elements(By.CSS_SELECTOR, "#litPages > ul > li.next > a")
        if not next_button_elements:
            print("Next button not found, or no more pages. Exiting.")
            break

        # Move to the link first, then click it, to go to the next page of reviews.
        next_button = next_button_elements[0]
        ActionChains(driver).move_to_element(next_button).click(next_button).perform()
        print("Clicked on the 'Next' button.")

        # Give the next page time to load before reading page_source again.
        time.sleep(page_load_delay)

    return all_reviews


# Function to extract the domain name (website) from a URL
def extract_website(url):
    """Return the network-location part (host, plus port if given) of url."""
    return urlparse(url).netloc

# Load the CSV file with the links first, so that a missing or malformed file
# fails before a browser is launched.
file_path = 'extracted_links.csv'  # Update this with the correct path
extracted_links_df = pd.read_csv(file_path)

# Initialize the WebDriver
driver = webdriver.Chrome()  # Ensure ChromeDriver is set up correctly

# Iterate through each link in the extracted_links_df
all_data = []
try:
    for _, row in extracted_links_df.iterrows():
        url = row['Link']
        driver.get(url)

        # Extract reviews
        extracted_data = extract(driver)

        # Extract the website from the URL
        website = extract_website(url)

        # Add the product name and website to each review
        for data in extracted_data:
            data['Product'] = row['Text']
            data['Website'] = website

        all_data.extend(extracted_data)
finally:
    # Always close the WebDriver, even when a page load or extraction raises;
    # otherwise the Chrome process is left running.
    driver.quit()

# Convert to DataFrame
reviews_df = pd.DataFrame(all_data)

# Save the reviews to a CSV file
reviews_df.to_csv('extracted_reviews.csv', index=False)

print("Reviews extraction complete. Data saved to 'extracted_reviews.csv'.")

Next button not found, or no more pages. Exiting.
Clicked on the 'Next' button.
Clicked on the 'Next' button.
Clicked on the 'Next' button.
Clicked on the 'Next' button.
Next button not found, or no more pages. Exiting.
Reviews extraction complete. Data saved to 'mouthshut_reviews_with_dynamic_website.csv'.
