In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape review texts from an IMDb title's reviews page with Selenium, paging
# through results via the "Load More" button, and save them to a CSV file.

# Number of reviews to collect before stopping.
TARGET_REVIEW_COUNT = 200
# Consecutive polls that may yield no new reviews before the scrape gives up.
# Without this bound the loop would spin forever when the page has fewer
# reviews than the target or a "Load More" click never produces new content.
MAX_STALLED_ATTEMPTS = 10

# Initialize WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # Start browser maximized to avoid hidden elements
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)

reviews = []
prev_reviews_count = 0
stalled_attempts = 0

try:
    # Navigate inside the try block so the browser is still closed by the
    # finally clause if the page fails to load.
    driver.get("https://www.imdb.com/title/tt0111161/reviews")

    while len(reviews) < TARGET_REVIEW_COUNT:
        # Wait for the reviews to load, then scroll to the bottom of the page
        wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.review-container")))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Give freshly requested reviews time to render

        # Extract the text of every review currently present in the page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        new_reviews = [review.text.strip() for review in soup.select("div.review-container .text.show-more__control")]

        # Nothing new yet: poll again, but only a bounded number of times
        if len(new_reviews) <= prev_reviews_count:
            stalled_attempts += 1
            if stalled_attempts >= MAX_STALLED_ATTEMPTS:
                print("No new reviews after repeated attempts; stopping early.")
                break
            print("No new reviews loaded; trying again.")
            continue

        # Progress was made: reset the stall counter and keep only the
        # reviews that were not already collected on a previous pass
        stalled_attempts = 0
        reviews.extend(new_reviews[prev_reviews_count:])
        prev_reviews_count = len(new_reviews)

        # Click the "Load More" button if the target has not been reached.
        # The click is issued through JavaScript rather than a native click.
        if len(reviews) < TARGET_REVIEW_COUNT:
            load_more_button = wait.until(EC.element_to_be_clickable((By.ID, "load-more-trigger")))
            driver.execute_script("arguments[0].click();", load_more_button)

except Exception as e:
    # Best-effort scrape: report the problem and still save what was collected
    print(f"Error occurred: {e}")

finally:
    try:
        # Truncate the list to the target size if it exceeds that number
        reviews = reviews[:TARGET_REVIEW_COUNT]
        # Convert to DataFrame
        reviews_df = pd.DataFrame(reviews, columns=['Review'])
        print(f"Total reviews scraped: {len(reviews_df)}")

        # Save to CSV if needed
        reviews_df.to_csv("imdb_reviews_scraped.csv", index=False)
    finally:
        # Always close the browser, even if building or saving the data fails
        driver.quit()

No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
Total reviews scraped: 200
