# Collecting IMDb reviews data
## Import Libraries

In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

## Set Up Selenium WebDriver

In [2]:
# Start one Chrome session without a visible window; the scraping function
# further down reads this module-level `driver`.
options = Options()
options.add_argument("--headless")
service = Service(executable_path='./chromedriver.exe')
driver = webdriver.Chrome(options=options, service=service)

## Function to Scrape Reviews

In [3]:
def scrape_reviews(movie_url, max_reviews=100):
    """Scrape up to *max_reviews* reviews from the page at *movie_url*.

    Uses the module-level ``driver``. Reviews already present on the page are
    read first; the "load more" button is clicked only while more reviews are
    still needed, so pages that have no such button still yield the reviews
    they show.

    Args:
        movie_url: URL of the reviews page to open.
        max_reviews: Maximum number of reviews to return (default 100).

    Returns:
        A list of dicts with the keys ``title``, ``review_text``, ``rating``,
        ``date`` and ``user``. Fields that cannot be read hold a placeholder
        string ("No title", "No review text", "No rating", "No date",
        "Anonymous").
    """
    driver.get(movie_url)
    time.sleep(2)  # give the initial page time to render

    reviews = []
    seen = 0  # review containers already converted into dicts
    clicked_load_more = False

    while len(reviews) < max_reviews:
        review_elements = driver.find_elements(By.CLASS_NAME, 'review-container')
        # Only look at containers not handled in an earlier pass, otherwise
        # every pass would re-append the reviews collected before it.
        # NOTE(review): assumes newly loaded containers are appended after the
        # existing ones -- confirm against the live page.
        fresh = review_elements[seen:]
        if clicked_load_more and not fresh:
            # The click went through but produced nothing new; stop instead
            # of clicking forever.
            break

        remaining = max_reviews - len(reviews)
        for review_element in fresh[:remaining]:
            reviews.append(_extract_review(review_element))
        seen += len(fresh)

        if len(reviews) >= max_reviews:
            break

        try:
            # Re-locate the button on every pass rather than reusing an
            # element reference that may have gone stale.
            driver.find_element(By.CLASS_NAME, 'load-more-data').click()
        except WebDriverException:
            print("No more reviews to load or error with Load More button.")
            break
        clicked_load_more = True
        time.sleep(2)  # wait for the next batch to be inserted

    return reviews


def _extract_review(review_element):
    """Build the review dict for a single ``review-container`` element."""
    title = _element_text(review_element, 'title', "No title")
    # Expand before reading the body so spoiler-hidden text is included.
    _expand_spoiler(review_element)
    return {
        "title": title,
        "review_text": _element_text(review_element, 'content', "No review text"),
        "rating": _element_text(review_element, 'rating-other-user-rating', "No rating"),
        "date": _element_text(review_element, 'review-date', "No date"),
        "user": _element_text(review_element, 'display-name-link', "Anonymous"),
    }


def _element_text(parent, class_name, default):
    """Return the text of the first *class_name* child of *parent*, else *default*."""
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text
    except WebDriverException:
        return default


def _expand_spoiler(review_element):
    """Click the expander of a review carrying a spoiler warning (best effort)."""
    try:
        if review_element.find_elements(By.CLASS_NAME, 'spoiler-warning'):
            review_element.find_element(By.CLASS_NAME, 'ipl-expander').click()
            time.sleep(1)
    except WebDriverException:
        # Deliberately ignored: a failed expand only means the body text may
        # stay collapsed; the review is still recorded.
        pass

## Scrape Data from URLs in CSV

In [4]:
# Table of movies to scrape; the loop below reads its 'title' and 'link' columns.
movie_df = pd.read_csv(filepath_or_buffer='reviews-links.csv')

In [5]:
# Combined list of review dicts across all movies, each tagged with its movie.
all_reviews = []

for movie_title, movie_link in zip(movie_df['title'], movie_df['link']):
    print(f"Scraping reviews for {movie_title}...")
    reviews = scrape_reviews(movie_link)

    # Label every review with the film it belongs to, then add the whole
    # batch to the combined list in one step.
    for review in reviews:
        review['movie_title'] = movie_title
    all_reviews.extend(reviews)

Scraping reviews for Everything Everywhere All at Once (2022)...
Scraping reviews for Top Gun: Maverick (2022)...
Scraping reviews for The Batman (2022)...
Scraping reviews for Black Panther: Wakanda Forever (2022)...
Scraping reviews for Barbie (2023)...
Scraping reviews for Oppenheimer (2023)...
Scraping reviews for Spider-Man: Across the Spider-Verse (2023)...
Scraping reviews for Killers of the Flower Moon (2023)...
Scraping reviews for Dune: Part Two (2023)...
Scraping reviews for Guardians of the Galaxy Vol. 3 (2023)...
Scraping reviews for The Godfather (1972)...
Scraping reviews for Casablanca (1942)...
Scraping reviews for Schindler's List (1993)...
Scraping reviews for Psycho (1960)...
Scraping reviews for Gone with the Wind (1939)...
Scraping reviews for Citizen Kane (1941)...
Scraping reviews for 12 Angry Men (1957)...
Scraping reviews for The Shawshank Redemption (1994)...
Scraping reviews for The Wizard of Oz (1939)...
Scraping reviews for Jaws (1975)...


## Save Reviews to CSV

In [6]:
# One row per review dict; the dict keys become the columns.
reviews_df = pd.DataFrame(data=all_reviews)

In [7]:
# Bare expression: as the last statement of a notebook cell it shows the
# DataFrame as the cell's output.
reviews_df

Unnamed: 0,title,review_text,rating,date,user,movie_title
0,Felt Like I Was Seeing the Inside of My Own Mi...,I have trouble turning off my brain. Anxieties...,9/10,24 May 2022,evanston_dad,Everything Everywhere All at Once (2022)
1,best film of 2022,"Profoundly deep, genuinely moving, utterly hil...",9/10,2 May 2022,movieman_kev,Everything Everywhere All at Once (2022)
2,"Don't do drugs, watch this instead.",If you take drugs for the first time and imagi...,9/10,8 April 2022,AfricanBro,Everything Everywhere All at Once (2022)
3,Fantastic,"""Be kind, especially when you don't know what'...",10/10,20 April 2022,gbill-74877,Everything Everywhere All at Once (2022)
4,The most original film ever made. Period.,Everything Everywhere All At Once is even craz...,10/10,31 March 2022,benjaminskylerhill,Everything Everywhere All at Once (2022)
...,...,...,...,...,...,...
1995,Great one in the 70s,Can't believe the movie was made in 1970s. It'...,6/10,15 January 2024,DrDumb,Jaws (1975)
1996,The best film ever made.,I saw this film when I was about 8 years old. ...,10/10,26 June 1999,baumer,Jaws (1975)
1997,We're gonna need a bigger boat,"Yea, ""Jaws"" is considered a classic for many p...",7/10,9 July 2011,raulfaust,Jaws (1975)
1998,A potboiler of the 'slow-death' variety,A potboiler with grisly action scenes that bor...,5/10,15 January 2017,zafar142007,Jaws (1975)


In [8]:
# Persist the collected reviews; index=False leaves the row index out of the file.
reviews_df.to_csv(path_or_buf='dataset.csv', index=False)

## Quit WebDriver

In [10]:
# End the WebDriver session created in the setup cell.
driver.quit()