# Things to fix and add

1. Filter the movies list for a random data sample side.
2. Get the URL's of Movies in a for loop in the Scraping script.
3. Divide scrapping script to 20 different files to not get IP Blocked.

# Getting a list of movies 

In [2]:
import pandas as pd

In [3]:
# Adjust these file paths according to where you've saved the downloaded datasets
basics_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb title basics.tsv'
ratings_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb Title Ratings.tsv'

In [4]:
# Reading the basics file with the correct column names
basics_df = pd.read_csv(
    basics_path, 
    sep='\t', 
    low_memory=False, 
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres', 'isAdult', 'startYear', 'runtimeMinutes']
)

In [5]:
# Filtering for movies
movies_df = basics_df[basics_df['titleType'] == 'movie']

In [6]:
# Reading the ratings file
ratings_df = pd.read_csv(ratings_path, sep='\t', usecols=['tconst', 'averageRating', 'numVotes'])

# Merging the datasets on 'tconst' to combine movie details with ratings
merged_df = pd.merge(movies_df, ratings_df, on='tconst')

# Selecting and renaming the columns
IMDb_movies_df = merged_df[['primaryTitle', 'genres', 'averageRating', 'startYear', 'isAdult', 'runtimeMinutes', 'numVotes']]
IMDb_movies_df.columns = ['Movie Name', 'Genre', 'Rating', 'Release Date', 'isAdult', 'Runtime Minutes', 'numVotes']

In [7]:
# How many null values are there in each column?
print(IMDb_movies_df.isnull().sum())

Movie Name         0
Genre              0
Rating             0
Release Date       0
isAdult            0
Runtime Minutes    0
numVotes           0
dtype: int64


In [8]:
# Display the final table
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
0,Miss Jerry,Romance,5.3,1894,0,45,210
1,The Corbett-Fitzsimmons Fight,"Documentary,News,Sport",5.3,1897,0,100,499
2,Bohemios,\N,3.7,1905,0,100,17
3,The Story of the Kelly Gang,"Action,Adventure,Biography",6.0,1906,0,70,867
4,The Prodigal Son,Drama,5.0,1907,0,90,22
5,Robbery Under Arms,Drama,4.3,1907,0,\N,25
6,Hamlet,Drama,2.9,1908,0,\N,27
7,Don Quijote,Drama,4.2,1908,0,\N,20
8,The Fairylogue and Radio-Plays,"Adventure,Fantasy",5.0,1908,0,120,70
9,Faldgruben,\N,4.4,1909,0,\N,17


In [9]:
# Remove all values from Release date that are not numeric
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Release Date'].str.isnumeric()]

In [10]:
# set datatypes for IMDb_movies_df
IMDb_movies_df['Release Date'] = IMDb_movies_df['Release Date'].astype('int')
# IMDb_movies_df['isAdult'] = IMDb_movies_df['isAdult'].astype('bool')
IMDb_movies_df['Runtime Minutes'] = IMDb_movies_df['Runtime Minutes'].apply(pd.to_numeric, errors='coerce')
IMDb_movies_df['numVotes'] = IMDb_movies_df['numVotes'].apply(pd.to_numeric, errors='coerce')

In [11]:
# Filter IMDb_movies_df for Release date > 2015
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Release Date'] > 2015]

# Remove all num values from Genre column
IMDb_movies_df['Genre'] = IMDb_movies_df['Genre'].str.replace(r'\d+', '')



In [12]:
# List all unique values in the Release date column
IMDb_movies_df['Release Date'].unique()

array([2021, 2020, 2018, 2023, 2022, 2017, 2016, 2019, 2024])

In [13]:
# Remove all values from IMDb_movies_df columns that have a value /N
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Movie Name'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Rating'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['isAdult'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Runtime Minutes'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['numVotes'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Rating'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Genre'] != '\\N']

In [14]:
IMDb_movies_df.shape

(77118, 7)

In [15]:
# Display the final table
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
1792,Istoriya grazhdanskoy voyny,Documentary,6.8,2021,0,94.0,64
32409,The Tango of the Widower and Its Distorting Mi...,Drama,6.4,2020,0,70.0,186
36947,The Other Side of the Wind,Drama,6.7,2018,0,122.0,7966
37953,Socialist Realism,Drama,7.6,2023,0,78.0,53
45841,Victor Seastrom,"Biography,Documentary",6.7,2021,0,65.0,67
53443,Taxi Killer,"Action,Crime,Drama",5.6,2022,0,106.0,68
55699,The Wandering Soap Opera,"Comedy,Drama,Fantasy",6.5,2017,0,80.0,354
58266,Neues in Wittstock,Documentary,7.8,2021,0,100.0,9
61968,Bigfoot,"Horror,Thriller",4.7,2017,0,,42
66055,The Surgeon of the Rusty Knife,"Biography,Drama",7.2,2022,0,108.0,229


In [16]:
# Filter any movies with less than 10000 votes
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['numVotes'] > 10000]

In [17]:
# Display the final table
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
110436,Mortal Kombat,"Action,Adventure,Fantasy",6.0,2021,0,110.0,188147
114550,Wazir,"Action,Crime,Drama",7.1,2016,0,103.0,19816
122345,Fahrenheit 451,"Drama,Sci-Fi,Thriller",5.0,2018,0,100.0,22392
125034,American Pastoral,"Crime,Drama,Mystery",6.1,2016,0,108.0,17946
126698,Motherless Brooklyn,"Crime,Drama,Mystery",6.8,2019,0,144.0,62334
133268,Alita: Battle Angel,"Action,Adventure,Sci-Fi",7.3,2019,0,122.0,290565
133557,The Flash,"Action,Adventure,Fantasy",6.7,2023,0,144.0,201276
133848,Danger Close,"Action,Drama,War",6.8,2019,0,118.0,14990
134590,Shazam!,"Action,Adventure,Comedy",7.0,2019,0,132.0,380869
134977,Wonder Woman,"Action,Adventure,Fantasy",7.3,2017,0,141.0,694150


In [18]:
IMDb_movies_df.shape

(2667, 7)

In [19]:
# List all duplicated Movie Names in IMDb_movies_df
IMDb_movies_df[IMDb_movies_df.duplicated(subset='Movie Name', keep=False)].sort_values(by='Movie Name')

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
153836,Aftermath,"Drama,Horror,Mystery",5.3,2021,0,114.0,18577
261362,Aftermath,"Drama,Mystery,Thriller",5.7,2017,0,94.0,26633
173119,Beast,"Action,Adventure,Drama",5.6,2022,0,93.0,39471
158784,Beast,"Action,Comedy,Thriller",5.2,2022,0,155.0,36150
272316,Beast,"Crime,Drama,Mystery",6.8,2017,0,107.0,15220
150710,Black Box,"Drama,Mystery,Thriller",7.2,2021,0,129.0,14169
166199,Black Box,"Horror,Mystery,Sci-Fi",6.2,2020,0,100.0,16957
283927,Champions,"Comedy,Drama,Family",7.2,2018,0,124.0,11684
189969,Champions,"Comedy,Drama,Sport",6.8,2023,0,124.0,24312
303811,Close,Drama,7.8,2022,0,104.0,32563


In [20]:
# Remove all rows where the Movie Name, Release Date and Runtime Minutes are duplicated
IMDb_movies_df = IMDb_movies_df.drop_duplicates(subset=['Movie Name', 'Release Date', 'Runtime Minutes'])

In [21]:
# Reset index
IMDb_movies_df.reset_index(drop=True, inplace=True)

In [22]:
IMDb_movies_df.shape

(2667, 7)

In [23]:
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
0,Mortal Kombat,"Action,Adventure,Fantasy",6.0,2021,0,110.0,188147
1,Wazir,"Action,Crime,Drama",7.1,2016,0,103.0,19816
2,Fahrenheit 451,"Drama,Sci-Fi,Thriller",5.0,2018,0,100.0,22392
3,American Pastoral,"Crime,Drama,Mystery",6.1,2016,0,108.0,17946
4,Motherless Brooklyn,"Crime,Drama,Mystery",6.8,2019,0,144.0,62334
5,Alita: Battle Angel,"Action,Adventure,Sci-Fi",7.3,2019,0,122.0,290565
6,The Flash,"Action,Adventure,Fantasy",6.7,2023,0,144.0,201276
7,Danger Close,"Action,Drama,War",6.8,2019,0,118.0,14990
8,Shazam!,"Action,Adventure,Comedy",7.0,2019,0,132.0,380869
9,Wonder Woman,"Action,Adventure,Fantasy",7.3,2017,0,141.0,694150


In [24]:
# export the final table to a csv file
IMDb_movies_df.to_csv('IMDb_movies.csv', index=False)

----
### Testing the Movies dataset

In [25]:
# How many unique genres are there in the Genre column?
IMDb_movies_df['Genre'].nunique()

283

In [26]:
# List all unique values in the Genre column
IMDb_movies_df['Genre'].unique()

array(['Action,Adventure,Fantasy', 'Action,Crime,Drama',
       'Drama,Sci-Fi,Thriller', 'Crime,Drama,Mystery',
       'Action,Adventure,Sci-Fi', 'Action,Drama,War',
       'Action,Adventure,Comedy', 'Drama', 'Comedy,Drama,Mystery',
       'Drama,History', 'Comedy,Crime,Drama', 'Drama,History,Romance',
       'Action,Comedy,Crime', 'Drama,Horror,Mystery',
       'Action,Thriller,War', 'Action,Adventure,Horror', 'Drama,War',
       'Biography,Comedy,Drama', 'Adventure,Comedy,Family',
       'Horror,Mystery,Thriller', 'Comedy,Crime,Mystery',
       'Action,Adventure,Drama', 'Fantasy,Horror,Mystery',
       'Biography,Drama,Sport', 'Fantasy,Horror', 'Drama,Romance',
       'Comedy,Drama,Romance', 'Horror,Thriller', 'Action,Thriller',
       'Adventure,Comedy,Drama', 'Action,Drama,History',
       'Biography,Crime,Drama', 'Comedy,Horror', 'Drama,Mystery,Thriller',
       'Thriller', 'Biography,Drama', 'Comedy,Family,Fantasy',
       'Comedy,Drama,Musical', 'Drama,Horror,Romance', 'Comedy,R

-----

# Importing the Movies df from the CSV

In [54]:
# Importing the imdb_movies.csv file
IMDb_movies_small_df = pd.read_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/IMDb_movies.csv')

---

### Filtering for just 10 movies below. This can be removed when running the full code file.

In [55]:
# Filter IMDb_movies_df to the first 10 movies
IMDb_movies_small_df = IMDb_movies_df.head(2)

-----

## Looping the Names of movies to get their URL's

In [56]:
IMDb_movies_small_df.head()

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,Mortal Kombat,"Action,Adventure,Fantasy",6.0,2021,0,110.0,188147,https://www.imdb.com/title/tt0293429/reviews/?...
1,Wazir,"Action,Crime,Drama",7.1,2016,0,103.0,19816,https://www.imdb.com/title/tt0315642/reviews/?...


In [77]:
# Chatgpt

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Setting up Selenium
chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode to run without opening browser window
service = Service('path_to_chromedriver')  # Replace 'path_to_chromedriver' with the actual path
# Setup WebDriver (example with Chrome)
driver = webdriver.Chrome()

# Function to get the URL of User Reviews for a movie
def get_user_reviews_url(movie_name, release_date):
    try:
        # Open IMDb website
        driver.get("https://www.imdb.com/")
        time.sleep(2)  # Wait for page to load

        # Find search box and input movie name
        search_box = driver.find_element(By.ID, "suggestion-search")
        search_box.clear()
        search_box.send_keys(movie_name)
        search_box.send_keys(Keys.RETURN)
        time.sleep(2)  # Wait for search results to load

        # Find and click on the release year
        release_year_element = driver.find_element(By.XPATH, f"//*[contains(text(), '{release_date}')]")
        release_year_element.click()
        time.sleep(2)  # Wait for page to load

        # Find and click on User Reviews button
        user_reviews_button = driver.find_element(By.XPATH, "//a[contains(text(), 'User reviews')]")
        user_reviews_url = user_reviews_button.get_attribute('href')
        return user_reviews_url

    except Exception as e:
        print(f"Error occurred for {movie_name}: {str(e)}")
        return None

# Iterate through each row in the DataFrame
for index, row in IMDb_movies_small_df.iterrows():
    movie_name = row['Movie Name']
    release_date = row['Release Date']

    # Get User Reviews URL for the movie
    user_reviews_url = get_user_reviews_url(movie_name, release_date)

    # Update DataFrame with the URL
    IMDb_movies_small_df.at[index, 'User Reviews URL'] = user_reviews_url

# Close the browser
driver.quit()

# Save the updated DataFrame
IMDb_movies_small_df.to_csv('IMDb_movies_with_urls.csv', index=False)  # Replace 'IMDb_movies_with_urls.csv' with desired file name

In [52]:
IMDb_movies_small_df.head()

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,Mortal Kombat,"Action,Adventure,Fantasy",6.0,2021,0,110.0,188147,https://www.imdb.com/title/tt0293429/reviews/?...
1,Wazir,"Action,Crime,Drama",7.1,2016,0,103.0,19816,https://www.imdb.com/title/tt0315642/reviews/?...


----

# Code to scrape just reviews from one URL below below
--------

In [78]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import traceback

# Initialize WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)

# Navigate to the IMDb reviews page
driver.get("https://www.imdb.com/title/tt0293429/reviews/?ref_=tt_ql_2")

reviews_data = []

try:
    while len(reviews_data) < 200:
        # Wait for the reviews container to load
        print("Waiting for reviews container to load on the page...")
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#main > section > div.lister > div.lister-list")))
        time.sleep(3)

        # Extract reviews
        print("Extracting reviews...")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        review_elements = soup.select("#main > section > div.lister > div.lister-list > div")

        if not review_elements:
            print("No review elements found within the container.")
            break

        for review_element in review_elements:
            try:
                # Extract the required elements
                review_date = review_element.select_one(".review-date").text.strip()
                review_title = review_element.select_one(".title").text.strip()
                username = review_element.select_one(".display-name-link > a").text.strip()
                helpfulness = review_element.select_one(".actions.text-muted").text.strip()
                helpful_votes, total_votes = map(int, re.findall(r'(\d+)', helpfulness))
                rating = int(review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)").text.strip())
                review_text = review_element.select_one(".text.show-more__control").text.strip()
                spoiler_warning = review_element.select_one(".spoiler-warning")
                spoiler_text = spoiler_warning.text.strip() if spoiler_warning else ""

                reviews_data.append({
                    'Review Date': review_date,
                    'Review Title': review_title,
                    'Username': username,
                    'Helpfulness': helpfulness,
                    'Helpful Votes': helpful_votes,
                    'Total Votes': total_votes,
                    'Rating': rating,
                    'Review Text': review_text,
                    'Spoiler Warning': spoiler_text
                })
                # Break the loop if we have collected 200 reviews
                if len(reviews_data) >= 200:
                    break
            except Exception as e:
                print(f"Error occurred while extracting review data: {e}")
                traceback.print_exc()

        # Check if the "Load More" button is present, then click it
        load_more_button = driver.find_elements(By.ID, "load-more-trigger")
        if load_more_button and len(reviews_data) < 200:
            print("Clicking the 'Load More' button...")
            driver.execute_script("arguments[0].click();", load_more_button[0])
            time.sleep(3)
        else:
            print("No 'Load More' button found, or no more reviews to load.")
            break

except Exception as e:
    print(f"Error occurred: {e}")
    traceback.print_exc()

finally:
    # Convert to DataFrame
    reviews_df = pd.DataFrame(reviews_data)
    print(f"Total reviews scraped: {len(reviews_df)}")

    # Save to CSV if needed
    reviews_df.to_csv("imdb_reviews_scraped.csv", index=False)
    driver.quit()

Waiting for reviews container to load on the page...
Extracting reviews...
Error occurred while extracting review data: 'NoneType' object has no attribute 'text'
Clicking the 'Load More' button...


Traceback (most recent call last):
  File "/var/folders/7_/z9jxc0jj6fx4snlcjg1wby4r0000gn/T/ipykernel_21262/2888158238.py", line 46, in <module>
    rating = int(review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)").text.strip())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'text'


Waiting for reviews container to load on the page...
Extracting reviews...
Error occurred while extracting review data: 'NoneType' object has no attribute 'text'
Clicking the 'Load More' button...


Traceback (most recent call last):
  File "/var/folders/7_/z9jxc0jj6fx4snlcjg1wby4r0000gn/T/ipykernel_21262/2888158238.py", line 46, in <module>
    rating = int(review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)").text.strip())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'text'


Waiting for reviews container to load on the page...
Extracting reviews...
Error occurred while extracting review data: 'NoneType' object has no attribute 'text'
Clicking the 'Load More' button...


Traceback (most recent call last):
  File "/var/folders/7_/z9jxc0jj6fx4snlcjg1wby4r0000gn/T/ipykernel_21262/2888158238.py", line 46, in <module>
    rating = int(review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)").text.strip())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'text'


Waiting for reviews container to load on the page...
Extracting reviews...
Error occurred while extracting review data: 'NoneType' object has no attribute 'text'
No 'Load More' button found, or no more reviews to load.
Total reviews scraped: 200


Traceback (most recent call last):
  File "/var/folders/7_/z9jxc0jj6fx4snlcjg1wby4r0000gn/T/ipykernel_21262/2888158238.py", line 46, in <module>
    rating = int(review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)").text.strip())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'text'


In [66]:
# # Main code that works for just text reviews for 1 movie

# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from bs4 import BeautifulSoup
# import pandas as pd
# import time

# # Initialize WebDriver
# options = webdriver.ChromeOptions()
# options.add_argument("--start-maximized")  # Start browser maximized to avoid hidden elements
# driver = webdriver.Chrome(options=options)
# wait = WebDriverWait(driver, 20)

# # Navigate to the IMDb reviews page
# driver.get("https://www.imdb.com/title/tt0111161/reviews")

# reviews = []
# prev_reviews_count = 0

# try:
#     while len(reviews) < 200:
#         # Wait for the reviews to load and then scroll into view of the Load More button
#         wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.review-container")))
#         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(2)  # Sleep to ensure reviews have loaded

#         # Extract reviews
#         soup = BeautifulSoup(driver.page_source, 'html.parser')
#         new_reviews = [review.text.strip() for review in soup.select("div.review-container .text.show-more__control")]

#         # Check if new reviews have been loaded
#         if len(new_reviews) > prev_reviews_count:
#             # Avoid adding duplicates by extending by the new reviews only
#             reviews.extend(new_reviews[prev_reviews_count:])
#             prev_reviews_count = len(new_reviews)
#         else:
#             print("No new reviews loaded; trying again.")
#             continue
        
#         # Click the "Load More" button, if there are less than 200 reviews
#         if len(reviews) < 200:
#             load_more_button = wait.until(EC.element_to_be_clickable((By.ID, "load-more-trigger")))
#             driver.execute_script("arguments[0].click();", load_more_button)

# except Exception as e:
#     print(f"Error occurred: {e}")

# finally:
#     # Truncate the list to 200 reviews if it exceeds that number
#     reviews = reviews[:200]
#     # Convert to DataFrame
#     reviews_df = pd.DataFrame(reviews, columns=['Review'])
#     print(f"Total reviews scraped: {len(reviews_df)}")

# # Save to CSV if needed
# reviews_df.to_csv("imdb_reviews_scraped.csv", index=False)
# driver.quit()


No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
No new reviews loaded; trying again.
Total reviews scraped: 200
