# Things to fix and add

1. Filter the movies list for a random data sample side.
2. Get the URL's of Movies in a for loop in the Scraping script.
3. Divide scrapping script to 20 different files to not get IP Blocked.

# Getting a list of movies 

In [283]:
import pandas as pd

In [284]:
# Adjust these file paths according to where you've saved the downloaded datasets
basics_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb title basics.tsv'
ratings_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb Title Ratings.tsv'

In [285]:
# Reading the basics file with the correct column names
basics_df = pd.read_csv(
    basics_path, 
    sep='\t', 
    low_memory=False, 
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres', 'isAdult', 'startYear', 'runtimeMinutes']
)

In [286]:
# Filtering for movies
movies_df = basics_df[basics_df['titleType'] == 'movie']

In [287]:
# Reading the ratings file
ratings_df = pd.read_csv(ratings_path, sep='\t', usecols=['tconst', 'averageRating', 'numVotes'])

# Merging the datasets on 'tconst' to combine movie details with ratings
merged_df = pd.merge(movies_df, ratings_df, on='tconst')

# Selecting and renaming the columns
IMDb_movies_df = merged_df[['primaryTitle', 'genres', 'averageRating', 'startYear', 'isAdult', 'runtimeMinutes', 'numVotes']]
IMDb_movies_df.columns = ['Movie Name', 'Genre', 'Rating', 'Release Date', 'isAdult', 'Runtime Minutes', 'numVotes']

In [288]:
# How many null values are there in each column?
print(IMDb_movies_df.isnull().sum())

Movie Name         0
Genre              0
Rating             0
Release Date       0
isAdult            0
Runtime Minutes    0
numVotes           0
dtype: int64


In [289]:
# Display the final table
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
0,Miss Jerry,Romance,5.3,1894,0,45,210
1,The Corbett-Fitzsimmons Fight,"Documentary,News,Sport",5.3,1897,0,100,499
2,Bohemios,\N,3.7,1905,0,100,17
3,The Story of the Kelly Gang,"Action,Adventure,Biography",6.0,1906,0,70,867
4,The Prodigal Son,Drama,5.0,1907,0,90,22
5,Robbery Under Arms,Drama,4.3,1907,0,\N,25
6,Hamlet,Drama,2.9,1908,0,\N,27
7,Don Quijote,Drama,4.2,1908,0,\N,20
8,The Fairylogue and Radio-Plays,"Adventure,Fantasy",5.0,1908,0,120,70
9,Faldgruben,\N,4.4,1909,0,\N,17


In [290]:
# Remove all values from Release date that are not numeric
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Release Date'].str.isnumeric()]

In [291]:
# set datatypes for IMDb_movies_df
IMDb_movies_df['Release Date'] = IMDb_movies_df['Release Date'].astype('int')
# IMDb_movies_df['isAdult'] = IMDb_movies_df['isAdult'].astype('bool')
IMDb_movies_df['Runtime Minutes'] = IMDb_movies_df['Runtime Minutes'].apply(pd.to_numeric, errors='coerce')
IMDb_movies_df['numVotes'] = IMDb_movies_df['numVotes'].apply(pd.to_numeric, errors='coerce')

In [292]:
# Filter IMDb_movies_df for Release date > 2015
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Release Date'] > 2015]

# Remove all num values from Genre column
IMDb_movies_df['Genre'] = IMDb_movies_df['Genre'].str.replace(r'\d+', '')



In [293]:
# List all unique values in the Release date column
IMDb_movies_df['Release Date'].unique()

array([2021, 2020, 2018, 2023, 2022, 2017, 2016, 2019, 2024])

In [294]:
# Remove all values from IMDb_movies_df columns that have a value /N
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Movie Name'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Rating'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['isAdult'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Runtime Minutes'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['numVotes'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Rating'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Genre'] != '\\N']

In [295]:
IMDb_movies_df.shape

(77118, 7)

In [296]:
# Display the final table
IMDb_movies_df.head(5)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
1792,Istoriya grazhdanskoy voyny,Documentary,6.8,2021,0,94.0,64
32409,The Tango of the Widower and Its Distorting Mi...,Drama,6.4,2020,0,70.0,186
36947,The Other Side of the Wind,Drama,6.7,2018,0,122.0,7966
37953,Socialist Realism,Drama,7.6,2023,0,78.0,53
45841,Victor Seastrom,"Biography,Documentary",6.7,2021,0,65.0,67


In [297]:
# Filter any movies with less than 10000 votes
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['numVotes'] > 10000]

In [298]:
# Display the final table
IMDb_movies_df.head(5)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
110436,Mortal Kombat,"Action,Adventure,Fantasy",6.0,2021,0,110.0,188147
114550,Wazir,"Action,Crime,Drama",7.1,2016,0,103.0,19816
122345,Fahrenheit 451,"Drama,Sci-Fi,Thriller",5.0,2018,0,100.0,22392
125034,American Pastoral,"Crime,Drama,Mystery",6.1,2016,0,108.0,17946
126698,Motherless Brooklyn,"Crime,Drama,Mystery",6.8,2019,0,144.0,62334


In [299]:
IMDb_movies_df.shape

(2667, 7)

In [300]:
# List all duplicated Movie Names in IMDb_movies_df
IMDb_movies_df[IMDb_movies_df.duplicated(subset='Movie Name', keep=False)].sort_values(by='Movie Name')

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
153836,Aftermath,"Drama,Horror,Mystery",5.3,2021,0,114.0,18577
261362,Aftermath,"Drama,Mystery,Thriller",5.7,2017,0,94.0,26633
173119,Beast,"Action,Adventure,Drama",5.6,2022,0,93.0,39471
158784,Beast,"Action,Comedy,Thriller",5.2,2022,0,155.0,36150
272316,Beast,"Crime,Drama,Mystery",6.8,2017,0,107.0,15220
150710,Black Box,"Drama,Mystery,Thriller",7.2,2021,0,129.0,14169
166199,Black Box,"Horror,Mystery,Sci-Fi",6.2,2020,0,100.0,16957
283927,Champions,"Comedy,Drama,Family",7.2,2018,0,124.0,11684
189969,Champions,"Comedy,Drama,Sport",6.8,2023,0,124.0,24312
303811,Close,Drama,7.8,2022,0,104.0,32563


In [301]:
# Remove all rows where the Movie Name, Release Date and Runtime Minutes are duplicated
IMDb_movies_df = IMDb_movies_df.drop_duplicates(subset=['Movie Name', 'Release Date', 'Runtime Minutes'])

In [302]:
# Reset index
IMDb_movies_df.reset_index(drop=True, inplace=True)

In [303]:
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
0,Mortal Kombat,"Action,Adventure,Fantasy",6.0,2021,0,110.0,188147
1,Wazir,"Action,Crime,Drama",7.1,2016,0,103.0,19816
2,Fahrenheit 451,"Drama,Sci-Fi,Thriller",5.0,2018,0,100.0,22392
3,American Pastoral,"Crime,Drama,Mystery",6.1,2016,0,108.0,17946
4,Motherless Brooklyn,"Crime,Drama,Mystery",6.8,2019,0,144.0,62334
5,Alita: Battle Angel,"Action,Adventure,Sci-Fi",7.3,2019,0,122.0,290565
6,The Flash,"Action,Adventure,Fantasy",6.7,2023,0,144.0,201276
7,Danger Close,"Action,Drama,War",6.8,2019,0,118.0,14990
8,Shazam!,"Action,Adventure,Comedy",7.0,2019,0,132.0,380869
9,Wonder Woman,"Action,Adventure,Fantasy",7.3,2017,0,141.0,694150


In [304]:
IMDb_movies_df.shape

(2667, 7)

In [305]:
# New column called Random that randomly generates random unique integer values between 1 and 1000000
import random
random.seed(0)
IMDb_movies_df['Random'] = random.sample(range(1, 1000000), IMDb_movies_df.shape[0])

In [306]:
# how many duplicate values are there in the Random column?
IMDb_movies_df['Random'].duplicated().sum()

0

In [307]:
# sort the random column in ascending order
IMDb_movies_df.sort_values(by='Random', inplace=True)

# filter the top 1000 rows
IMDb_movies_df = IMDb_movies_df.head(1000)

In [308]:
# Reset index
IMDb_movies_df.reset_index(drop=True, inplace=True)

In [309]:
IMDb_movies_df.shape

(1000, 8)

In [310]:
IMDb_movies_df.head(10)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,Random
0,Phone Bhoot,"Comedy,Horror",5.2,2022,0,136.0,10841,73
1,Office Christmas Party,Comedy,5.9,2016,0,105.0,87808,253
2,The Boogeyman,"Horror,Mystery,Thriller",5.9,2023,0,98.0,41911,266
3,The Woman King,"Action,Drama,History",6.9,2022,0,135.0,71193,762
4,A Shaun the Sheep Movie: Farmageddon,"Adventure,Animation,Comedy",6.8,2019,0,86.0,15422,1062
5,Gemini Man,"Action,Sci-Fi,Thriller",5.7,2019,0,117.0,120953,1199
6,Doctor Strange,"Action,Adventure,Fantasy",7.5,2016,0,115.0,795835,1599
7,Sully,"Biography,Drama",7.4,2016,0,96.0,294926,1933
8,The Irishman,"Biography,Crime,Drama",7.8,2019,0,209.0,425536,2473
9,777 Charlie,"Adventure,Comedy,Drama",8.8,2022,0,164.0,38956,2581


### Dividing the Movies dataframe into 20 csv's

In [311]:
# divide the IMDb_movies_df into 20 separate dataframes with 50 rows each and save them as separate CSV files with the naming convention "IMDb_movies_df_1.csv", "IMDb_movies_df_2.csv", "IMDb_movies_df_3.csv", and so on in the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets'
import os

# drop random column
IMDb_movies_df.drop('Random', axis=1, inplace=True)

# Create the directory
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets'
if not os.path.exists(directory):
    os.makedirs(directory)

# Split the dataframe into 20 separate dataframes
for i in range(20):
    start = i * 50
    end = (i + 1) * 50
    df = IMDb_movies_df.iloc[start:end]
    df.to_csv(f'{directory}/IMDb_movies_df_{i + 1}.csv', index=False)

In [312]:
# # export the final table to a csv file
# IMDb_movies_df.to_csv('IMDb_movies.csv', index=False)

-----

## Importing all 20 Movies CSV

In [313]:
import pandas as pd
import os

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets'

# Initialize an empty dictionary to store the dataframes
dataframes = {}

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Remove the '.csv' extension and use the filename as the dictionary key
        dataframe_name = filename.replace('.csv', '')
        # Store the DataFrame in the dictionary
        dataframes[dataframe_name] = df

In [314]:
# Defining all dataframes as variables

imdb_movies_df_1 = dataframes['IMDb_movies_df_1']
imdb_movies_df_2 = dataframes['IMDb_movies_df_2']
imdb_movies_df_3 = dataframes['IMDb_movies_df_3']
imdb_movies_df_4 = dataframes['IMDb_movies_df_4']
imdb_movies_df_5 = dataframes['IMDb_movies_df_5']
imdb_movies_df_6 = dataframes['IMDb_movies_df_6']
imdb_movies_df_7 = dataframes['IMDb_movies_df_7']
imdb_movies_df_8 = dataframes['IMDb_movies_df_8']
imdb_movies_df_9 = dataframes['IMDb_movies_df_9']
imdb_movies_df_10 = dataframes['IMDb_movies_df_10']
imdb_movies_df_11 = dataframes['IMDb_movies_df_11']
imdb_movies_df_12 = dataframes['IMDb_movies_df_12']
imdb_movies_df_13 = dataframes['IMDb_movies_df_13']
imdb_movies_df_14 = dataframes['IMDb_movies_df_14']
imdb_movies_df_15 = dataframes['IMDb_movies_df_15']
imdb_movies_df_16 = dataframes['IMDb_movies_df_16']
imdb_movies_df_17 = dataframes['IMDb_movies_df_17']
imdb_movies_df_18 = dataframes['IMDb_movies_df_18']
imdb_movies_df_19 = dataframes['IMDb_movies_df_19']
imdb_movies_df_20 = dataframes['IMDb_movies_df_20']

In [315]:
# This code does the same as what is happening above but in a loop but when using the imdb_movies_df_* the highlight shows up as the fact that the dataframe does not exist. It was driving me nuts so I just did it manually

# # Loop through the numbers 1 to 20 to define each dataframe
# for i in range(1, 21):
#     # Construct the key name based on the current number
#     key_name = f'IMDb_movies_df_{i}'
#     # Use exec to create a variable with the dataframe
#     exec(f'imdb_movies_df_{i} = dataframes["{key_name}"]')

In [316]:
imdb_movies_df_20.head(5)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes
0,I Kill Giants,"Drama,Fantasy,Thriller",6.1,2017,0,106.0,32840
1,Whiskey Tango Foxtrot,"Biography,Comedy,Drama",6.6,2016,0,112.0,55860
2,Dunki,"Comedy,Drama",7.2,2023,0,160.0,62137
3,When We First Met,"Comedy,Fantasy,Romance",6.4,2018,0,97.0,53509
4,The Edge of Democracy,"Documentary,History",7.3,2019,0,121.0,15646


---

# URL Extracting script for all 20 dataframes

In [None]:
# Working URL extracting sctipt for all 20 dataframes

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Setting up Selenium
chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode to run without opening browser window
service = Service('path_to_chromedriver')  # Replace 'path_to_chromedriver' with the actual path
# Setup WebDriver (example with Chrome)
driver = webdriver.Chrome()

# Function to get the URL of User Reviews for a movie
def get_user_reviews_url(movie_name, release_date):
    try:
        # Open IMDb website
        driver.get("https://www.imdb.com/")
        time.sleep(2)  # Wait for page to load

        # Find search box and input movie name
        search_box = driver.find_element(By.ID, "suggestion-search")
        search_box.clear()
        search_box.send_keys(movie_name)
        search_box.send_keys(Keys.RETURN)
        time.sleep(2)  # Wait for search results to load

        # Find and click on the release year
        release_year_element = driver.find_element(By.XPATH, f"//*[contains(text(), '{release_date}')]")
        release_year_element.click()
        time.sleep(2)  # Wait for page to load

        # Find and click on User Reviews button
        user_reviews_button = driver.find_element(By.XPATH, "//a[contains(text(), 'User reviews')]")
        user_reviews_url = user_reviews_button.get_attribute('href')
        return user_reviews_url

    except Exception as e:
        print(f"Error occurred for {movie_name}: {str(e)}")
        return None

# Loop through each dataframe
for i in range(1, 21):
    # Dynamically generate dataframe name
    df_name = f'imdb_movies_df_{i}'
    # Assuming dataframes are stored in a dictionary or similar structure
    df = globals()[df_name]  # Replace with the actual way to access the dataframe

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        movie_name = row['Movie Name']
        release_date = row['Release Date']

        # Get User Reviews URL for the movie
        user_reviews_url = get_user_reviews_url(movie_name, release_date)

        # Update DataFrame with the URL
        df.at[index, 'User Reviews URL'] = user_reviews_url

    # Save the updated DataFrame
    output_path = f'/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL/imdb_movies_df_{i}_url.csv'
    df.to_csv(output_path, index=False)

# Close the browser
driver.quit()

-----

# Descriptive stats for all URL df

In [9]:
import os
import pandas as pd

# import all 20 csv files from the folder location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL' The files are named imdb_movies_df_1_url.csv, imdb_movies_df_2_url.csv, imdb_movies_df_3_url.csv, and so on till 20. I want to concatenate all these files into one dataframe called all_URL_df

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL'

# Initialize an empty list to store the dataframes
dataframes = []

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all the DataFrames in the list
all_URL_df = pd.concat(dataframes, ignore_index=True)

In [10]:
# Display the final table
all_URL_df.head(5)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,Prey,"Action,Adventure,Drama",7.1,2022,0,100.0,222499,https://www.imdb.com/title/tt11866324/reviews/...
1,The Bye Bye Man,"Drama,Fantasy,Horror",4.3,2017,0,96.0,25774,https://www.imdb.com/title/tt4030600/reviews/?...
2,Fyre,"Crime,Documentary,Music",7.2,2019,0,97.0,51546,https://www.imdb.com/title/tt9412098/reviews/?...
3,Escape Room,"Action,Adventure,Horror",6.4,2019,0,99.0,139759,https://www.imdb.com/title/tt5886046/reviews/?...
4,Salaar,"Action,Crime,Drama",6.5,2023,0,175.0,57702,https://www.imdb.com/title/tt13927994/reviews/...


In [11]:
all_URL_df.shape

(1000, 8)

In [12]:
# all null values in each column
all_URL_df.isnull().sum()

Movie Name           0
Genre                0
Rating               0
Release Date         0
isAdult              0
Runtime Minutes      0
numVotes             0
User Reviews URL    60
dtype: int64

In [13]:
# set datatypes for all_URL_df all columns
all_URL_df['Movie Name'] = all_URL_df['Movie Name'].astype('string')
all_URL_df['Genre'] = all_URL_df['Genre'].astype('string')
all_URL_df['Rating'] = all_URL_df['Rating'].astype('float')
all_URL_df['Release Date'] = all_URL_df['Release Date'].astype('int')
all_URL_df['isAdult'] = all_URL_df['isAdult'].astype('int')
all_URL_df['Runtime Minutes'] = all_URL_df['Runtime Minutes'].astype('float')
all_URL_df['numVotes'] = all_URL_df['numVotes'].astype('float')
all_URL_df['User Reviews URL'] = all_URL_df['User Reviews URL'].astype('string')


In [14]:
# filter for Charlie 777 in the all_URL_df dataframe
all_URL_df[all_URL_df['Movie Name'] == 'Troop Zero']

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
111,Troop Zero,"Comedy,Drama,Family",6.9,2019,0,94.0,12028.0,https://www.imdb.com/title/tt2404465/reviews/?...


-----

## Loading all 20 URL csv as Dataframes

In [3]:
# load all 20 csv from the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL' as individual dataframes with the same name as their file name but without the .csv extension

import os
import pandas as pd

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL'

# Initialize an empty dictionary to store the dataframes
dataframes = {}

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Remove the '.csv' extension and use the filename as the dictionary key
        dataframe_name = filename.replace('.csv', '')
        # Store the DataFrame in the dictionary
        dataframes[dataframe_name] = df

In [4]:
# Defining all dataframes as variables

imdb_movies_df_1_url = dataframes['imdb_movies_df_1_url']
imdb_movies_df_2_url = dataframes['imdb_movies_df_2_url']
imdb_movies_df_3_url = dataframes['imdb_movies_df_3_url']
imdb_movies_df_4_url = dataframes['imdb_movies_df_4_url']
imdb_movies_df_5_url = dataframes['imdb_movies_df_5_url']
imdb_movies_df_6_url = dataframes['imdb_movies_df_6_url']
imdb_movies_df_7_url = dataframes['imdb_movies_df_7_url']
imdb_movies_df_8_url = dataframes['imdb_movies_df_8_url']
imdb_movies_df_9_url = dataframes['imdb_movies_df_9_url']
imdb_movies_df_10_url = dataframes['imdb_movies_df_10_url']
imdb_movies_df_11_url = dataframes['imdb_movies_df_11_url']
imdb_movies_df_12_url = dataframes['imdb_movies_df_12_url']
imdb_movies_df_13_url = dataframes['imdb_movies_df_13_url']
imdb_movies_df_14_url = dataframes['imdb_movies_df_14_url']
imdb_movies_df_15_url = dataframes['imdb_movies_df_15_url']
imdb_movies_df_16_url = dataframes['imdb_movies_df_16_url']
imdb_movies_df_17_url = dataframes['imdb_movies_df_17_url']
imdb_movies_df_18_url = dataframes['imdb_movies_df_18_url']
imdb_movies_df_19_url = dataframes['imdb_movies_df_19_url']
imdb_movies_df_20_url = dataframes['imdb_movies_df_20_url']

In [5]:
imdb_movies_df_18_url.head(20)

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,24 Hours to Live,"Action,Sci-Fi,Thriller",5.8,2017,0,93.0,15968,https://www.imdb.com/title/tt5442456/reviews/?...
1,Promising Young Woman,"Crime,Drama,Mystery",7.5,2020,0,113.0,207699,https://www.imdb.com/title/tt9620292/reviews/?...
2,Love Today,"Comedy,Drama,Romance",8.0,2022,0,154.0,15188,https://www.imdb.com/title/tt22488728/reviews/...
3,American Made,"Action,Comedy,Crime",7.1,2017,0,115.0,205078,https://www.imdb.com/title/tt3532216/reviews/?...
4,Colossal,"Comedy,Drama,Fantasy",6.2,2016,0,109.0,68598,https://www.imdb.com/title/tt4680182/reviews/?...
5,Kedarnath,"Drama,Romance",6.7,2018,0,116.0,15036,https://www.imdb.com/title/tt7027278/reviews/?...
6,The Half of It,"Comedy,Drama",6.9,2020,0,104.0,42737,https://www.imdb.com/title/tt9683478/reviews/?...
7,Coming 2 America,"Comedy,Music",5.3,2021,0,110.0,74799,https://www.imdb.com/title/tt6802400/reviews/?...
8,Color Out of Space,"Horror,Mystery,Sci-Fi",6.2,2019,0,111.0,55288,https://www.imdb.com/title/tt5073642/reviews/?...
9,"Crouching Tiger, Hidden Dragon: Sword of Destiny","Action,Adventure,Drama",6.1,2016,0,96.0,20770,https://www.imdb.com/title/tt2652118/reviews/?...


--------

# Testing with small data for all scrapping

In [7]:
imdb_movies_df_1_url_small = imdb_movies_df_1_url.head(2)
imdb_movies_df_2_url_small = imdb_movies_df_2_url.head(2)

In [8]:
imdb_movies_df_1_url_small

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,Phone Bhoot,"Comedy,Horror",5.2,2022,0,136.0,10841,https://www.imdb.com/title/tt12740760/reviews/...
1,Office Christmas Party,Comedy,5.9,2016,0,105.0,87808,https://www.imdb.com/title/tt1711525/reviews/?...


In [9]:
imdb_movies_df_2_url_small

Unnamed: 0,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,Fighting with My Family,"Biography,Comedy,Drama",7.1,2019,0,108.0,86793,https://www.imdb.com/title/tt6513120/reviews/?...
1,Bill & Ted Face the Music,"Adventure,Comedy,Music",5.9,2020,0,91.0,52517,https://www.imdb.com/title/tt1086064/reviews/?...


---

# Scrapping code
-----

In [None]:
# Technically this code should scrape it for all 20 dataframes

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import traceback
import os

# Initialize WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)

# Function to scrape reviews with a limit of 1000 reviews per movie
def scrape_reviews(url):
    driver.get(url)
    reviews_data = []
    try:
        while len(reviews_data) < 1000:  # Limit to 1000 reviews
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#main > section > div.lister > div.lister-list")))
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            review_elements = soup.select("#main > section > div.lister > div.lister-list > div")
            if not review_elements:
                break
            for review_element in review_elements:
                if len(reviews_data) >= 1000:  # Check if we have already collected 1000 reviews
                    break
                try:
                    individual_rating_element = review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)")
                    individual_rating = int(individual_rating_element.text.strip()) if individual_rating_element else None
                    review_data = {
                        'Review Date': review_element.select_one(".review-date").text.strip(),
                        'Review Title': review_element.select_one(".title").text.strip(),
                        'Username': review_element.select_one(".display-name-link > a").text.strip(),
                        'Helpfulness': review_element.select_one(".actions.text-muted").text.strip(),
                        'Helpful Votes': int(re.findall(r'(\d+)', review_element.select_one(".actions.text-muted").text.strip())[0]),
                        'Total Votes': int(re.findall(r'(\d+)', review_element.select_one(".actions.text-muted").text.strip())[1]),
                        'Individual Rating': individual_rating,
                        'Review Text': review_element.select_one(".text.show-more__control").text.strip(),
                        'Spoiler Warning': review_element.select_one(".spoiler-warning").text.strip() if review_element.select_one(".spoiler-warning") else ""
                    }
                    reviews_data.append(review_data)
                except Exception as e:
                    print(f"Error occurred while extracting review data: {e}")
                    traceback.print_exc()
            load_more_button = driver.find_elements(By.ID, "load-more-trigger")
            if load_more_button and len(reviews_data) < 1000:  # Check if we can load more reviews
                driver.execute_script("arguments[0].click();", load_more_button[0])
                time.sleep(3)
            else:
                break
    except Exception as e:
        print(f"Error occurred: {e}")
        traceback.print_exc()
    finally:
        return reviews_data

# Directory to save CSV files
save_dir = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_Scrapped_data'

# Main loop to iterate through dataframes
for i in range(1, 21):  # Assuming there are 20 dataframes
    df_name = f'imdb_movies_df_{i}_url'
    df = eval(df_name)  # Dynamically get the dataframe
    all_reviews = []
    for index, row in df.iterrows():
        movie_reviews = scrape_reviews(row['User Reviews URL'])
        for review in movie_reviews:
            movie_data = row.to_dict()
            movie_data['Rating'] = movie_data.get('Rating')  # Ensure the 'Rating' key is from the dataframe
            review.update(movie_data)  # Add movie data to each review
        all_reviews.extend(movie_reviews)
    
    # Convert to DataFrame
    final_df = pd.DataFrame(all_reviews)
    
    # Save to CSV
    csv_file_path = os.path.join(save_dir, f'all_scrapped_data_{i}.csv')
    final_df.to_csv(csv_file_path, index=False)
    print(f"Dataframe {i} reviews saved to {csv_file_path}")

driver.quit()

# Importing all 20 scrapped CSV's and concatenating them
-----

In [1]:
# Import all 20 csv files from the folder location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_Scrapped_data' The files are named all_scrapped_data_1.csv, all_scrapped_data_2.csv, all_scrapped_data_3.csv, and so on till 20. I want to concatenate all these files into one dataframe called all_reviews_df. First merge the first 2 then clear the memory and then merge the next one with the just merged one and so on till all 20 are merged.

import os
import pandas as pd

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_Scrapped_data'

# Initialize an empty list to store the dataframes
dataframes = []

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all the DataFrames in the list
all_reviews_df = pd.concat(dataframes, ignore_index=True)

In [2]:
all_reviews_df

Unnamed: 0,Review Date,Review Title,Username,Helpfulness,Helpful Votes,Total Votes,Individual Rating,Review Text,Spoiler Warning,Movie Name,Genre,Rating,Release Date,isAdult,Runtime Minutes,numVotes,User Reviews URL
0,9 March 2023,Better than I expected.,deloudelouvain,44 out of 54 found this helpful.\n ...,44,54,8.0,Sharper didn't fail to entertain me. A perfect...,,Sharper,"Crime,Drama,Thriller",6.7,2023,0,116.0,29167,https://www.imdb.com/title/tt12573454/reviews/...
1,22 February 2023,Who's conning whom exactly?,paul-allaer,37 out of 46 found this helpful.\n ...,37,46,6.0,"As ""Sharper"" (2023 release; 116 min.) opens, w...",,Sharper,"Crime,Drama,Thriller",6.7,2023,0,116.0,29167,https://www.imdb.com/title/tt12573454/reviews/...
2,19 February 2023,I liked this more than I expected to,bk753,88 out of 111 found this helpful.\n ...,88,111,8.0,I have to stop coming here and using User Revi...,,Sharper,"Crime,Drama,Thriller",6.7,2023,0,116.0,29167,https://www.imdb.com/title/tt12573454/reviews/...
3,19 February 2023,In a genre that is hard to succeed in these da...,jtindahouse,51 out of 74 found this helpful.\n ...,51,74,8.0,'Sharper' is my kind of movie. But there's a r...,,Sharper,"Crime,Drama,Thriller",6.7,2023,0,116.0,29167,https://www.imdb.com/title/tt12573454/reviews/...
4,11 February 2023,A Stylish Thriller That Misses the Mark,FilmFanatic2023,84 out of 123 found this helpful.\n ...,84,123,6.0,A stylish thriller that tries to mimic the sty...,,Sharper,"Crime,Drama,Thriller",6.7,2023,0,116.0,29167,https://www.imdb.com/title/tt12573454/reviews/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926216,29 December 2022,Finger-lickin' bad,stevejones-9,28 out of 43 found this helpful.\n ...,28,43,5.0,"So, on paper this film has potential, right? H...",Warning: Spoilers,The Banshees of Inisherin,"Comedy,Drama",7.7,2022,0,114.0,245171,https://www.imdb.com/title/tt11813216/reviews/...
926217,24 December 2022,Weird and Truly Hideous!,dan_slentz,26 out of 52 found this helpful.\n ...,26,52,3.0,What an UN-feel-good movie. Unpleasant. Uncomf...,,The Banshees of Inisherin,"Comedy,Drama",7.7,2022,0,114.0,245171,https://www.imdb.com/title/tt11813216/reviews/...
926218,4 February 2023,Nominated for HOW many Oscars?,mike-499-205871,9 out of 13 found this helpful.\n ...,9,13,3.0,Watched this after reading all the rave review...,,The Banshees of Inisherin,"Comedy,Drama",7.7,2022,0,114.0,245171,https://www.imdb.com/title/tt11813216/reviews/...
926219,18 December 2022,The quality of movies has gone so far down...,DrProfessor,14 out of 25 found this helpful.\n ...,14,25,6.0,The quality of movies has gone so far down rev...,Warning: Spoilers,The Banshees of Inisherin,"Comedy,Drama",7.7,2022,0,114.0,245171,https://www.imdb.com/title/tt11813216/reviews/...


In [3]:
all_reviews_df.shape

(926221, 17)

In [4]:
all_reviews_df.columns

Index(['Review Date', 'Review Title', 'Username', 'Helpfulness',
       'Helpful Votes', 'Total Votes', 'Individual Rating', 'Review Text',
       'isAdult', 'Runtime Minutes', 'numVotes', 'User Reviews URL'],
      dtype='object')

In [5]:
# all unique values in the Movie name column in all_reviews_df
all_reviews_df['Movie Name'].unique()

array(['Sharper', 'Kabali', 'Happy as Lazzaro', 'Unlocked',
       'Thanksgiving', 'The Last Kingdom: Seven Kings Must Die',
       'Knock at the Cabin', 'The Meyerowitz Stories',
       'In a Valley of Violence', 'Quo Vadis, Aida?', 'Nightbooks',
       'Resident Evil: Welcome to Raccoon City', 'The Out-Laws',
       'Free Guy', 'Down a Dark Hall', 'Moana',
       'Clifford the Big Red Dog', 'The Lodge', 'Cadaver', 'Loving Pablo',
       'Hamilton', "Pete's Dragon", 'Countdown', 'Nocturne', 'Long Shot',
       'El Camino: A Breaking Bad Movie', 'At the End of the Tunnel',
       'Pitch Perfect 3', "Won't You Be My Neighbor?", 'Tall Girl',
       'Kaiva', 'The Spy Who Dumped Me', 'The Possession of Hannah Grace',
       'A Private War', 'Jersey', 'A Hero', 'Cats',
       'Munich: The Edge of War', 'Come Play', 'Can You Ever Forgive Me?',
       'Mank', 'Prospect', 'Rambo: Last Blood', 'The Zone of Interest',
       'Ben-Hur', "He's All That", 'Loving', 'The Accountant',
       'Ghost S

In [6]:
# Number of all unique values in the Movie name column in all_reviews_df
all_reviews_df['Movie Name'].nunique()

937

In [9]:
# How many times a value appears in the Movie name column in all_reviews_df
repeats = all_reviews_df['Movie Name'].value_counts()

In [10]:
repeats.head(25)

Movie Name
Beast                             2000
The Guilty                        2000
The Good Neighbor                 2000
Vox Lux                           1000
Superintelligence                 1000
The Suicide Squad                 1000
Shaft                             1000
Lamb                              1000
Happening                         1000
The Laundromat                    1000
A Monster Calls                   1000
Khaali Peeli                      1000
Nitram                            1000
Truth or Dare                     1000
Everything, Everything            1000
Perfect Strangers                 1000
Rust Creek                        1000
Mid90s                            1000
The Man Who Killed Don Quixote    1000
Collective                        1000
Halloween Ends                    1000
The Farewell                      1000
tick, tick... BOOM!               1000
Kickboxer: Vengeance              1000
Vivegam                           1000
Name: count, d

In [12]:
# filter repeats for values more than 999
repeats = repeats[repeats > 999]

In [14]:
repeats.shape

(923,)