# Getting a list of movies
------

In [None]:
import pandas as pd

In [None]:
# Adjust these file paths according to where you've saved the downloaded datasets
basics_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb title basics.tsv'
ratings_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb Title Ratings.tsv'

In [None]:
# Reading the basics file with the correct column names
basics_df = pd.read_csv(
    basics_path, 
    sep='\t', 
    low_memory=False, 
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres', 'isAdult', 'startYear', 'runtimeMinutes']
)

In [None]:
# Filtering for movies
movies_df = basics_df[basics_df['titleType'] == 'movie']

In [None]:
# Reading the ratings file
ratings_df = pd.read_csv(ratings_path, sep='\t', usecols=['tconst', 'averageRating', 'numVotes'])

# Merging the datasets on 'tconst' to combine movie details with ratings
merged_df = pd.merge(movies_df, ratings_df, on='tconst')

# Selecting and renaming the columns
IMDb_movies_df = merged_df[['primaryTitle', 'genres', 'averageRating', 'startYear', 'isAdult', 'runtimeMinutes', 'numVotes']]
IMDb_movies_df.columns = ['Movie Name', 'Genre', 'Rating', 'Release Date', 'isAdult', 'Runtime Minutes', 'numVotes']

In [None]:
# Remove all values from Release date that are not numeric
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Release Date'].str.isnumeric()]

In [None]:
# set datatypes for IMDb_movies_df
IMDb_movies_df['Release Date'] = IMDb_movies_df['Release Date'].astype('int')
IMDb_movies_df['Runtime Minutes'] = IMDb_movies_df['Runtime Minutes'].apply(pd.to_numeric, errors='coerce')
IMDb_movies_df['numVotes'] = IMDb_movies_df['numVotes'].apply(pd.to_numeric, errors='coerce')

In [None]:
# Filter IMDb_movies_df for Release date > 2015
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Release Date'] > 2015]

# Remove all num values from Genre column
IMDb_movies_df['Genre'] = IMDb_movies_df['Genre'].str.replace(r'\d+', '')

In [None]:
# Remove all values from IMDb_movies_df columns that have a value /N
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Movie Name'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Rating'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['isAdult'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Runtime Minutes'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['numVotes'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Rating'] != '\\N']
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['Genre'] != '\\N']

In [None]:
# Filter any movies with less than 10000 votes
IMDb_movies_df = IMDb_movies_df[IMDb_movies_df['numVotes'] > 10000]

In [None]:
# Remove all rows where the Movie Name, Release Date and Runtime Minutes are duplicated
IMDb_movies_df = IMDb_movies_df.drop_duplicates(subset=['Movie Name', 'Release Date', 'Runtime Minutes'])

In [None]:
# Reset index
IMDb_movies_df.reset_index(drop=True, inplace=True)

In [None]:
IMDb_movies_df.shape

### Dividing the Movies dataframe into 40 csv's

In [None]:
import os
import math

# Create the directory
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets'
if not os.path.exists(directory):
    os.makedirs(directory)

# Calculate the number of rows per chunk
chunk_size = 2667 // 40
remaining_rows = 2667 % 40

# Split the dataframe into chunks and save as separate CSV files
for i in range(40):
    start = i * chunk_size
    end = start + chunk_size
    if i == 39:  # Last chunk
        end += remaining_rows
    df = IMDb_movies_df.iloc[start:end]
    df.to_csv(f'{directory}/IMDb_movies_df_{i + 1}.csv', index=False)

## Importing all Movies CSV
-----

In [None]:
import pandas as pd
import os

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets'

# Initialize an empty dictionary to store the dataframes
dataframes = {}

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Remove the '.csv' extension and use the filename as the dictionary key
        dataframe_name = filename.replace('.csv', '')
        # Store the DataFrame in the dictionary
        dataframes[dataframe_name] = df

In [None]:
# Defining all dataframes as variables
for i in range(1, 41):
    globals()[f"imdb_movies_df_{i}"] = dataframes[f"imdb_movies_df_{i}"]

In [None]:
imdb_movies_df_1.shape

# URL Extracting script for all dataframes
------

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Setting up Selenium
chrome_options = Options()
chrome_options.add_argument("--headless")  # Headless mode to run without opening browser window
service = Service('path_to_chromedriver')  # Replace 'path_to_chromedriver' with the actual path
# Setup WebDriver (example with Chrome)
driver = webdriver.Chrome()

# Function to get the URL of User Reviews for a movie
def get_user_reviews_url(movie_name, release_date):
    try:
        # Open IMDb website
        driver.get("https://www.imdb.com/")
        time.sleep(4)  # Wait for page to load

        # Find search box and input movie name
        search_box = driver.find_element(By.ID, "suggestion-search")
        search_box.clear()
        search_box.send_keys(movie_name)
        search_box.send_keys(Keys.RETURN)
        time.sleep(4)  # Wait for search results to load

        # Find and click on the release year
        release_year_element = driver.find_element(By.XPATH, f"//*[contains(text(), '{release_date}')]")
        release_year_element.click()
        time.sleep(4)  # Wait for page to load

        # Find User Reviews button using the updated XPath selector
        user_reviews_button = driver.find_element(By.XPATH, "//a[@class='ipc-link ipc-link--baseAlt ipc-link--inherit-color' and @role='button' and contains(@href, '/reviews')]")
        user_reviews_url = user_reviews_button.get_attribute('href')
        return user_reviews_url

    except Exception as e:
        print(f"Error occurred for {movie_name}: {str(e)}")
        return None

# Loop through each dataframe
for i in range(1, 41):
    # Dynamically generate dataframe name
    df_name = f'imdb_movies_df_{i}'
    # Assuming dataframes are stored in a dictionary or similar structure
    df = globals()[df_name]  # Replace with the actual way to access the dataframe

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        movie_name = row['Movie Name']
        release_date = row['Release Date']

        # Get User Reviews URL for the movie
        user_reviews_url = get_user_reviews_url(movie_name, release_date)

        # Update DataFrame with the URL
        df.at[index, 'User Reviews URL'] = user_reviews_url

    # Save the updated DataFrame
    output_path = f'/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL/imdb_movies_df_{i}.csv'
    df.to_csv(output_path, index=False)

# Close the browser
driver.quit()

-----

# Descriptive stats for all URL df
------

In [None]:
import os
import pandas as pd

# import all 20 csv files from the folder location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL' The files are named imdb_movies_df_1_url.csv, imdb_movies_df_2_url.csv, imdb_movies_df_3_url.csv, and so on till 20. I want to concatenate all these files into one dataframe called all_URL_df

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL'

# Initialize an empty list to store the dataframes
dataframes = []

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all the DataFrames in the list
all_URL_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Display the final table
all_URL_df.head()

In [None]:
all_URL_df.shape

In [None]:
# all null values in each column
all_URL_df.isnull().sum()

In [None]:
# List all unique values in the Release Date in the all_URL_df
all_URL_df['Release Date'].unique()

In [None]:
# List the number of duplicate values which have the same 'release_date', 'Runtime Minutes' and 'Movie Name' in the all_URL_df.
duplicate_count = all_URL_df.duplicated(subset=['Release Date', 'Runtime Minutes', 'Movie Name', 'Rating']).sum()
print(f"Number of duplicate values: {duplicate_count}")

# Delete all duplicate rows where the 'release_date', 'Runtime Minutes' and 'Movie Name' in the all_URL_df are the same.
all_URL_df = all_URL_df.drop_duplicates(subset=['Release Date', 'Runtime Minutes', 'Movie Name', 'Rating'])

In [None]:
# List the number of duplicate values which have the same 'release_date', 'Runtime Minutes' and 'Movie Name' in the all_URL_df.
duplicate_count = all_URL_df.duplicated(subset=['Release Date', 'Runtime Minutes', 'Movie Name', 'Rating']).sum()
print(f"Number of duplicate values: {duplicate_count}")

In [None]:
all_URL_df.shape

In [None]:
# all null values in each column
all_URL_df.isnull().sum()

In [None]:
# remove all rows where the User Reviews URL is null
all_URL_df = all_URL_df[all_URL_df['User Reviews URL'].notnull()]

all_URL_df = all_URL_df[all_URL_df['Runtime Minutes'].notnull()]

In [None]:
# all null values in each column
all_URL_df.isnull().sum()

In [None]:
# set datatypes for all_URL_df all columns
all_URL_df['Movie Name'] = all_URL_df['Movie Name'].astype('string')
all_URL_df['Genre'] = all_URL_df['Genre'].astype('string')
all_URL_df['Rating'] = all_URL_df['Rating'].astype('float')
all_URL_df['Release Date'] = all_URL_df['Release Date'].astype('int')
all_URL_df['isAdult'] = all_URL_df['isAdult'].astype('int')
all_URL_df['Runtime Minutes'] = all_URL_df['Runtime Minutes'].astype('float')
all_URL_df['numVotes'] = all_URL_df['numVotes'].astype('float')
all_URL_df['User Reviews URL'] = all_URL_df['User Reviews URL'].astype('string')

In [None]:
# export all_URL_df to a csv file
all_URL_df.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/all_URL_df.csv', index=False)

In [None]:
# Divide the all_URL_df into 40 chunks and name each dataframe as imdb_movies_df_i_url where i is the chunk number and export them as individual csv files to the folder location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL'
chunk_size = 39
remaining_rows = 1568 % 40

for i in range(40):
    start = i * chunk_size
    end = start + chunk_size
    if i == 39:  # Last chunk
        end += remaining_rows
    df = all_URL_df.iloc[start:end]
    df.to_csv(f'{directory}/imdb_movies_df_{i + 1}_url.csv', index=False)

-----

## Loading all 40 URL csv as Dataframes

In [None]:
# load all 20 csv from the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL' as individual dataframes with the same name as their file name but without the .csv extension

import os
import pandas as pd

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_URL'

# Initialize an empty dictionary to store the dataframes
dataframes = {}

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Remove the '.csv' extension and use the filename as the dictionary key
        dataframe_name = filename.replace('.csv', '')
        # Store the DataFrame in the dictionary
        dataframes[dataframe_name] = df

In [None]:
# Defining all dataframes as variables
for i in range(1, 41):
    globals()[f"imdb_movies_df_{i}_url"] = dataframes[f"imdb_movies_df_{i}_url"]

# Scrapping IMDb website code
-----

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import re
import traceback
import os

# Initialize WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)

def scrape_reviews(url):
    if not url:  # Check if the URL is None or empty and skip if true
        return []
    driver.get(url)
    reviews_data = []
    scraped_reviews = set()  # To track scraped reviews and avoid duplicates
    
    try:
        while len(reviews_data) < 1000:  # Limit to 1000 reviews
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#main > section > div.lister > div.lister-list")))
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            review_elements = soup.select("#main > section > div.lister > div.lister-list > div")
            if not review_elements:
                break
            for review_element in review_elements:
                if len(reviews_data) >= 1000:  # Check if we have already collected 1000 reviews
                    break
                try:
                    individual_rating_element = review_element.select_one(".ipl-ratings-bar > span > span:nth-child(2)")
                    individual_rating = int(individual_rating_element.text.strip()) if individual_rating_element else None
                    review_data = {
                        'Review Date': review_element.select_one(".review-date").text.strip(),
                        'Review Title': review_element.select_one(".title").text.strip(),
                        'Username': review_element.select_one(".display-name-link > a").text.strip(),
                        'Helpfulness': review_element.select_one(".actions.text-muted").text.strip(),
                        'Helpful Votes': int(re.findall(r'(\d+)', review_element.select_one(".actions.text-muted").text.strip())[0]),
                        'Total Votes': int(re.findall(r'(\d+)', review_element.select_one(".actions.text-muted").text.strip())[1]),
                        'Individual Rating': individual_rating,
                        'Review Text': review_element.select_one(".text.show-more__control").text.strip(),
                        'Spoiler Warning': review_element.select_one(".spoiler-warning").text.strip() if review_element.select_one(".spoiler-warning") else ""
                    }
                    review_identifier = (review_data['Review Date'], review_data['Username'])
                    if review_identifier not in scraped_reviews:
                        reviews_data.append(review_data)
                        scraped_reviews.add(review_identifier)
                except Exception as e:
                    print(f"Error occurred while extracting review data: {e}")
                    traceback.print_exc()
            try:
                load_more_button = wait.until(EC.element_to_be_clickable((By.ID, "load-more-trigger")))
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)
                load_more_button.click()
                time.sleep(3)  # Adjust sleep time as necessary
            except TimeoutException:
                print("No more reviews to load.")
                break
    except Exception as e:
        print(f"Error occurred: {e}")
        traceback.print_exc()
    finally:
        return reviews_data

# Directory to save CSV files
save_dir = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_Scrapped_data'

# Main loop to iterate through dataframes
for i in range(26, 41):  # Adjust range as necessary
    df_name = f'imdb_movies_df_{i}_url'
    df = eval(df_name)  # Dynamically get the dataframe
    all_reviews = []
    for index, row in df.iterrows():
        if pd.notnull(row['User Reviews URL']):  # Check if the URL is not null
            movie_reviews = scrape_reviews(row['User Reviews URL'])
            for review in movie_reviews:
                movie_data = row.to_dict()
                movie_data['Rating'] = movie_data.get('Rating')  # Ensure the 'Rating' key is from the dataframe
                review.update(movie_data)  # Add movie data to each review
            all_reviews.extend(movie_reviews)
    
    # Convert to DataFrame
    final_df = pd.DataFrame(all_reviews)
    
    # Save to CSV
    csv_file_path = os.path.join(save_dir, f'all_scrapped_data_{i}.csv')
    final_df.to_csv(csv_file_path, index=False)
    print(f"Dataframe {i} reviews saved to {csv_file_path}")

driver.quit()

# Importing all 40 scrapped CSV's and concatenating them
-----

In [None]:
# Import all 20 csv files from the folder location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_Scrapped_data' The files are named all_scrapped_data_1.csv, all_scrapped_data_2.csv, all_scrapped_data_3.csv, and so on till 20. I want to concatenate all these files into one dataframe called all_reviews_df. First merge the first 2 then clear the memory and then merge the next one with the just merged one and so on till all 20 are merged.

import os
import pandas as pd

# Define the directory where the CSV files are located
directory = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/20_movies_datasets_with_Scrapped_data'

# Initialize an empty list to store the dataframes
dataframes = []

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all the DataFrames in the list
all_reviews_df = pd.concat(dataframes, ignore_index=True)

In [None]:
all_reviews_df.head(2)

In [None]:
all_reviews_df.shape

In [None]:
# Export all_reviews_df to a csv file
all_reviews_df.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/all_reviews_df.csv', index=False)

# Importing the CSV and doing further addition of data and cleaning
-----

In [None]:
import pandas as pd

# Load the all_reviews_df
all_reviews_df = pd.read_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/all_reviews_df.csv')

In [None]:
# Number of all unique values in the Movie name column in all_reviews_df
all_reviews_df['Movie Name'].nunique()

In [None]:
# How many times a value appears in the Movie name column in all_reviews_df
repeats = all_reviews_df['Movie Name'].value_counts()

repeats.head(25)

# Cleaning the all_reviews_df dataset of movies less than 100 reviews
--------

In [None]:
# count all movies in a new dataframe that have less than 1000 reviews
less_than_100 = all_reviews_df['Movie Name'].value_counts()
less_than_100 = less_than_100[less_than_100 < 100]
less_than_100.shape

In [None]:
# List of all the movies that have less than 1000 reviews.
less_than_100

In [None]:
less_than_100.shape

In [None]:
all_reviews_df.columns

In [None]:
# Drop all movies that have less than 1000 reviews from all_reviews_df
all_reviews_df = all_reviews_df[~all_reviews_df['Movie Name'].isin(less_than_100.index)]

In [None]:
all_reviews_df.shape

In [None]:
all_reviews_df.tail(2)

In [None]:
# Drop the Helpfulness column from all_reviews_df
all_reviews_df.drop('Helpfulness', axis=1, inplace=True)

------

## Adding additional columns from IMDb dataset

In [None]:
import pandas as pd

# Define file paths for the datasets
basics_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb title basics.tsv'
ratings_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb Title Ratings.tsv'
akas_path = '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/Data/IMDb Title AKAs.tsv'

# Load the datasets
basics_df = pd.read_csv(basics_path, sep='\t', usecols=['tconst', 'titleType', 'primaryTitle', 'genres', 'isAdult', 'startYear', 'runtimeMinutes'])
ratings_df = pd.read_csv(ratings_path, sep='\t', usecols=['tconst', 'averageRating', 'numVotes'])
akas_df = pd.read_csv(akas_path, sep='\t', usecols=['titleId', 'region', 'language', 'types'])

# Rename 'titleId' to 'tconst' in akas_df for consistency
akas_df.rename(columns={'titleId': 'tconst'}, inplace=True)

# Merge the DataFrames on 'tconst'
merged_df = basics_df.merge(ratings_df, on='tconst', how='left')
merged_df = merged_df.merge(akas_df, on='tconst', how='left')

# Filter the types column for movies only
merged_df = merged_df[merged_df['titleType'] == 'movie']

# Convert the 'startYear' column to numeric
merged_df['startYear'] = pd.to_numeric(merged_df['startYear'], errors='coerce')

# drop all null values in numVotes in merged_df
merged_df = merged_df[merged_df['numVotes'] != '\\N']

# Convert the 'numVotes' column to strings
merged_df['numVotes'] = merged_df['numVotes'].astype(str)

# Group the data by 'primaryTitle', 'numVotes', and 'startYear' and concatenate the values in the other columns
merged_df = merged_df.groupby(['primaryTitle', 'numVotes', 'startYear']).agg({
    'region': lambda x: ','.join(x.astype(str)),
    'language': lambda x: ','.join(x.astype(str)),
    'types': lambda x: ','.join(x.astype(str)),
}).reset_index()

# Rename primaryTitle to Movie Name
merged_df.rename(columns={'primaryTitle': 'Movie Name'}, inplace=True)

# Rename startYear to Release Date
merged_df.rename(columns={'startYear': 'Release Date'}, inplace=True)

In [None]:
merged_df.shape

In [None]:
merged_df

In [None]:
# data types for movie_name, numVotes, Release Date in merged_df
merged_df.loc[:, 'Movie Name'] = merged_df['Movie Name'].astype(str)
merged_df.loc[:, 'numVotes'] = pd.to_numeric(merged_df['numVotes'], errors='coerce').astype(pd.Int64Dtype())
merged_df.loc[:, 'Release Date'] = pd.to_numeric(merged_df['Release Date'], errors='coerce').astype(pd.Int64Dtype())

In [None]:
# set data types for movie_name, numVotes, Release Date in all_reviews_df
all_reviews_df.loc[:, 'Movie Name'] = all_reviews_df['Movie Name'].astype(str)
all_reviews_df.loc[:, 'numVotes'] = all_reviews_df['numVotes'].astype(float).astype(int)
all_reviews_df.loc[:, 'Release Date'] = all_reviews_df['Release Date'].astype(float).astype(int)

In [None]:
# # Merge the DataFrames merged_df and all_reviews_df if the Movie name, numVotes and Release Date are the same in both DataFrames then bring the region, language, types and isOriginalTitle from the merged_df to all_reviews_df
# merged_df = all_reviews_df.merge(merged_df, on=['Movie Name', 'numVotes', 'Release Date'], how='left')

merged_df = all_reviews_df.merge(merged_df[['Movie Name', 'numVotes', 'Release Date', 'region', 'language', 'types']], 
                                        on=['Movie Name', 'numVotes', 'Release Date'], 
                                        how='left')

# Display the final table
merged_df.head(5)

In [None]:
merged_df.shape

-------

# Creating dummy variables
----

In [None]:
# In the merged_df dataframe create dummy variables for the region column in the merged_df dataframe. the regions are separated by commas. The dummy variables should be 1 if the region is present and 0 if not. The dummy variables should be named after the region with the prefix 'region_'.

# Split the 'region' column by commas and expand the split strings into separate columns
region_dummies = merged_df['region'].str.get_dummies(sep=',')
# Add a prefix to the column names
region_dummies = region_dummies.add_prefix('region_')

# Concatenate the dummy variables to the original DataFrame
merged_df = pd.concat([merged_df, region_dummies], axis=1)

In [None]:
# In the merged_df dataframe create dummy variables for the language column in the merged_df dataframe. the languages are separated by commas. The dummy variables should be 1 if the language is present and 0 if not. The dummy variables should be named after the language with the prefix 'language_'.

# Split the 'language' column by commas and expand the split strings into separate columns
language_dummies = merged_df['language'].str.get_dummies(sep=',')
# Add a prefix to the column names
language_dummies = language_dummies.add_prefix('language_')

# Concatenate the dummy variables to the original DataFrame
merged_df = pd.concat([merged_df, language_dummies], axis=1)

In [None]:
# In the merged_df dataframe create dummy variables for the language column in the merged_df dataframe. the languages are separated by commas. The dummy variables should be 1 if the language is present and 0 if not. The dummy variables should be named after the language with the prefix 'language_'.

# Split the 'language' column by commas and expand the split strings into separate columns
types_dummies = merged_df['types'].str.get_dummies(sep=',')
# Add a prefix to the column names
types_dummies = types_dummies.add_prefix('types_')

# Concatenate the dummy variables to the original DataFrame
merged_df = pd.concat([merged_df, types_dummies], axis=1)

In [None]:
# In the merged_df dataframe create dummy variables for the language column in the merged_df dataframe. the languages are separated by commas. The dummy variables should be 1 if the language is present and 0 if not. The dummy variables should be named after the language with the prefix 'language_'.

# Split the 'language' column by commas and expand the split strings into separate columns
Genre_dummies = merged_df['Genre'].str.get_dummies(sep=',')
# Add a prefix to the column names
Genre_dummies = Genre_dummies.add_prefix('Genre_')

# Concatenate the dummy variables to the original DataFrame
merged_df = pd.concat([merged_df, Genre_dummies], axis=1)

In [None]:
# Create a dummy variable for the 'Spoiler Warning' column in the merged_df dataframe. The dummy variable should be 1 if the 'Spoiler Warning' column has the value 'Warning: Spoilers' else it will be 0.

# Create a dummy variable for the 'Spoiler Warning' column
merged_df['Spoiler Warning'] = (merged_df['Spoiler Warning'] == 'Warning: Spoilers').astype(int)

In [None]:
merged_df.shape

----

### Additional null value cleaning in the dataset

In [None]:
# Drop the following columns User Reviews URL, region, language, types
merged_df.drop(['User Reviews URL', 'region', 'language', 'types'], axis=1, inplace=True)

In [None]:
# Drop all rows with the missing value in Review Title and Individual Rating columns in the merged_df dataframe
merged_df = merged_df[merged_df['Review Title'].notnull()]
merged_df = merged_df[merged_df['Individual Rating'].notnull()]

In [None]:
merged_df.shape

-----

### Removing all rows with the same Review Date, Review Title, Username, Review Text, Spoiler Warning, Movie Name, numVotes, Release Date, Rating, Runtime Minutes, isAdult, and Genre

In [None]:
# in merged_df if there are rows with the same Review Date, Review Title, Username, Review Text, Spoiler Warning, Movie Name, numVotes, Release Date, Rating, Runtime Minutes, isAdult, and Genre then remove all but keep one out of them.

# Remove all rows with the same Review Date, Review Title, Username, Review Text, Spoiler Warning, Movie Name, numVotes, Release Date, Rating, Runtime Minutes, isAdult, and Genre
merged_df = merged_df.drop_duplicates(subset=['Review Date', 'Review Title', 'Username', 'Review Text', 'Movie Name', 'numVotes', 'Release Date', 'Rating', 'Runtime Minutes'])

# Reset index
merged_df.reset_index(drop=True, inplace=True)

In [None]:
merged_df.shape

----

In [None]:
# export the all_reviews_df dataframe to a csv file in the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/all_reviews_df.csv'
merged_df.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/merged_df.csv', index=False)

In [None]:
# Clear all variables and memory
%reset -f

# Final dataframe is called **Movie_reviews_df**
---------

In [2]:
import pandas as pd

# load all_reviews_df.csv into a dataframe called all_reviews_df
all_final_reviews_df = pd.read_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/merged_df.csv')

In [3]:
all_final_reviews_df.head(2)

Unnamed: 0,Review Date,Review Title,Username,Helpful Votes,Total Votes,Individual Rating,Review Text,Spoiler Warning,Movie Name,Genre,...,Genre_Music,Genre_Musical,Genre_Mystery,Genre_News,Genre_Romance,Genre_Sci-Fi,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western
0,15 February 2023,I Don't Understand The Negative Reviews,varun-25071997,104,119,9.0,I don't know much about depression but I do kn...,0,The Son,Drama,...,0,0,0,0,0,0,0,0,0,0
1,16 February 2023,A teenager's psychological trauma and the cons...,madanmarwah,51,59,7.0,Just a bit of research into the works of write...,0,The Son,Drama,...,0,0,0,0,0,0,0,0,0,0


In [4]:
all_final_reviews_df.shape

(852981, 229)

# Vander Sentiment Score calculation
-----

### Sentiment score for Review Title

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
def get_vader_sentiment_score(review):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(review)
    return sentiment_scores['compound']

# Apply the VADER sentiment analysis function to the 'Review Title' column in parallel
all_final_reviews_df['Sentiment_score__Review_title'] = Parallel(n_jobs=-1)(delayed(get_vader_sentiment_score)(review) for review in all_final_reviews_df['Review Title'])

# Calculate the Mean Squared Error (MSE) between VADER sentiment scores and individual ratings
mse = mean_squared_error(all_final_reviews_df['Individual Rating'], all_final_reviews_df['Sentiment_score__Review_title'])
print("Mean Squared Error (MSE) using VADER:", mse)

### Sentiment score for Review Text

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
def get_vader_sentiment_score(review):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(review)
    return sentiment_scores['compound']

# Apply the VADER sentiment analysis function to the 'Review Title' column in parallel
all_final_reviews_df['Sentiment_score__Review_text'] = Parallel(n_jobs=-1)(delayed(get_vader_sentiment_score)(review) for review in all_final_reviews_df['Review Text'])

# Calculate the Mean Squared Error (MSE) between VADER sentiment scores and individual ratings
mse = mean_squared_error(all_final_reviews_df['Individual Rating'], all_final_reviews_df['Sentiment_score__Review_text'])
print("Mean Squared Error (MSE) using VADER:", mse)

Mean Squared Error (MSE) using VADER: 42.7677477563978


In [7]:
all_final_reviews_df

Unnamed: 0,Review Date,Review Title,Username,Helpful Votes,Total Votes,Individual Rating,Review Text,Spoiler Warning,Movie Name,Genre,...,Genre_Mystery,Genre_News,Genre_Romance,Genre_Sci-Fi,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Sentiment_score__Review_title,Sentiment_score__Review_text
0,15 February 2023,I Don't Understand The Negative Reviews,varun-25071997,104,119,9.0,I don't know much about depression but I do kn...,0,The Son,Drama,...,0,0,0,0,0,0,0,0,0.4585,-0.6966
1,16 February 2023,A teenager's psychological trauma and the cons...,madanmarwah,51,59,7.0,Just a bit of research into the works of write...,0,The Son,Drama,...,0,0,0,0,0,0,0,0,-0.4215,-0.9495
2,15 October 2022,The Son is an emotionally devastating film wit...,msbreviews,95,121,8.0,"If you enjoy reading my Spoiler-Free thoughts,...",1,The Son,Drama,...,0,0,0,0,0,0,0,0,-0.7351,0.3612
3,29 March 2023,Disturbing and profound,magnuslhad,45,53,9.0,The Son is every parent's nightmare. Reminisce...,0,The Son,Drama,...,0,0,0,0,0,0,0,0,-0.5106,0.8555
4,27 May 2023,Extraordinary!,acec-29548,67,76,10.0,Not all movies are meant to entertain. Some mo...,0,The Son,Drama,...,0,0,0,0,0,0,0,0,0.0000,-0.8035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852976,19 January 2023,"Everyone got it wrong? It's a Metaphor, a cyni...",agelos-pikoulas,1,11,8.0,I'm surprised none has called the great Metaph...,1,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0,-0.8625,-0.5159
852977,3 January 2023,"Brilliant director, story, and actors.",byronduquettea,2,6,8.0,Martin Mcdonagh directs yet another brilliant ...,0,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0,0.5859,0.7717
852978,25 January 2023,Your single act of stubboness change the world.,Fivealin,2,8,10.0,This is a gut wrenching film.This is the sadde...,1,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0,0.0000,-0.6728
852979,11 January 2023,"Shush England, not Shush Padraic",iHRhabibur,3,9,8.0,"Not Shush Padraic, it's more of a shush Englan...",0,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0,0.0000,-0.3416


In [8]:
all_final_reviews_df.columns

Index(['Review Date', 'Review Title', 'Username', 'Helpful Votes',
       'Movie Name', 'Genre',
       ...
       'Genre_Mystery', 'Genre_News', 'Genre_Romance', 'Genre_Sci-Fi',
       'Genre_Sport', 'Genre_Thriller', 'Genre_War', 'Genre_Western',
       'Sentiment_score__Review_title', 'Sentiment_score__Review_text'],
      dtype='object', length=231)

In [9]:
# export all_final_reviews_df to a csv file in the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/merged_df_after_sentiment_score.csv'
all_final_reviews_df.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/merged_df_after_sentiment_score.csv', index=False)

# Getting YYYY/MM/DD release dates for the movies
-------

In [38]:
# Clear all variables and memory
%reset -f

In [39]:
# Correct the import statements (only need to import pandas once)
import pandas as pd

# Load the dataframe from a CSV file
all_final_reviews_df = pd.read_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/merged_df_after_sentiment_score.csv')

In [40]:
all_final_reviews_df.shape

(852981, 231)

In [13]:
all_final_reviews_df_sample = all_final_reviews_df.copy()

In [14]:
# drop all duplicate rows in all_final_reviews_df_sample where the movie name and the release date are the same and keep the first row.
all_final_reviews_df_sample = all_final_reviews_df_sample.drop_duplicates(subset=['Movie Name', 'Release Date'], keep='first')

In [15]:
# in all_final_reviews_df_sample drop all columns except the Movie name and the release date
all_final_reviews_df_sample = all_final_reviews_df_sample[['Movie Name', 'Release Date']]

# combine the Movie name and the release date columns in all_final_reviews_df_sample into a new column called 'Movie Name and Release Date'
all_final_reviews_df_sample['Movie_with_release'] = all_final_reviews_df_sample['Movie Name'] + ' (' + all_final_reviews_df_sample['Release Date'].astype(str) + ')'

In [16]:
all_final_reviews_df_sample

Unnamed: 0,Movie Name,Release Date,Movie_with_release
0,The Son,2022,The Son (2022)
152,The Girl on the Train,2021,The Girl on the Train (2021)
1142,Danger Close,2019,Danger Close (2019)
1413,Elvis & Nixon,2016,Elvis & Nixon (2016)
1514,Armageddon Time,2022,Armageddon Time (2022)
...,...,...,...
850868,Umma,2022,Umma (2022)
851005,Angel of Mine,2019,Angel of Mine (2019)
851124,Brahms: The Boy II,2020,Brahms: The Boy II (2020)
851428,The Girl on the Train,2016,The Girl on the Train (2016)


In [19]:
import pandas as pd
from imdb import IMDb
import re

# Create an instance of the IMDb class
ia = IMDb()

# Function to get the release date
def get_release_date(movie_with_release):
    try:
        # Extract movie name and year from the 'Movie_with_release' column
        movie_name, movie_year = movie_with_release.rsplit(' (', 1)
        movie_year = movie_year.rstrip(')')
        
        # Search for the movie by name
        search_results = ia.search_movie(movie_name)
        
        # Filter search results by year and get the first match
        filtered_results = [result for result in search_results if str(result.get('year')) == movie_year]
        
        if filtered_results:
            # Get movie details using the movie ID of the first search result
            movie_id = filtered_results[0].movieID
            movie = ia.get_movie(movie_id)
            
            # Attempt to get the release date
            if 'original air date' in movie.keys():
                release_date = movie['original air date']
                # Use regex to extract the date part before the country code
                match = re.search(r'(\d{1,2} \w{3} \d{4})', release_date)
                if match:
                    # Extract the date from the regex match group
                    release_date = match.group(1)
                    # Format the release date
                    return pd.to_datetime(release_date).strftime('%Y/%m/%d')
    except Exception as e:
        print(f"Error retrieving release date for {movie_with_release}: {e}")
    return None

# Assuming all_final_reviews_df_sample is your dataframe and it's already loaded
# Apply the function to each row in the 'Movie_with_release' column
all_final_reviews_df_sample['AI_Release_Date'] = all_final_reviews_df_sample['Movie_with_release'].apply(get_release_date)

# Display the dataframe to verify the new column
print(all_final_reviews_df_sample.head())

2024-03-29 21:18:04,134 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt11252440/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 52

Error retrieving release date for Psycho Goreman (2020): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt11252440/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 21:19:03,028 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt7392212/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Rangasthalam 1985 (2018): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt7392212/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 21:27:51,212 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5726616/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Call Me by Your Name (2017): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5726616/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 21:29:31,543 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt17076046/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 52

Error retrieving release date for Weird: The Al Yankovic Story (2022): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt17076046/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 21:49:18,844 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt4144190/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, 

Error retrieving release date for Wiener-Dog (2016): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt4144190/reference', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 21:51:16,306 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Chaman+Bahaar&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517,

Error retrieving release date for Chaman Bahaar (2020): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Chaman+Bahaar&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:00:07,813 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=21+Bridges&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, in

Error retrieving release date for 21 Bridges (2019): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=21+Bridges&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:03:24,263 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Pain+and+Glory&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517

Error retrieving release date for Pain and Glory (2019): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Pain+and+Glory&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:06:22,912 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt7137380/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Destroyer (2018): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt7137380/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:12:06,615 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Covenant&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, 

Error retrieving release date for The Covenant (2023): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Covenant&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:21:07,885 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Where+the+Crawdads+Sing&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py",

Error retrieving release date for Where the Crawdads Sing (2022): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Where+the+Crawdads+Sing&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:23:04,663 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2823054/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Mike and Dave Need Wedding Dates (2016): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2823054/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:23:39,081 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5028340/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Mary Poppins Returns (2018): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5028340/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:28:45,226 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt3104988/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Crazy Rich Asians (2018): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt3104988/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:33:14,081 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2380307/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Coco (2017): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt2380307/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:38:00,826 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5151570/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Mrs. Harris Goes to Paris (2022): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5151570/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:42:09,872 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Drishyam+2&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, in

Error retrieving release date for Drishyam 2 (2021): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Drishyam+2&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:46:04,569 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5164214/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Ocean's Eight (2018): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt5164214/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:48:57,719 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt8633462/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Quo Vadis, Aida? (2020): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt8633462/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 22:59:06,252 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt11541872/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 52

Error retrieving release date for Big Bug (2022): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt11541872/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:06:43,367 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt4685762/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for The Craft: Legacy (2020): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt4685762/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:07:39,113 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt11762434/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 52

Error retrieving release date for Cosmic Sin (2021): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt11762434/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:10:27,012 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt4225622/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for The Babysitter (2017): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt4225622/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:11:03,823 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Thank+God&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, in 

Error retrieving release date for Thank God (2022): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Thank+God&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:12:23,742 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Fantastic+Beasts+and+Where+to+Find+Them&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/url

Error retrieving release date for Fantastic Beasts and Where to Find Them (2016): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Fantastic+Beasts+and+Where+to+Find+Them&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:14:04,500 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Napoleon&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, in o

Error retrieving release date for Napoleon (2023): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Napoleon&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:19:38,129 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Sairat&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517, in ope

Error retrieving release date for Sairat (2016): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Sairat&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:24:03,936 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Billy+Lynn%27s+Long+Halftime+Walk&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/re

Error retrieving release date for Billy Lynn's Long Halftime Walk (2016): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Billy+Lynn%27s+Long+Halftime+Walk&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:25:30,024 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt1801552/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for Gotti (2018): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt1801552/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:26:47,973 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt10276470/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 52

Error retrieving release date for Work It (2020): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt10276470/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:27:36,698 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Boy+and+the+Heron&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", l

Error retrieving release date for The Boy and the Heron (2023): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Boy+and+the+Heron&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:29:23,064 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt13234058/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 52

Error retrieving release date for My Son (2021): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt13234058/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:31:29,794 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Dark+Tower&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 517

Error retrieving release date for The Dark Tower (2017): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=The+Dark+Tower&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:34:44,860 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt3416532/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for A Monster Calls (2016): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt3416532/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}


2024-03-29 23:41:02,190 CRITICAL [imdbpy] /Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt8461224/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/myenv/lib/python3.9/site-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/urllib/request.py", line 523

Error retrieving release date for The Tax Collector (2020): {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/title/tt8461224/plotsummary', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')}
                 Movie Name  Release Date            Movie_with_release  \
0                   The Son          2022                The Son (2022)   
152   The Girl on the Train          2021  The Girl on the Train (2021)   
1142           Danger Close          2019           Danger Close (2019)   
1413          Elvis & Nixon          2016          Elvis & Nixon (2016)   
1514        Armageddon Time          2022        Armageddon Time (2022)   

     AI_Release_Date  
0         2023/01/20  
152       2021/02/26  
1142      2019/11/08  
1413      2016/04/21  
1514      2022/11/04  


In [23]:
all_final_reviews_df_sample.shape

(1823, 3)

In [24]:
all_final_reviews_df_sample.head(2)

Unnamed: 0,Movie Name,Release Date,AI_Release_Date
0,The Son,2022,2023/01/20
152,The Girl on the Train,2021,2021/02/26


In [25]:
# export all_final_reviews_df_sample to a csv file in the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/all_final_reviews_df_release_dates.csv'
all_final_reviews_df_sample.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/all_final_reviews_df_release_dates.csv', index=False)

### Merging the Release Dates to the Main Dataset

In [76]:
# load 'all_final_reviews_df_release_dates.csv' and 'merged_df_after_sentiment_score.csv' into dataframes called all_final_reviews_df_release_dates and merged_df_after_sentiment_score
all_final_reviews_df_release_dates = pd.read_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/df.csv')
merged_df_after_sentiment_score = pd.read_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/merged_df_after_sentiment_score.csv')

In [77]:
# all null values in each column in all_final_reviews_df_release_dates
all_final_reviews_df_release_dates.isnull().sum()

Movie Name         0
Release Date       0
AI_Release_Date    0
dtype: int64

In [78]:
all_final_reviews_df_release_dates.shape

(1822, 3)

In [79]:
all_final_reviews_df_release_dates

Unnamed: 0,Movie Name,Release Date,AI_Release_Date
0,The Son,2022,1/20/23
1,The Girl on the Train,2021,2/26/21
2,Danger Close,2019,11/8/19
3,Elvis & Nixon,2016,4/21/16
4,Armageddon Time,2022,11/4/22
...,...,...,...
1817,Umma,2022,3/18/22
1818,Angel of Mine,2019,8/30/19
1819,Brahms: The Boy II,2020,2/21/20
1820,The Girl on the Train,2016,10/7/16


In [80]:
# change the date format of AI_Release_Date to YYYY/MM/DD in all_final_reviews_df_release_dates
all_final_reviews_df_release_dates['AI_Release_Date'] = pd.to_datetime(all_final_reviews_df_release_dates['AI_Release_Date']).dt.strftime('%Y/%m/%d')

  all_final_reviews_df_release_dates['AI_Release_Date'] = pd.to_datetime(all_final_reviews_df_release_dates['AI_Release_Date']).dt.strftime('%Y/%m/%d')


In [81]:
# export all_final_reviews_df_release_dates as a csv file as all_final_reviews_df_release_dates.csv
all_final_reviews_df_release_dates.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/all_final_reviews_df_release_dates.csv', index=False)

In [82]:
merged_df_after_sentiment_score.shape

(852981, 231)

In [83]:
# Merge the dataframes all_final_reviews_df_release_dates and merged_df_after_sentiment_score on the 'Movie Name' and 'Release Date' columns. Bring the AI_Release_Date column from all_final_reviews_df_release_dates to merged_df_after_sentiment_score. There will be duplicate columns for the 'Movie Name' and 'Release Date' columns in the merged dataframe so duplicate values if the same.

# Merge the dataframes on 'Movie Name' and 'Release Date'
final_merged_df = merged_df_after_sentiment_score.merge(all_final_reviews_df_release_dates, on=['Movie Name', 'Release Date'], how='left')

In [84]:
# Display the final table
final_merged_df.head(2)

Unnamed: 0,Review Date,Review Title,Username,Helpful Votes,Total Votes,Individual Rating,Review Text,Spoiler Warning,Movie Name,Genre,...,Genre_News,Genre_Romance,Genre_Sci-Fi,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Sentiment_score__Review_title,Sentiment_score__Review_text,AI_Release_Date
0,15 February 2023,I Don't Understand The Negative Reviews,varun-25071997,104,119,9.0,I don't know much about depression but I do kn...,0,The Son,Drama,...,0,0,0,0,0,0,0,0.4585,-0.6966,2023/01/20
1,16 February 2023,A teenager's psychological trauma and the cons...,madanmarwah,51,59,7.0,Just a bit of research into the works of write...,0,The Son,Drama,...,0,0,0,0,0,0,0,-0.4215,-0.9495,2023/01/20


In [85]:
# drop all null value rows in the AI_Release_Date column in final_merged_df
final_merged_df = final_merged_df[final_merged_df['AI_Release_Date'].notnull()]

In [86]:
# count all null values in each column in final_merged_df
final_merged_df.isnull().sum()

Review Date                      0
Review Title                     0
Username                         0
Helpful Votes                    0
Total Votes                      0
                                ..
Genre_War                        0
Genre_Western                    0
Sentiment_score__Review_title    0
Sentiment_score__Review_text     0
AI_Release_Date                  0
Length: 232, dtype: int64

# Need to create a new column that calculates the 'Days since'
-----

In [87]:
# set data type to AI_Release_Date in final_merged_df and Review Date in final_merged_df
final_merged_df['AI_Release_Date'] = pd.to_datetime(final_merged_df['AI_Release_Date'])
final_merged_df['Review Date'] = pd.to_datetime(final_merged_df['Review Date'])

In [88]:
# create a new column called 'Days_since' in the final_merged_df dataframe which is the difference between the 'AI_Release_Date' and the 'Review Date' columns in days.
final_merged_df['Days_since'] = (pd.to_datetime(final_merged_df['Review Date']) - pd.to_datetime(final_merged_df['AI_Release_Date'])).dt.days

In [89]:
final_merged_df.shape

(851423, 233)

In [90]:
final_merged_df

Unnamed: 0,Review Date,Review Title,Username,Helpful Votes,Total Votes,Individual Rating,Review Text,Spoiler Warning,Movie Name,Genre,...,Genre_Romance,Genre_Sci-Fi,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Sentiment_score__Review_title,Sentiment_score__Review_text,AI_Release_Date,Days_since
0,2023-02-15,I Don't Understand The Negative Reviews,varun-25071997,104,119,9.0,I don't know much about depression but I do kn...,0,The Son,Drama,...,0,0,0,0,0,0,0.4585,-0.6966,2023-01-20,26
1,2023-02-16,A teenager's psychological trauma and the cons...,madanmarwah,51,59,7.0,Just a bit of research into the works of write...,0,The Son,Drama,...,0,0,0,0,0,0,-0.4215,-0.9495,2023-01-20,27
2,2022-10-15,The Son is an emotionally devastating film wit...,msbreviews,95,121,8.0,"If you enjoy reading my Spoiler-Free thoughts,...",1,The Son,Drama,...,0,0,0,0,0,0,-0.7351,0.3612,2023-01-20,-97
3,2023-03-29,Disturbing and profound,magnuslhad,45,53,9.0,The Son is every parent's nightmare. Reminisce...,0,The Son,Drama,...,0,0,0,0,0,0,-0.5106,0.8555,2023-01-20,68
4,2023-05-27,Extraordinary!,acec-29548,67,76,10.0,Not all movies are meant to entertain. Some mo...,0,The Son,Drama,...,0,0,0,0,0,0,0.0000,-0.8035,2023-01-20,127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852976,2023-01-19,"Everyone got it wrong? It's a Metaphor, a cyni...",agelos-pikoulas,1,11,8.0,I'm surprised none has called the great Metaph...,1,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,-0.8625,-0.5159,2022-11-04,76
852977,2023-01-03,"Brilliant director, story, and actors.",byronduquettea,2,6,8.0,Martin Mcdonagh directs yet another brilliant ...,0,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0.5859,0.7717,2022-11-04,60
852978,2023-01-25,Your single act of stubboness change the world.,Fivealin,2,8,10.0,This is a gut wrenching film.This is the sadde...,1,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0.0000,-0.6728,2022-11-04,82
852979,2023-01-11,"Shush England, not Shush Padraic",iHRhabibur,3,9,8.0,"Not Shush Padraic, it's more of a shush Englan...",0,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0.0000,-0.3416,2022-11-04,68


# Creating Predicted Rating column
-----

In [98]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler

# Assuming all_final_reviews_df is your dataframe
# First, separate your features (X) and target variable (y)
X = final_merged_df.drop(columns=['Individual Rating', 'Review Title', 'Review Text', 'Review Date', 'Username', 'Movie Name', 'Genre', 'AI_Release_Date'])
y = final_merged_df['Individual Rating']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model on the entire dataset
model.fit(X_scaled, y, epochs=10, batch_size=32)  # Specify the number of epochs and batch size as needed

# Predict the ratings for the entire dataset
y_pred = model.predict(X_scaled)

# Add the predictions back to the dataframe
final_merged_df['Predicted Rating'] = y_pred.flatten()

# Display the updated dataframe
print(final_merged_df.head())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 840us/step - loss: 5.6236
Epoch 2/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 769us/step - loss: 4.7848
Epoch 3/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 718us/step - loss: 4.5706
Epoch 4/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 668us/step - loss: 4.4379
Epoch 5/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 753us/step - loss: 4.3478
Epoch 6/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 785us/step - loss: 4.2768
Epoch 7/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 791us/step - loss: 4.2233
Epoch 8/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 741us/step - loss: 4.1603
Epoch 9/10
[1m26607/26607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 708us/step - loss: 4.1177
Epoch 10/10
[1m26607/26607

In [99]:
all_final_reviews_df

Unnamed: 0,Review Date,Review Title,Username,Helpful Votes,Total Votes,Individual Rating,Review Text,Spoiler Warning,Movie Name,Genre,...,Genre_News,Genre_Romance,Genre_Sci-Fi,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Sentiment_score__Review_title,Sentiment_score__Review_text,Predicted Rating
0,15 February 2023,I Don't Understand The Negative Reviews,varun-25071997,104,119,9,I don't know much about depression but I do kn...,0,The Son,Drama,...,0,0,0,0,0,0,0,0.4585,-0.6966,6.899688
1,16 February 2023,A teenager's psychological trauma and the cons...,madanmarwah,51,59,7,Just a bit of research into the works of write...,0,The Son,Drama,...,0,0,0,0,0,0,0,-0.4215,-0.9495,5.840226
2,15 October 2022,The Son is an emotionally devastating film wit...,msbreviews,95,121,8,"If you enjoy reading my Spoiler-Free thoughts,...",1,The Son,Drama,...,0,0,0,0,0,0,0,-0.7351,0.3612,6.340936
3,29 March 2023,Disturbing and profound,magnuslhad,45,53,9,The Son is every parent's nightmare. Reminisce...,0,The Son,Drama,...,0,0,0,0,0,0,0,-0.5106,0.8555,7.562127
4,27 May 2023,Extraordinary!,acec-29548,67,76,10,Not all movies are meant to entertain. Some mo...,0,The Son,Drama,...,0,0,0,0,0,0,0,0.0000,-0.8035,6.683297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852976,19 January 2023,"Everyone got it wrong? It's a Metaphor, a cyni...",agelos-pikoulas,1,11,8,I'm surprised none has called the great Metaph...,1,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,-0.8625,-0.5159,4.029799
852977,3 January 2023,"Brilliant director, story, and actors.",byronduquettea,2,6,8,Martin Mcdonagh directs yet another brilliant ...,0,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0.5859,0.7717,8.132694
852978,25 January 2023,Your single act of stubboness change the world.,Fivealin,2,8,10,This is a gut wrenching film.This is the sadde...,1,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0.0000,-0.6728,5.331083
852979,11 January 2023,"Shush England, not Shush Padraic",iHRhabibur,3,9,8,"Not Shush Padraic, it's more of a shush Englan...",0,The Banshees of Inisherin,"Comedy,Drama",...,0,0,0,0,0,0,0,0.0000,-0.3416,5.037545


In [100]:
all_final_reviews_df.columns

Index(['Review Date', 'Review Title', 'Username', 'Helpful Votes',
       'Movie Name', 'Genre',
       ...
       'Genre_News', 'Genre_Romance', 'Genre_Sci-Fi', 'Genre_Sport',
       'Genre_Thriller', 'Genre_War', 'Genre_Western',
       'Sentiment_score__Review_title', 'Sentiment_score__Review_text',
       'Predicted Rating'],
      dtype='object', length=232)

In [101]:
# export all_final_reviews_df to a csv file in the location '/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/final_with_predicted_rating.csv'
all_final_reviews_df.to_csv('/Users/shreyashgupta/Library/CloudStorage/OneDrive-UniversityofArkansas/MSEA/2nd Semester - MSEA/ECON 5823 - ECONOMIC ANALYTICS II/Project Code file/IMDb_movie_review_sentiment_analysis/final_with_predicted_rating.csv', index=False)