This notebook imports the list of IMDb URLs, adjusts them so that they point to the correct reviews URL, and separates the IMDb ID from the initial URL. Using the IMDb ID, it pulls all the films from the OMDb API and inserts them into a DataFrame. Additional clean-up is done, including removing all titles that are not movies. Then, the initial DataFrame and the OMDb DataFrame are merged together before using Beautiful Soup to pull all of the reviews from the URLs. Lastly, the DataFrame is inserted into MongoDB.

In [2]:
# Load dependencies
import pandas as pd
import requests
import pymongo
from bs4 import BeautifulSoup

In [3]:
# Read the IMDb movie URL list into a one-column dataframe named 'url'.
# No header row is consumed: every line of the file is treated as data.
url_df = pd.read_csv(
    'https://charlies-angels.s3.us-east-2.amazonaws.com/movie_urls.txt',
    header=None,
    names=['url'],
)

In [4]:
# Remove duplicate URLs and renumber the rows from 0
url_df = url_df.drop_duplicates().reset_index(drop=True)

# Add an (initially empty) column that will later hold the review text
url_df['reviews'] = ''

# Derive the IMDb movie id by stripping the fixed suffix and prefix from the URL.
# regex=False: both patterns are literal text, so the dots in the host name must
# not act as regex wildcards, and the result no longer depends on the pandas
# version's default for `regex`.
url_df['movie_id'] = (
    url_df['url']
    .str.replace('/usercomments', '', regex=False)
    .str.replace('http://www.imdb.com/title/', '', regex=False)
)

# Rewrite each URL so that it ends in 'reviews' instead of 'usercomments'
url_df['url'] = url_df['url'].str.replace('usercomments', 'reviews', regex=False)

In [5]:
# Online Movie Database (OMDb) API URL including API key and full plot
# NOTE(review): the API key is hard-coded in the notebook; consider loading it
# from configuration instead of committing it.
api_url = "http://www.omdbapi.com/?apikey=baee4093&plot=full"

movie_data = []

# Query the API once per IMDb id and keep only the successful look-ups
for movie_id in url_df['movie_id']:
    try:
        # A timeout keeps one stalled connection from hanging the whole loop
        movie = requests.get(api_url + '&i=' + movie_id, timeout=30).json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON (the JSON
        # decode errors are ValueError subclasses); the original caught the
        # never-imported name JSONDecodeError, which would raise NameError.
        print(f'Skipping {movie_id}: {err}')
        continue

    # The API reports success in the 'Response' field as the string "True";
    # .get() skips payloads that lack the field instead of raising KeyError.
    if movie.get('Response') == "True":
        movie_data.append(movie)

In [6]:
# Turn the collected API records (one dict per title) into a dataframe
omdb_df = pd.DataFrame(data=movie_data)

In [7]:
# Build the movie-only table from a copy of the OMDb data, one step per line
movie_only = (
    omdb_df.copy()
    # keep only the titles whose type is 'movie'
    .loc[lambda df: df['Type'] == 'movie']
    # keep a single row per IMDb id
    .drop_duplicates(subset="imdbID")
    # discard rows whose rating is the 'N/A' placeholder
    .loc[lambda df: df['imdbRating'] != 'N/A']
    # convert the remaining rating strings to numbers
    .assign(imdbRating=lambda df: pd.to_numeric(df['imdbRating']))
)

# Report how many distinct films remain
unique_films = movie_only['imdbID'].nunique()
print(f'There are {unique_films} entries in the movie_only dataframe.')

There are 6018 entries in the movie_only dataframe.


In [8]:
# Merge both dataframes together using an inner join on the IMDb id, so only
# titles present in both url_df and movie_only are kept
full_df = pd.merge(url_df, movie_only, left_on='movie_id', right_on='imdbID', how='inner')

# Rename imdbRating to label, since Naive Bayes requires it.
# The new name must be exactly 'label' because the column selection below asks
# for it by that name (the previous 'label_orig' made the selection raise KeyError).
full_df = full_df.rename(columns={'imdbRating': 'label'})

# Keep only the columns that are needed; .copy() makes the result an
# independent dataframe so later in-place assignments do not target a slice
full_df = full_df[
    ['url', 'reviews', 'movie_id', 'Title', 'Year', 'Genre', 'Actors', 'Plot', 'Poster', 'label']
].copy()

In [9]:
# Retrieve reviews: download each film's reviews page and store the
# concatenated review texts in the 'reviews' column.
# Iterating over the actual index labels (rather than range(len(...))) keeps
# the label-based look-ups valid even when the index is not a gap-free 0..n-1.
for idx in full_df.index:
    try:
        # A timeout keeps one stalled connection from hanging the whole loop
        response = requests.get(full_df.at[idx, 'url'], timeout=30)
    except requests.RequestException as err:
        # Best effort: report the failure and leave this row's reviews as they are
        print(f"Could not fetch {full_df.at[idx, 'url']}: {err}")
        continue

    soup = BeautifulSoup(response.text, 'lxml')
    # Each review body is a <div> carrying both of these CSS classes
    results = soup.find_all('div', class_='text show-more__control')

    # Every review text is followed by a single space, as before
    full_df.at[idx, 'reviews'] = ''.join(result.text + ' ' for result in results)


In [10]:
# Keep only the films whose 'reviews' value is not the empty string
full_df = full_df.loc[full_df['reviews'].ne('')]

In [None]:
# Preview the first five rows of the resulting dataframe
full_df.head(n=5)

## In order to run, please uncomment the code below

In [11]:
# Create list of dictionaries in order to efficiently insert into MongoDB
#movies_dict = full_df.to_dict('records')

# MongoDB connection
#conn = 'mongodb+srv://general_user:charli3s_ang3ls@cluster0-tyboh.mongodb.net/movie_db?retryWrites=true&w=majority'
#client = pymongo.MongoClient(conn)

# Declare the collection
#collection = client.movie_db.movie_reviews
#Drop collection if it exists to prevent duplication
#collection.drop()  
# Insert all of the documents into the collection
#collection.insert_many(movies_dict)

<pymongo.results.InsertManyResult at 0x1e53eddd688>