This Notebook is a clean-up of the initial MongoDB collection. It transforms the data types in the dataframe, removes random characters and stopwords from the reviews, and includes a Review Type column which outputs 'positive' if the label_orig (initially the IMDb rating) is greater than 6; else, 'negative'. Once cleaned it then was loaded back into MongoDB for the classification model.

In [3]:
#Load dependenices
import pandas as pd
import pymongo
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import re
nltk.download('stopwords')
import string

In [4]:
# MongoDB connection
conn = 'mongodb+srv://general_user:charli3s_ang3ls@cluster0-tyboh.mongodb.net/movie_db?retryWrites=true&w=majority'
client = pymongo.MongoClient(conn)

# Declare the collection
collection = client.movie_db.movie_reviews.find()

In [5]:
# Load data into Pandas Dataframe
new_df = pd.DataFrame( collection)

In [6]:
# Rename and Change Types
new_df['label_orig'] = pd.to_numeric(new_df['label'])
new_df['_id'] = new_df['_id'].astype(str)
new_df['url'] = new_df['url'].astype(str)
new_df['reviews'] = new_df['reviews'].astype(str)
new_df['movie_id'] = new_df['movie_id'].astype(str)
new_df['Title'] = new_df['Title'].astype(str)
new_df['Year'] = pd.to_numeric(new_df['Year'])
new_df['Genre'] = new_df['Genre'].astype(str)
new_df['Actors'] = new_df['Actors'].astype(str)
new_df['Plot'] = new_df['Plot'].astype(str)
new_df['Poster'] = new_df['Poster'].astype(str)

del new_df['label']

new_df.head()

Unnamed: 0,Actors,Genre,Plot,Poster,Title,Year,_id,movie_id,reviews,url,label_orig
0,"Lavinia Wilson, Barnaby Metschurat, Matthias K...","Drama, Romance",,https://m.media-amazon.com/images/M/MV5BMGJhOD...,Julietta,2001,5dd24aeaa6edc0dcb0d75569,tt0248123,***SPOILERS*** ***SPOILERS*** The basis to t...,http://www.imdb.com/title/tt0248123/reviews,6.2
1,"Tôru Furuya, Mami Koyama, Keiko Han, Michiko N...","Animation, Drama, Family, Fantasy, Romance",Years after the war between the Children of Wa...,https://m.media-amazon.com/images/M/MV5BZGJmND...,Sea Prince and the Fire Child,1981,5dd24aeaa6edc0dcb0d75566,tt0203895,It is a rare event when I rate a movie on the ...,http://www.imdb.com/title/tt0203895/reviews,7.2
2,"Lars Hanson, Marceline Day, Pauline Starke, Er...","Drama, Romance",A returning seminary student goes up against a...,https://m.media-amazon.com/images/M/MV5BMTc3OT...,Captain Salvation,1927,5dd24aeaa6edc0dcb0d75568,tt0017735,The first half to two-thirds of the film worke...,http://www.imdb.com/title/tt0017735/reviews,6.9
3,"Toni Collette, Parker Posey, Lisa Kudrow, Alan...","Comedy, Drama",Iris can best be described as a wallflower. Sh...,https://m.media-amazon.com/images/M/MV5BYzZlND...,Clockwatchers,1997,5dd24aeaa6edc0dcb0d75563,tt0118866,The only thing that I can think when reading t...,http://www.imdb.com/title/tt0118866/reviews,6.5
4,"William Boyd, Mary Astor, Louis Wolheim, Ian K...","Comedy, Romance, Adventure",Americans Sgt. Peter O'Gaffney and one of his ...,https://m.media-amazon.com/images/M/MV5BNTJiNG...,Two Arabian Knights,1927,5dd24aeaa6edc0dcb0d75562,tt0018515,Finally broadcast by Turner Classic Movies on ...,http://www.imdb.com/title/tt0018515/reviews,6.9


In [32]:
# Create a copy of the Dataframe for cleaner manipulation
copynew = new_df.copy()

# Categorize Positive and Negative Reviews
copynew['Review_Type'] = copynew['label_orig'] > 6
copynew['Review_Type'] = ['positive' if x else 'negative' for x in copynew['Review_Type']]

In [33]:
copynew

Unnamed: 0,Actors,Genre,Plot,Poster,Title,Year,_id,movie_id,reviews,url,label_orig,Review_Type,Parsed
0,"Lavinia Wilson, Barnaby Metschurat, Matthias K...","Drama, Romance",,https://m.media-amazon.com/images/M/MV5BMGJhOD...,Julietta,2001,5dd24aeaa6edc0dcb0d75569,tt0248123,***SPOILERS*** ***SPOILERS*** The basis to t...,http://www.imdb.com/title/tt0248123/reviews,6.2,positive,"[spoilers, spoilers, basis, movie, classic, no..."
1,"Tôru Furuya, Mami Koyama, Keiko Han, Michiko N...","Animation, Drama, Family, Fantasy, Romance",Years after the war between the Children of Wa...,https://m.media-amazon.com/images/M/MV5BZGJmND...,Sea Prince and the Fire Child,1981,5dd24aeaa6edc0dcb0d75566,tt0203895,It is a rare event when I rate a movie on the ...,http://www.imdb.com/title/tt0203895/reviews,7.2,positive,"[rare, event, rate, movie, level, 10, could, w..."
2,"Lars Hanson, Marceline Day, Pauline Starke, Er...","Drama, Romance",A returning seminary student goes up against a...,https://m.media-amazon.com/images/M/MV5BMTc3OT...,Captain Salvation,1927,5dd24aeaa6edc0dcb0d75568,tt0017735,The first half to two-thirds of the film worke...,http://www.imdb.com/title/tt0017735/reviews,6.9,positive,"[first, half, twothirds, film, worked, well, w..."
3,"Toni Collette, Parker Posey, Lisa Kudrow, Alan...","Comedy, Drama",Iris can best be described as a wallflower. Sh...,https://m.media-amazon.com/images/M/MV5BYzZlND...,Clockwatchers,1997,5dd24aeaa6edc0dcb0d75563,tt0118866,The only thing that I can think when reading t...,http://www.imdb.com/title/tt0118866/reviews,6.5,positive,"[thing, think, reading, negative, comments, le..."
4,"William Boyd, Mary Astor, Louis Wolheim, Ian K...","Comedy, Romance, Adventure",Americans Sgt. Peter O'Gaffney and one of his ...,https://m.media-amazon.com/images/M/MV5BNTJiNG...,Two Arabian Knights,1927,5dd24aeaa6edc0dcb0d75562,tt0018515,Finally broadcast by Turner Classic Movies on ...,http://www.imdb.com/title/tt0018515/reviews,6.9,positive,"[finally, broadcast, turner, classic, movies, ..."
5,"Tsutomu Takakuwa, Kelly Varis, Katherine Murph...","Action, Adventure, Family, Sci-Fi",A giant creature attacks Japan during the Worl...,https://m.media-amazon.com/images/M/MV5BYjNiZj...,Gamera vs. Monster X,1970,5dd24aeaa6edc0dcb0d7556c,tt0065755,This film is pretty much a formula Gamera film...,http://www.imdb.com/title/tt0065755/reviews,5.6,negative,"[film, pretty, much, formula, gamera, film, fo..."
6,"William Prince, Jim Backus, Christine White, J...","Horror, Thriller",A small-town doctor (William Prince) gets caug...,https://m.media-amazon.com/images/M/MV5BZGJlN2...,Macabre,1958,5dd24aeaa6edc0dcb0d75567,tt0051885,I finally saw this on TV several years ago and...,http://www.imdb.com/title/tt0051885/reviews,5.8,negative,"[finally, saw, tv, several, years, ago, wish, ..."
7,"Mark Taylor, Ted Kennedy, Saddam Hussein, Geor...","Documentary, War",Conservative political commentator Mark Taylor...,https://m.media-amazon.com/images/M/MV5BMjEyND...,Buried in the Sand: The Deception of America,2004,5dd24aeaa6edc0dcb0d75564,tt0436149,This stomach-churning expose puts an anguished...,http://www.imdb.com/title/tt0436149/reviews,5.1,negative,"[stomachchurning, expose, puts, anguished, fac..."
8,"Javier Cámara, Nathalie Poza, Eman Xor Oña, Le...",Drama,,https://m.media-amazon.com/images/M/MV5BMTI4Mz...,Malas temporadas,2005,5dd24aeaa6edc0dcb0d7556a,tt0456526,"The dazzling multistory ""Hard Times (Malas Tem...",http://www.imdb.com/title/tt0456526/reviews,6.5,positive,"[dazzling, multistory, hard, times, malas, tem..."
9,"Steve Austin, Mark Calaway, Dwayne Johnson, Pa...","Action, Sport",WWF Championship: Stone Cold Steve Austin vs. ...,https://m.media-amazon.com/images/M/MV5BNjI4NG...,Summerslam,1998,5dd24aeaa6edc0dcb0d7556e,tt0293944,This is one of my very favorite shows WWF/E ha...,http://www.imdb.com/title/tt0293944/reviews,7.5,positive,"[one, favorite, shows, wwfe, put, forgive, hav..."


In [34]:
# Clean up review text
word_filter = set(stopwords.words('english'))
def clean_sentence(s, word_filter):
    s = s.lower()
    s = s.translate(str.maketrans('', '', string.punctuation))
    s = word_tokenize(s)
    s = [x for x in s if x not in word_filter] 
    return s

copynew['Parsed'] = [clean_sentence(s, word_filter) for s in copynew['reviews']]
copynew['Parsed']

0       [spoilers, spoilers, basis, movie, classic, no...
1       [rare, event, rate, movie, level, 10, could, w...
2       [first, half, twothirds, film, worked, well, w...
3       [thing, think, reading, negative, comments, le...
4       [finally, broadcast, turner, classic, movies, ...
5       [film, pretty, much, formula, gamera, film, fo...
6       [finally, saw, tv, several, years, ago, wish, ...
7       [stomachchurning, expose, puts, anguished, fac...
8       [dazzling, multistory, hard, times, malas, tem...
9       [one, favorite, shows, wwfe, put, forgive, hav...
10      [stoogeheads, havent, seen, short, seen, watch...
11      [success, airport, poseidon, adventure, toweri...
12      [jacob, wade, jack, palance, used, celebrated,...
13      [one, weaker, entries, drummond, series, still...
14      [talent, act, deveraux, groucho, novarro, carm...
15      [one, doesnt, showcase, ww, best, see, diploma...
16      [incredible, person, suggested, brother, watch...
17      [tomor

In [35]:
# Print new DF
copynew

Unnamed: 0,Actors,Genre,Plot,Poster,Title,Year,_id,movie_id,reviews,url,label_orig,Review_Type,Parsed
0,"Lavinia Wilson, Barnaby Metschurat, Matthias K...","Drama, Romance",,https://m.media-amazon.com/images/M/MV5BMGJhOD...,Julietta,2001,5dd24aeaa6edc0dcb0d75569,tt0248123,***SPOILERS*** ***SPOILERS*** The basis to t...,http://www.imdb.com/title/tt0248123/reviews,6.2,positive,"[spoilers, spoilers, basis, movie, classic, no..."
1,"Tôru Furuya, Mami Koyama, Keiko Han, Michiko N...","Animation, Drama, Family, Fantasy, Romance",Years after the war between the Children of Wa...,https://m.media-amazon.com/images/M/MV5BZGJmND...,Sea Prince and the Fire Child,1981,5dd24aeaa6edc0dcb0d75566,tt0203895,It is a rare event when I rate a movie on the ...,http://www.imdb.com/title/tt0203895/reviews,7.2,positive,"[rare, event, rate, movie, level, 10, could, w..."
2,"Lars Hanson, Marceline Day, Pauline Starke, Er...","Drama, Romance",A returning seminary student goes up against a...,https://m.media-amazon.com/images/M/MV5BMTc3OT...,Captain Salvation,1927,5dd24aeaa6edc0dcb0d75568,tt0017735,The first half to two-thirds of the film worke...,http://www.imdb.com/title/tt0017735/reviews,6.9,positive,"[first, half, twothirds, film, worked, well, w..."
3,"Toni Collette, Parker Posey, Lisa Kudrow, Alan...","Comedy, Drama",Iris can best be described as a wallflower. Sh...,https://m.media-amazon.com/images/M/MV5BYzZlND...,Clockwatchers,1997,5dd24aeaa6edc0dcb0d75563,tt0118866,The only thing that I can think when reading t...,http://www.imdb.com/title/tt0118866/reviews,6.5,positive,"[thing, think, reading, negative, comments, le..."
4,"William Boyd, Mary Astor, Louis Wolheim, Ian K...","Comedy, Romance, Adventure",Americans Sgt. Peter O'Gaffney and one of his ...,https://m.media-amazon.com/images/M/MV5BNTJiNG...,Two Arabian Knights,1927,5dd24aeaa6edc0dcb0d75562,tt0018515,Finally broadcast by Turner Classic Movies on ...,http://www.imdb.com/title/tt0018515/reviews,6.9,positive,"[finally, broadcast, turner, classic, movies, ..."
5,"Tsutomu Takakuwa, Kelly Varis, Katherine Murph...","Action, Adventure, Family, Sci-Fi",A giant creature attacks Japan during the Worl...,https://m.media-amazon.com/images/M/MV5BYjNiZj...,Gamera vs. Monster X,1970,5dd24aeaa6edc0dcb0d7556c,tt0065755,This film is pretty much a formula Gamera film...,http://www.imdb.com/title/tt0065755/reviews,5.6,negative,"[film, pretty, much, formula, gamera, film, fo..."
6,"William Prince, Jim Backus, Christine White, J...","Horror, Thriller",A small-town doctor (William Prince) gets caug...,https://m.media-amazon.com/images/M/MV5BZGJlN2...,Macabre,1958,5dd24aeaa6edc0dcb0d75567,tt0051885,I finally saw this on TV several years ago and...,http://www.imdb.com/title/tt0051885/reviews,5.8,negative,"[finally, saw, tv, several, years, ago, wish, ..."
7,"Mark Taylor, Ted Kennedy, Saddam Hussein, Geor...","Documentary, War",Conservative political commentator Mark Taylor...,https://m.media-amazon.com/images/M/MV5BMjEyND...,Buried in the Sand: The Deception of America,2004,5dd24aeaa6edc0dcb0d75564,tt0436149,This stomach-churning expose puts an anguished...,http://www.imdb.com/title/tt0436149/reviews,5.1,negative,"[stomachchurning, expose, puts, anguished, fac..."
8,"Javier Cámara, Nathalie Poza, Eman Xor Oña, Le...",Drama,,https://m.media-amazon.com/images/M/MV5BMTI4Mz...,Malas temporadas,2005,5dd24aeaa6edc0dcb0d7556a,tt0456526,"The dazzling multistory ""Hard Times (Malas Tem...",http://www.imdb.com/title/tt0456526/reviews,6.5,positive,"[dazzling, multistory, hard, times, malas, tem..."
9,"Steve Austin, Mark Calaway, Dwayne Johnson, Pa...","Action, Sport",WWF Championship: Stone Cold Steve Austin vs. ...,https://m.media-amazon.com/images/M/MV5BNjI4NG...,Summerslam,1998,5dd24aeaa6edc0dcb0d7556e,tt0293944,This is one of my very favorite shows WWF/E ha...,http://www.imdb.com/title/tt0293944/reviews,7.5,positive,"[one, favorite, shows, wwfe, put, forgive, hav..."


If you want to load into MongoDB, uncomment below

In [39]:
# # Create list of dictionaries in order to efficiently insert into MongoDB
# movies_dict = copynew.to_dict('records')

# # MongoDB connection
# conn = 'mongodb+srv://general_user:charli3s_ang3ls@cluster0-tyboh.mongodb.net/movie_db?retryWrites=true&w=majority'
# client = pymongo.MongoClient(conn)
# # Declare the collection
# collection = client.movie_db.movie_reviews
# #Drop collection if it exists to prevent duplication
# collection.drop()  
# # Insert all of the documents into the collection
# collection.insert_many(movies_dict)

<pymongo.results.InsertManyResult at 0x1a5a7b2f88>