1. Choose a Netflix movie of interest.

2. Download the comments / find a movie comment file.

3. Import libraries for sentiment classification of the comments of your favorite movie ( positive/ negative / neutral ).

4. Submit your compiled code.

In [1]:
import os
import nltk
import pandas as pd
from nltk.corpus import movie_reviews

import warnings
warnings.filterwarnings('ignore')

In [2]:
#nltk.download('movie_reviews')

In [3]:
#df = pd.read_csv('https://raw.githubusercontent.com/samendoza47/hello-world/master/IMDBDataset.csv?token=GHSAT0AAAAAABVWQ5YXERT77KRNO6IEJJZ2YVP4M2A')

import numpy as np


url = 'https://drive.google.com/file/d/1kvoXXSuCz3LfyX9SpBlsZs-5nDO5Eypz/view?usp=sharing'
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
df = pd.read_csv(path)

df.head()


df_reviews = df[df.index < 3000]
df_reviews.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [4]:
# Rename Values
df_reviews['sentiment'] = df_reviews['sentiment'].replace(['positive', 'negative'], ['pos', 'neg'])
df_reviews.head(3)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,pos
1,A wonderful little production. <br /><br />The...,pos
2,I thought this was a wonderful way to spend ti...,pos


In [5]:
#movie_reviews.words()
df_reviews['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [6]:
import string

text = " ".join(df_reviews['review'])
text_filtered = text.translate(str.maketrans('','',string.punctuation))

In [7]:
from nltk import word_tokenize
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
tokens = word_tokenize(text_filtered)
word_filtered = [w.lower() for w in tokens if w not in stopwords]


In [8]:
counter_dict = nltk.FreqDist(word_filtered)
print(counter_dict.most_common(15))

[('i', 8576), ('br', 6941), ('the', 5429), ('movie', 5057), ('film', 4500), ('one', 2927), ('like', 2264), ('this', 1888), ('good', 1725), ('even', 1496), ('would', 1476), ('it', 1450), ('see', 1415), ('story', 1353), ('time', 1334)]


In [9]:
docs = []
for i in range(len(df_reviews)):
    token_word = word_tokenize(df_reviews['review'][i])
    docs += [(token_word, df_reviews['sentiment'][i])]



In [10]:
# Create a list containing 3000 most frequent words
word_features = [w[0] for w in counter_dict.most_common(3000)]

In [11]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [12]:
search_features(docs[0][0])

{'i': False,
 'br': True,
 'the': True,
 'movie': False,
 'film': False,
 'one': False,
 'like': False,
 'this': True,
 'good': False,
 'even': False,
 'would': True,
 'it': True,
 'see': False,
 'story': False,
 'time': False,
 'really': False,
 'much': False,
 'well': True,
 'get': True,
 'bad': False,
 'also': False,
 'first': True,
 'people': False,
 'great': False,
 'dont': False,
 'movies': False,
 'way': False,
 'made': False,
 'films': False,
 'make': False,
 'characters': False,
 'could': True,
 'and': True,
 'think': False,
 'watch': False,
 'but': True,
 'many': True,
 'its': True,
 'little': False,
 'plot': False,
 'never': True,
 'two': False,
 'character': False,
 'seen': False,
 'a': True,
 'know': False,
 'acting': False,
 'in': True,
 'love': False,
 'life': False,
 'if': True,
 'best': False,
 'ever': True,
 'show': True,
 'better': False,
 'say': True,
 'still': False,
 'scene': False,
 'man': False,
 'there': False,
 'scenes': True,
 'go': False,
 'something': False

In [13]:
## Applying the function to all the reviews.
feature_set = [(search_features(doc), sentiment) for (doc, sentiment) in docs]

In [14]:
len(feature_set)

3000

In [15]:
feature_set[12]

({'i': False,
  'br': True,
  'the': True,
  'movie': True,
  'film': True,
  'one': True,
  'like': True,
  'this': True,
  'good': True,
  'even': True,
  'would': True,
  'it': True,
  'see': True,
  'story': False,
  'time': False,
  'really': True,
  'much': False,
  'well': False,
  'get': False,
  'bad': False,
  'also': False,
  'first': True,
  'people': False,
  'great': False,
  'dont': False,
  'movies': False,
  'way': False,
  'made': True,
  'films': False,
  'make': False,
  'characters': False,
  'could': False,
  'and': True,
  'think': False,
  'watch': False,
  'but': True,
  'many': True,
  'its': False,
  'little': False,
  'plot': False,
  'never': False,
  'two': False,
  'character': False,
  'seen': False,
  'a': True,
  'know': True,
  'acting': True,
  'in': True,
  'love': False,
  'life': False,
  'if': True,
  'best': False,
  'ever': False,
  'show': False,
  'better': False,
  'say': False,
  'still': False,
  'scene': True,
  'man': False,
  'there': F

In [16]:
training_set = feature_set[:1500]
testing_set = feature_set[1500:]

In [17]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [18]:
print("classifier's accuracy is: {}".format(nltk.classify.accuracy(classifier, testing_set))*100)

classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146666666666667classifier's accuracy is: 0.8146

In [19]:
classifier.show_most_informative_features(10)

Most Informative Features
                   waste = True              neg : pos    =     22.3 : 1.0
                  wasted = True              neg : pos    =     12.3 : 1.0
                    seat = True              pos : neg    =     11.8 : 1.0
               laughable = True              neg : pos    =     11.5 : 1.0
                stunning = True              pos : neg    =     11.2 : 1.0
                   awful = True              neg : pos    =      9.9 : 1.0
                horrible = True              neg : pos    =      9.6 : 1.0
             brilliantly = True              pos : neg    =      9.2 : 1.0
             emotionally = True              pos : neg    =      9.2 : 1.0
                identity = True              pos : neg    =      9.2 : 1.0


# Saving the model

In [20]:
import pickle
save_classifier = open("naive_bayes_model.pkl","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

# Loading the model

In [21]:
classifier_f = open('naive_bayes_model.pkl','rb')
classifier = pickle.load(classifier_f)
#classifier_f.close()

# Testing the model

In [22]:
#custom_review = 'I hated the restaurant. it was a disaster eating there, Poor service, arrogant waiters'

custom_review = '''I had a great time eating delicious food at your restaurant. 
                the waiters and waitresses were committed to great service and were very friendly'''

In [23]:
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = search_features(custom_review_tokens)
print(classifier.classify(custom_review_set))


pos


In [24]:
prob_result = (classifier.prob_classify(custom_review_set))

print(prob_result.max())
print(prob_result.prob('pos'))
print(prob_result.prob('neg'))

pos
0.9310158275632736
0.06898417243672267


In [25]:
classifier_f.close()