In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [24]:
movies = pd.read_csv('movie_overviews.csv')
movies

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
...,...,...,...,...
9094,159550,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,
9095,392572,Rustom,"Rustom Pavri, an honourable officer of the Ind...",Decorated Officer. Devoted Family Man. Defendi...
9096,402672,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",
9097,315011,Shin Godzilla,From the mind behind Evangelion comes a hit la...,A god incarnate. A city doomed.


In [25]:
df = movies[['tagline']]
df

Unnamed: 0,tagline
0,
1,Roll the dice and unleash the excitement!
2,Still Yelling. Still Fighting. Still Ready for...
3,Friends are the people who let you be yourself...
4,Just When His World Is Back To Normal... He's ...
...,...
9094,
9095,Decorated Officer. Devoted Family Man. Defendi...
9096,
9097,A god incarnate. A city doomed.


In [16]:
df_1 = df.dropna(how='any')
df_1

Unnamed: 0,tagline
1,Roll the dice and unleash the excitement!
2,Still Yelling. Still Fighting. Still Ready for...
3,Friends are the people who let you be yourself...
4,Just When His World Is Back To Normal... He's ...
5,A Los Angeles Crime Saga
...,...
9091,Kingsglaive: Final Fantasy XV
9093,"What happens in Vegas, stays in Vegas. Unless ..."
9095,Decorated Officer. Devoted Family Man. Defendi...
9097,A god incarnate. A city doomed.


In [18]:
corpus = df_1['tagline']
corpus

1               Roll the dice and unleash the excitement!
2       Still Yelling. Still Fighting. Still Ready for...
3       Friends are the people who let you be yourself...
4       Just When His World Is Back To Normal... He's ...
5                                A Los Angeles Crime Saga
                              ...                        
9091                        Kingsglaive: Final Fantasy XV
9093    What happens in Vegas, stays in Vegas. Unless ...
9095    Decorated Officer. Devoted Family Man. Defendi...
9097                      A god incarnate. A city doomed.
9098              The band you know. The story you don't.
Name: tagline, Length: 7033, dtype: object

In [5]:
reviews = pd.read_csv('movie_reviews_clean.csv')
reviews

Unnamed: 0,review,sentiment
0,this anime series starts out great interesting...,0
1,some may go for a film like this but i most as...,0
2,i ve seen this piece of perfection during the ...,1
3,this movie is likely the worst movie i ve ever...,0
4,it ll soon be 10 yrs since this movie was rele...,1
...,...,...
995,this movie turned out to be pretty much what i...,1
996,from time to time it s very advisable for the ...,0
997,ed wood is eclipsed and becomes orson welles t...,0
998,well here we have yet another role reversal mo...,0


In [19]:
vectorizer = CountVectorizer(stop_words='english')

In [22]:
bow_matrix = vectorizer.fit_transform(corpus)
bow_matrix

<7033x6358 sparse matrix of type '<class 'numpy.int64'>'
	with 28474 stored elements in Compressed Sparse Row format>

In [21]:
print(bow_matrix.shape)

(7033, 6358)


In [26]:
# Sentiment analysis for movie reviews

In [27]:
X = reviews['review']
y = reviews['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
vectorizer = CountVectorizer(lowercase=True, stop_words='english')

In [33]:
X_train_bow = vectorizer.fit_transform(X_train)
print(X_train_bow.shape)

(800, 15772)


In [34]:
X_test_bow = vectorizer.transform(X_test)
print(X_test_bow.shape)

(200, 15772)


In [36]:
clf = MultinomialNB()

In [37]:
clf.fit(X_train_bow, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is {}".format(accuracy))

The accuracy of the classifier on the test set is 0.765


In [47]:
# predict the sentiment of a negative review

In [48]:
review = "The movie was terrible. The music was underwhelming and the acting mediocre."

In [51]:
prediction = clf.predict(vectorizer.transform([review]))[0]
print(f'The sentiment predicted by the classifier is: {prediction}')

The sentiment predicted by the classifier is: 0


In [53]:
# Using movie tagline corpus

In [59]:
# genrating n-grams upto 1
vectorizer_ng_1 = CountVectorizer(ngram_range=(1,1))
ng_1 = vectorizer_ng_1.fit_transform(corpus)

In [60]:
# n-grams uptp 2
vectorizer_ng_2 = CountVectorizer(ngram_range=(1,2))
ng_2 = vectorizer_ng_2.fit_transform(corpus)

In [61]:
vectorizer_ng_3 = CountVectorizer(ngram_range=(1,3))
ng_3 = vectorizer_ng_3.fit_transform(corpus)

In [62]:
print("ng1, ng2 and ng3 have %i, %i and %i features respectively" %(ng_1.shape[1], ng_2.shape[1], ng_3.shape[1]))

ng1, ng2 and ng3 have 6614, 37100 and 76881 features respectively


In [63]:
# using higher n_gram model for sentiment analysis

In [68]:
X_train_ng = vectorizer_ng_3.fit_transform(X_train)
print(X_train_ng.shape)

(800, 271459)


In [66]:
X_test_ng = vectorizer_ng_3.transform(X_test)

In [69]:
print(X_test_ng.shape)

(200, 271459)


In [70]:
clf_ng = MultinomialNB()
clf_ng.fit(X_train_ng, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [73]:
accuracy_ng = clf_ng.score(X_test_ng, y_test)
print("The accuracy of the classifier on the test set is %.3f" %accuracy)

The accuracy of the classifier on the test set is 0.765


In [76]:
review_1 = "The movie was not good. The plot had several holes and the acting lacked panache."

In [77]:
prediction = clf_ng.predict(vectorizer_ng_3.transform([review_1]))[0]

In [78]:
print("The sentiment predicted by the classifier is %i" %(prediction))

The sentiment predicted by the classifier is 0


In [80]:
# Comparing the performance of different n-gram models: time and accuracy

In [83]:
start_time = time.time()

In [84]:
vectorizer_1 = CountVectorizer()
X_train_1 = vectorizer_1.fit_transform(X_train)
X_test_1 = vectorizer_1.transform(X_test)

In [85]:
clf = MultinomialNB()

In [86]:
clf.fit(X_train_1, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [92]:
print(f"The classifier took {time.time()-start_time} seconds to complete. The accuracy of the test set is {clf.score(X_test_1, y_test)}. The ngram representation had {X_train_1.shape[1]} features")

The classifier took 479.3252308368683 seconds to complete. The accuracy of the test set is 0.77. The ngram representation had 16067 features


In [93]:
# ngram upto 3

In [96]:
vectorizer_3 = CountVectorizer(ngram_range=(1,3))

In [98]:
X_train_3 = vectorizer_3.fit_transform(X_train)
X_test_3 = vectorizer_3.transform(X_test)

In [99]:
clf.fit(X_train_3, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [100]:
print(f"The classifier took {time.time()-start_time} seconds to complete. The accuracy of the test set is {clf.score(X_test_3, y_test)}. The ngram representation had {X_train_3.shape[1]} features")

The classifier took 952.6780827045441 seconds to complete. The accuracy of the test set is 0.78. The ngram representation had 271459 features
