<h1>Predicting whether a movie review is Negative or Positive</h1>

In [1]:
import numpy as np
import pandas as pd

# load the tab-separated movie review file into a DataFrame
df = pd.read_csv('moviereviews.tsv', delimiter='\t')

In [2]:
# preview the first rows of the loaded data
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
# count the missing (NaN) entries in each column
df.isna().sum()

label      0
review    35
dtype: int64

In [4]:
# drop, in place, every row that contains at least one NaN value
df.dropna(axis=0, how='any', inplace=True)

In [5]:
# detecting whitespace-only reviews
# A row is flagged when its review is a string made up solely of whitespace.
# The 'review' column is read by name (rather than unpacking each row tuple),
# so this cell keeps working when df holds more than two columns, e.g. when it
# is re-run after extra columns have been attached to df.
is_blank = df['review'].map(
    lambda review: isinstance(review, str) and review.isspace()
).astype(bool)

# index labels of the flagged rows, as plain Python values
blanks = df.index[is_blank].tolist()

print(len(blanks),'Blanks:\n',blanks)

27 Blanks:
 [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [6]:
# remove, in place, the rows whose index labels were collected in `blanks`
df.drop(index=blanks, inplace=True)

In [7]:
# splitting data into train & test sets

from sklearn.model_selection import train_test_split

# features are the review texts, targets are the labels
X = df['review']
y = df['label']

# hold out 33% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every score computed from this split) reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
#function to print out the scores of the pipelines
def print_scores(y_test, predictions):
    """Print the confusion matrix, classification report and accuracy score.

    Parameters
    ----------
    y_test : array-like
        The true labels.
    predictions : array-like
        The predicted labels, compared against ``y_test``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # imported locally so the function does not depend on another cell having
    # already bound `metrics` in the notebook namespace
    from sklearn import metrics

    #printing confusion matrix
    print('Confusion Matrix'.center(30,'-'))
    print(metrics.confusion_matrix(y_test,predictions))

    #printing classification report
    print('Classification Report'.center(30,'-'))
    print(metrics.classification_report(y_test,predictions))

    #printing accuracy score
    print('Accuracy Score: ',metrics.accuracy_score(y_test,predictions),'\n')

In [9]:
# building pipelines to vectorize the data, and comparing their scores
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

def _fit_and_report(title, classifier):
    """Fit a TF-IDF + classifier pipeline and print its scores on the test split.

    Parameters
    ----------
    title : str
        Heading printed (centered to 55 characters) above the scores.
    classifier : estimator
        The final step of the pipeline, placed after the TF-IDF vectorizer.

    Returns
    -------
    tuple
        ``(fitted_pipeline, predictions)`` where ``predictions`` are the
        pipeline's predictions for ``X_test``.
    """
    pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),('clf', classifier)])
    pipeline.fit(X_train, y_train)

    # forming a prediction set
    predictions = pipeline.predict(X_test)

    # printing the heading followed by the scores
    print(title.center(55,'-'),'\n')
    print_scores(y_test,predictions)

    return pipeline, predictions


# ---- Naive Bayes pipeline ----
text_clf_nb, predictions_nb = _fit_and_report('Naive Bayes pipeline', MultinomialNB())

# ----LinearSVC pipeline----
text_clf_lsvc, predictions_lsvc = _fit_and_report('LinearSVC pipeline', LinearSVC())

------------------Naive Bayes pipeline----------------- 

-------Confusion Matrix-------
[[280  40]
 [ 82 238]]
----Classification Report-----
              precision    recall  f1-score   support

         neg       0.77      0.88      0.82       320
         pos       0.86      0.74      0.80       320

    accuracy                           0.81       640
   macro avg       0.81      0.81      0.81       640
weighted avg       0.81      0.81      0.81       640

Accuracy Score:  0.809375 

-------------------LinearSVC pipeline------------------ 

-------Confusion Matrix-------
[[270  50]
 [ 49 271]]
----Classification Report-----
              precision    recall  f1-score   support

         neg       0.85      0.84      0.85       320
         pos       0.84      0.85      0.85       320

    accuracy                           0.85       640
   macro avg       0.85      0.85      0.85       640
weighted avg       0.85      0.85      0.85       640

Accuracy Score:  0.8453125 



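The next section classifies the reviews with NLTK's VADER sentiment analyzer instead of a trained model. Note that VADER is scored on the full cleaned dataset, whereas the pipelines above are scored on the held-out test split only.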

In [10]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER analyzer instance
sid = SentimentIntensityAnalyzer()

# using sid to attach compound scores to dataset:
#   'scores'     - the dict returned by polarity_scores for each review
#   'compound'   - the 'compound' entry of that dict
#   'comp_score' - 'pos' when compound >= 0 (so exactly 0 counts as 'pos'), else 'neg'
df['scores'] = df['review'].map(sid.polarity_scores)
df['compound'] = df['scores'].map(lambda scores: scores['compound'])
df['comp_score'] = df['compound'].map(lambda value: 'neg' if value < 0 else 'pos')

# printing out scores
print_scores(df['label'], df['comp_score'])

-------Confusion Matrix-------
[[427 542]
 [164 805]]
----Classification Report-----
              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938

Accuracy Score:  0.6357069143446853 



In [11]:
# testing out my own movie review

# adjacent string literals are joined into one sentence pair, same text as before
myreview = ("A movie I really wanted to love was terrible. "
            "I'm sure the producers had the best intentions, but the execution was lacking.")

print('Naive Bayes:',text_clf_nb.predict([myreview]))

print('LinearSVC:' ,text_clf_lsvc.predict([myreview]))

# score the review once and reuse the compound value in every branch
compound = sid.polarity_scores(myreview)['compound']

if compound > 0:
    print('NLTK VADER: Positive')
elif compound < 0:
    print('NLTK VADER: Negative')
else:
    # same 'NLTK VADER:' prefix as the other two branches
    print('NLTK VADER: Neutral')

Naive Bayes: ['neg']
LinearSVC: ['neg']
NLTK VADER: Positive
