<img src="https://drive.google.com/uc?export=view&id=1PfklOc_ZVdRvAuIQhL3E25EYIDVizjre">


# Importing Movie Review Dataset from NLTK.
Sentiment Polarity Dataset Version 2.0
Bo Pang and Lillian Lee

http://www.cs.cornell.edu/people/pabo/movie-review-data/

<b>Citation Info</b><br> 
Bo Pang and Lillian Lee. 2004. A Sentimental Education: Sentiment Analysis 
Using Subjectivity Summarization Based on Minimum Cuts. In ACL.


In [1]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


# Dataset Analysis

In [2]:
#types of reviews
print("Category of Reviews")
print(movie_reviews.categories())
print("-----------------------")
print("Total Words in Dataset")
#Number of words
print(len(movie_reviews.words()))

Category of Reviews
['neg', 'pos']
-----------------------
Total Words in Dataset
1583820


In [3]:
#Number of Reviews
print("Total Reviews in Dataset")
print(len(movie_reviews.fileids()))
print("---------------------------")
#Postive and Negative Reviews
print("No. of review of each Type")
print(len(movie_reviews.fileids("pos")),len(movie_reviews.fileids("neg")))

Total Reviews in Dataset
2000
---------------------------
No. of review of each Type
1000 1000


# Data Preprocessing<br>
<img src="https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1535125878/NLTK3_zwbdgg.png">

In [4]:
"""
documents is list of tuples containing review and its category
"""
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
         documents.append((movie_reviews.words(fileid),category))
print("Document 0 with its category stored in a tuple\n")
print(documents[0])
"""
Lets shuffle the reviews to have a better split in training and testing data
"""
import random 
random.shuffle(documents)

Document 0 with its category stored in a tuple

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')


In [5]:
"""
We will store our Review and category in two different lists
"""
category = [category for document,category in documents]
reviews = [" ".join(document)for document,category in documents]
print(type(category),len(category))
print(type(reviews),len(reviews))

<class 'list'> 2000
<class 'list'> 2000


# Data Cleaning

In [149]:
"""
Utility function to clean the review. We Also considered punctuation marks as Stop words
"""
lemmatizer = WordNetLemmatizer()
stops = stopwords.words('english')
punc = list(string.punctuation)
stops = stops + punc

"""
Extracting Part of Speech (POS) for lemmatizer to work
"""
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
"""
Function to clean the review by removing stop words , lemmatizing and converting to lower cases
"""
counter=0
def clean_review(words):
    words = word_tokenize(words)
    output_words = []
    for w in words:
        if w.lower() not in stops:
            """
            we will not convert our word to lower case 
            before lammetization. This is because it change the 
            pos of the word.
            After we are done with lemmatization we will convert into 
            lower and append to our output_words[]
            """
            """
            pos_tag() requires an array , if we simply
            pass pass pos_tag(word) it will return pos 
            for each character of word
            So correct way is pos_tag([word])
            """
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [150]:
import time 
begin = time.time()
clean_reviews = [clean_review(review) for review in reviews]
time.sleep(1)
end = time.time()
print("##########")
print(f"Total runtime of the Data cleaning is {end - begin}")
print("##########")

##########
Total runtime of the Data cleaning is 1224.8879384994507
##########


In [46]:
print("Words in Review 1 before cleaning")
print(len(reviews[0]))
print("----------------------------------")
print("Words in Review 1 after cleaning")
print(len(clean_reviews[0]))

Words in Review 1 before cleaning
2355
----------------------------------
Words in Review 1 after cleaning
213


# Data Preparation for Count Vectorizer and TF-IDF Vectorizer

In [93]:
"""
Count Vectorizer take input in the form of string So we convert this.
"""

clean_reviews_cv = [" ".join(review) for review in clean_reviews]
total_features = [len(review) for review in clean_reviews_cv]
print("Total Words in the whole document after cleaning")
sum(total_features)

Total Words in the whole document after cleaning


4798259

#### Train Test Split

In [109]:
X_train, X_test, y_train, y_test = train_test_split(clean_reviews_cv,category, test_size=0.33, random_state=42)
print("SHAPE OF TRAINING DATA")
print(len(X_train),len(y_train))
print("-------------------------")
print("SHAPE OF TEST DATA")
print(len(X_test),len(y_test))

SHAPE OF TRAINING DATA
1340 1340
-------------------------
SHAPE OF TEST DATA
660 660


# Count Vectorizer

In [110]:
""" 
Count Vectorizer on Train data
"""
count_vec = CountVectorizer(max_features=6000,lowercase=True)
x_train_cv = count_vec.fit_transform(X_train)
print(x_train_cv.todense())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 6 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [111]:
"""
Count Vectorizer on Test Data
"""
x_test_cv = count_vec.transform(X_test)
print(x_test_cv.todense())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


# Tf-idf Vectorizer<br>
<img src="https://www.romainberg.com/wp-content/uploads/TF_IDF-final-1024x399.png">

In [112]:
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(X_train)
x_train_tfidf.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.03594997, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [113]:
x_test_tfidf = vectorizer.transform(X_test)
print(x_test_cv.todense())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


# Applying relevant model

- <font color="red"> Support Vector Machine
- <font color="red"> Logistic Regression
- <font color="red"> Naive Bayes

In [116]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [141]:
"""
SVM
"""
svc = SVC()
svc.fit(x_train_cv,y_train)

"""
Logistic Regression
"""
lr = LogisticRegression(max_iter=100000)
lr.fit(x_train_cv,y_train)

"""
Naive Bayes
"""
nb = MultinomialNB()
nb.fit(x_train_cv,y_train)

MultinomialNB()

In [145]:
y_pred_svc = svc.predict(x_test_cv)
y_pred_lr = lr.predict(x_test_cv)
y_pred_nb = nb.predict(x_test_cv)

# Model Evaluation and Confusion Matrix

## SVM

In [146]:
print("F1 score for SVM model")
print(f1_score(y_test,y_pred_svc,average="binary",pos_label="neg"))
print("--------------------")
print("CONFUSION MATRIX")
print(confusion_matrix(y_test,y_pred_svc))

F1 score for SVM model
0.8102409638554215
--------------------
CONFUSION MATRIX
[[269  57]
 [ 69 265]]


## Logistic Regression

In [147]:
print("F1 score for Logistic Regression model")
print(f1_score(y_test,y_pred_lr,average="binary",pos_label="neg"))
print("--------------------")
print("CONFUSION MATRIX")
print(confusion_matrix(y_test,y_pred_lr))

F1 score for Logistic Regression model
0.8421052631578947
--------------------
CONFUSION MATRIX
[[272  54]
 [ 48 286]]


## Naive Bayes

In [148]:
print("F1 score for Naive Bayes model")
print(f1_score(y_test,y_pred_nb,average="binary",pos_label="neg"))
print("--------------------")
print("CONFUSION MATRIX")
print(confusion_matrix(y_test,y_pred_nb))

F1 score for Naive Bayes model
0.8320726172465961
--------------------
CONFUSION MATRIX
[[275  51]
 [ 60 274]]


# <font color="red"> Logistic Regression is giving Highest F1 score 