## Sentiment Analysis for the IMDb Movie Reviews dataset         


### Using top n frequent words as model features


In [1]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Roghi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [116]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Roghi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [39]:
from nltk.corpus import movie_reviews
import re,string
from bs4 import BeautifulSoup

from random import shuffle 
from collections import defaultdict

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

from nltk import FreqDist

from nltk import classify, NaiveBayesClassifier

from nltk.metrics.scores import precision, recall, f_measure

This corpus has 2000 reviews, with 1000 having the 'pos' label and the remaining 1000 having the 'neg' label.

In [40]:
print ("Total reviews:", len(movie_reviews.fileids())) 
print ("Review categories:", movie_reviews.categories()) 
print ("Total positive reviews:", len(movie_reviews.fileids('pos'))) 
print ("Total negative reviews:", len(movie_reviews.fileids('neg'))) 
positive_review_file = movie_reviews.fileids('neg')[1] 
print ("Sample review file:", positive_review_file) 
#print("\nContent of this sample review is :", movie_reviews.raw(positive_review_file))

Total reviews: 2000
Review categories: ['neg', 'pos']
Total positive reviews: 1000
Total negative reviews: 1000
Sample review file: neg/cv001_19502.txt


#### Preprocessing step


The method below accepts a movie review as an input argument and returns tokens as an output.<br>
Different parts of preprocessing step are: <br>
- removing html tags
- removing sentences between [] or () that were extra explanation
- removing punctuation marks
- removing stop words
- extracting lemma and replace the main words

In [41]:
def clean_text(text):
    # removing the html strips
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # removing the square [] , {} , () , «»
    text = re.sub('\[[^]]*\]', ' ', text)
    text = re.sub('\([^]]*\)', ' ', text)
    text = re.sub('\«[^]]*\»', ' ', text)
    text = re.sub('\{[^]]*\}', ' ', text)
    tokens = word_tokenize(text)
    # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    # Lower the tokens
    tokens = [word.lower() for word in tokens]
    # Remove stopword
    tokens = [word for word in tokens if not word in stopwords.words("english")]
    # Lemmatize
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos = "v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos = "n") for word in tokens]
    return tokens

#### Feature Extraction step

For each movie review, preprocessing was done and separated positive reviews and negative reviews.

In [42]:
reviews = []
all_pos_words_clean = []
all_neg_words_clean = []

num=1
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        if num<=2000:
            num+=1
            fileid_words_clean = clean_text(movie_reviews.raw(fileid))
            for word in fileid_words_clean:
                if category=='pos':
                    all_pos_words_clean.append(word)
                else:
                    all_neg_words_clean.append(word)
                
            reviews.append((fileid_words_clean, category))

print("Num of reviews:", len(reviews)) 
print("Num of all_pos_words_clean:", len(all_pos_words_clean)) 
print("Num of all_neg_words_clean:", len(all_neg_words_clean)) 

#print ("Sample raw review:\n", movie_reviews.raw(review_fileids[10]))
#print ("\nSample tokenized review after preprocessing:\n", reviews[10])

Num of reviews: 2000
Num of all_pos_words_clean: 156252
Num of all_neg_words_clean: 144619


In this section, words frequency was calculated for all positive and negative reviews.

In [43]:
all_pos_words_frequency = FreqDist(all_pos_words_clean)
all_neg_words_frequency = FreqDist(all_neg_words_clean)

print("Length of All pos words:", len(all_pos_words_clean))
print("Length of All pos unique words:", len(all_pos_words_frequency))
print("----------------------------------------------------")
print("Length of All neg words:", len(all_neg_words_clean))
print("Length of All neg unique words:", len(all_neg_words_frequency))
print("----------------------------------------------------")
print ("\nTop 20 most frequently occurring POS words:\n", all_pos_words_frequency.most_common(20))
print ("\nTop 20 most frequently occurring NEG words:\n", all_neg_words_frequency.most_common(20))

Length of All pos words: 156252
Length of All pos unique words: 14803
----------------------------------------------------
Length of All neg words: 144619
Length of All neg unique words: 13930
----------------------------------------------------

Top 20 most frequently occurring POS words:
 [('film', 3026), ('movie', 1828), ('one', 1395), ('make', 1031), ('character', 874), ('see', 864), ('like', 826), ('time', 713), ('get', 709), ('story', 651), ('go', 647), ('good', 587), ('scene', 550), ('even', 532), ('take', 511), ('play', 505), ('end', 495), ('would', 489), ('well', 479), ('come', 460)]

Top 20 most frequently occurring NEG words:
 [('film', 2656), ('movie', 2201), ('one', 1339), ('make', 1065), ('like', 969), ('get', 866), ('character', 822), ('even', 715), ('go', 706), ('see', 699), ('time', 679), ('would', 644), ('good', 595), ('story', 557), ('bad', 540), ('scene', 523), ('much', 500), ('look', 488), ('play', 485), ('could', 485)]


For each label, the most 500 frequent words, were extracted and removed from all the tokens.

In [44]:
num_freq = 500
pos_most_freq_words = []
neg_most_freq_words = []

for word_freq in all_pos_words_frequency.most_common(num_freq):
    pos_most_freq_words.append(word_freq[0])
    
for word_freq in all_neg_words_frequency.most_common(num_freq):
    neg_most_freq_words.append(word_freq[0])   
    

In [45]:
#pos_words = set(all_pos_words_clean).difference(set(all_neg_words_clean))
#neg_words = set(all_neg_words_clean).difference(set(all_pos_words_clean))
pos_words = [word for word in all_pos_words_clean if not word in neg_most_freq_words]
neg_words = [word for word in all_neg_words_clean if not word in pos_most_freq_words]

print(len(pos_words))
print(len(neg_words))

79143
71172


Once more, the top 1000 words for each label were retrieved and utilized as the models' features for the remaining tokens.

In [46]:
sel_pos_words_frequency = FreqDist(pos_words)
sel_neg_words_frequency = FreqDist(neg_words)

print("Length of All pos words:", len(pos_words))
print("Length of All pos unique words:", len(sel_pos_words_frequency))
print("----------------------------------------------------")
print("Length of All neg words:", len(neg_words))
print("Length of All neg unique words:", len(sel_neg_words_frequency))
print("----------------------------------------------------")
print ("\nTop 20 most frequently occurring POS words:\n", sel_pos_words_frequency.most_common(20))
print ("\nTop 20 most frequently occurring NEG words:\n", sel_neg_words_frequency.most_common(20))

Length of All pos words: 79143
Length of All pos unique words: 14303
----------------------------------------------------
Length of All neg words: 71172
Length of All neg unique words: 13430
----------------------------------------------------

Top 20 most frequently occurring POS words:
 [('mother', 104), ('joe', 103), ('beautiful', 102), ('fiction', 100), ('strong', 92), ('fine', 90), ('touch', 89), ('dream', 88), ('son', 88), ('flaw', 88), ('tale', 87), ('others', 85), ('person', 84), ('art', 84), ('remain', 82), ('ship', 81), ('voice', 81), ('science', 80), ('throughout', 79), ('image', 78)]

Top 20 most frequently occurring NEG words:
 [('bore', 167), ('worst', 143), ('waste', 139), ('unfortunately', 122), ('maybe', 118), ('suppose', 116), ('tv', 113), ('guess', 90), ('stupid', 88), ('career', 85), ('car', 84), ('worse', 84), ('batman', 83), ('wild', 80), ('mess', 79), ('cop', 77), ('terrible', 76), ('none', 76), ('obvious', 76), ('early', 74)]


In [47]:
num_frequently = 1000
most_common_words = sel_neg_words_frequency.most_common(num_frequently) + sel_pos_words_frequency.most_common(num_frequently)


In method below, tuples were created for these 2000 frequent words.

In [48]:
def review_features(review_words, word_features):    
    review_unique_words = set(review_words)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in review_unique_words)
    return features

In [49]:
word_features = [item[0] for item in most_common_words]

feature_set = [(review_features(doc, word_features), category) for (doc, category) in reviews]


In [50]:
print ("Sample document:", reviews[500])
print ("\nNum of features of Sample document:", len(reviews[500][0]))

sample_doc_new_features = review_features(reviews[500][0],word_features)
print("\nNum of new features:", len(sample_doc_new_features))
#print("\nSample document new features:\n", sample_doc_new_features)


Sample document: (['always', 'careful', 'first', 'official', 'studio', 'release', 'gate', 'year', 'obviously', 'film', 'studio', 'great', 'hop', 'however', 'month', 'picture', 'movie', 'like', 'firestorm', 'least', 'give', 'perspective', 'good', 'movie', 'really', 'fact', 'good', 'chance', 'still', 'play', 'near'], 'neg')

Num of features of Sample document: 31

Num of new features: 1411


#### Training step

The first 1600 words were selected for training, while the final 400 words were chosen for testing.

In [51]:
shuffle(feature_set)

X_all = [list(item[0]) for item in feature_set]
y_all = [item[1] for item in feature_set]

X_train = X_all[:1600]
y_train = y_all[:1600]
X_test = X_all[1600:]
y_test = y_all[1600:]
print("Train Size:", len(X_train), "\nTest Size:", len(X_test))

Train Size: 1600 
Test Size: 400


A technique for dividing data into test and train sets.

In [52]:
def k_fold_generator(X, y, k_fold):
    subset_size = int(len(X) / k_fold)
    #print("subset_size:", subset_size)
    for k in range(k_fold):
        X_train = X[:k * subset_size] + X[(k + 1) * subset_size:]
        X_valid = X[k * subset_size:][:subset_size]
        y_train = y[:k * subset_size] + y[(k + 1) * subset_size:]
        y_valid = y[k * subset_size:][:subset_size]

        yield X_train, y_train, X_valid, y_valid

The NaiveBayes model was called five times for various portions of the data, with fold=5 being assumed as the input argument for the above technique. <br> In the end, the accuracy was determined by averaging them.

In [54]:
k_fold=5
counter=0
sumAccuracy = 0
for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in k_fold_generator(X_train, y_train, k_fold):
    #print(len(X_fold_train), len(y_fold_train), len(X_fold_valid), len(y_fold_valid))
    counter += 1
    print("\nIteration : ", counter)
    
    train_features = []
    for i in range(len(X_fold_train)):
        train_features.append((dict([word, True] for word in X_fold_train[i]), y_fold_train[i]))
        
    classifier = NaiveBayesClassifier.train(train_features) 
    
    test_features = []
    for i in range(len(X_fold_valid)):
        test_features.append((dict([word, True] for word in X_fold_valid[i]), y_fold_valid[i]))  
            
    accuracy = classify.accuracy(classifier, test_features)
    sumAccuracy += accuracy
    print("Accuracy:", accuracy) 

print("\nAverage Accuracy : ", sumAccuracy/k_fold)


Iteration :  1
Accuracy: 0.478125

Iteration :  2
Accuracy: 0.496875

Iteration :  3
Accuracy: 0.44375

Iteration :  4
Accuracy: 0.4875

Iteration :  5
Accuracy: 0.503125

Average Accuracy :  0.48187500000000005


10 features with the most effect were chosen.

In [55]:
print (classifier.show_most_informative_features(10))

Most Informative Features
        contains(stupid) = True              neg : pos    =      1.0 : 1.0
           contains(buy) = True              neg : pos    =      1.0 : 1.0
           contains(gay) = True              neg : pos    =      1.0 : 1.0
        contains(affair) = True              neg : pos    =      1.0 : 1.0
          contains(slow) = True              neg : pos    =      1.0 : 1.0
           contains(fox) = True              neg : pos    =      1.0 : 1.0
          contains(band) = True              neg : pos    =      1.0 : 1.0
           contains(joe) = True              neg : pos    =      1.0 : 1.0
        contains(gibson) = True              neg : pos    =      1.0 : 1.0
          contains(shin) = True              neg : pos    =      1.0 : 1.0
None


Model evaluation was done using test data.

In [61]:
test_features = []
for i in range(len(X_test)):
    test_features.append((dict([word, True] for word in X_test[i]), y_test[i]))
         
accuracy = classify.accuracy(classifier, test_features)
        
print("Accuracy:", accuracy) 

Accuracy: 0.4825
