## Sentiment Analysis for the IMDb Movie Reviews dataset     

### Using BOW technique based on bigrams

In [1]:
from nltk.corpus import movie_reviews
import re,string
from bs4 import BeautifulSoup
from random import shuffle 
from collections import defaultdict

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

from nltk import FreqDist

from nltk import classify, ngrams, NaiveBayesClassifier

from nltk.metrics.scores import precision, recall, f_measure

This corpus has 2000 reviews, with 1000 having the 'pos' label and the remaining 1000 having the 'neg' label.

In [2]:
print ("Total reviews:", len(movie_reviews.fileids())) 
print ("Review categories:", movie_reviews.categories()) 
print ("Total positive reviews:", len(movie_reviews.fileids('pos'))) 
print ("Total negative reviews:", len(movie_reviews.fileids('neg'))) 
positive_review_file = movie_reviews.fileids('neg')[1] 
print ("Sample review file:", positive_review_file) 
#print("\nContent of this sample review is :", movie_reviews.raw(positive_review_file))

Total reviews: 2000
Review categories: ['neg', 'pos']
Total positive reviews: 1000
Total negative reviews: 1000
Sample review file: neg/cv001_19502.txt


#### Preprocessing step

The method below accepts a movie review as an input argument and returns tokens as an output.<br>
Different parts of preprocessing step are: <br>
- removing html tags
- removing sentences between [] or () that were extra explanation
- removing punctuation marks
- removing stop words
- extracting lemma and replace the main words

In [3]:
stop_words = stopwords.words('english')
linking_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most',
                   'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too',
                   'very', 'just', 'but']

bigram_stop_words = set(stop_words) - set(linking_words)

In [4]:
def clean_text(text, stop_words):
    # removing the html strips
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # removing the square [] , {} , () , «»
    text = re.sub('\[[^]]*\]', ' ', text)
    text = re.sub('\([^]]*\)', ' ', text)
    text = re.sub('\«[^]]*\»', ' ', text)
    text = re.sub('\{[^]]*\}', ' ', text)
    tokens = word_tokenize(text)
    # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    # Lower the tokens
    tokens = [word.lower() for word in tokens]
    # Remove stopword
    tokens = [word for word in tokens if not word in stop_words]
    # Lemmatize
    #lemma = WordNetLemmatizer()
    #tokens = [lemma.lemmatize(word, pos = "v") for word in tokens]
    #tokens = [lemma.lemmatize(word, pos = "n") for word in tokens]
    return tokens

In [5]:
pos_reviews = []
bigram_pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    #extract unigram cleaned words
    fileid_words_clean = clean_text(movie_reviews.raw(fileid),stop_words)
    pos_reviews.append(fileid_words_clean)
    #extract bigram cleaned words
    fileid_words_clean = clean_text(movie_reviews.raw(fileid),bigram_stop_words)
    bigram_pos_reviews.append(fileid_words_clean)

neg_reviews = []
bigram_neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    #extract unigram cleaned words
    fileid_words_clean = clean_text(movie_reviews.raw(fileid),stop_words)
    neg_reviews.append(fileid_words_clean)
    #extract bigram cleaned words
    fileid_words_clean = clean_text(movie_reviews.raw(fileid),bigram_stop_words)
    bigram_neg_reviews.append(fileid_words_clean)


In [6]:
#print("Raw text of sample review:\n", (movie_reviews.raw(pos_review_fileids[0])))
print("\nBOW of sample review:\n", (pos_reviews[0])) 


BOW of sample review:
 ['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'superheroes', 'log', 'great', 'supporting', 'roles', 'big', 'surprise', 'graham', 'cringed', 'first', 'time', 'opened', 'mouth', 'imagining', 'attempt', 'irish', 'accent', 'actually', 'half', 'bad', 'film', 'however', 'good', 'r', 'strong', 'sexuality', 'language', 'drug', 'content']


#### Feature Extraction step

In this step, unigram tokensand bigram tokens, regardless of their places, are assuemd as features.<br>

In [7]:
def bag_of_words(words):
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [8]:
def bag_of_ngrams(words, n=2):
    words_ng = []
    for item in iter(ngrams(words, n)):
        words_ng.append(item)
    words_dictionary = dict([word, True] for word in words_ng)    
    return words_dictionary

In [9]:
pos_reviews_set = []
for i in range(len(pos_reviews)):
    unigram_features = bag_of_words(pos_reviews[i])
    bigram_features = bag_of_ngrams(bigram_pos_reviews[i])
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
    pos_reviews_set.append((all_features, 'pos'))

neg_reviews_set = []
for i in range(len(neg_reviews)):
    unigram_features = bag_of_words(neg_reviews[i])
    bigram_features = bag_of_ngrams(bigram_neg_reviews[i])
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
    neg_reviews_set.append((all_features, 'neg'))    

In [10]:
#print ("\nSample positive review:\n",pos_reviews_set[0])
#print ("\nSample negative review:\n", neg_reviews_set[0])

#### Training step

The first 1600 words were selected for training, while the final 400 words were chosen for testing.

In [11]:
all_features = pos_reviews_set + neg_reviews_set
shuffle(all_features)

X_all = [list(item[0]) for item in all_features]
y_all = [item[1] for item in all_features]

X_train = X_all[:1600]
y_train = y_all[:1600]
X_test = X_all[1600:]
y_test = y_all[1600:]
print("Train Size:", len(X_train), "\nTest Size:", len(X_test))

Train Size: 1600 
Test Size: 400


A technique for dividing data into test and train sets.

In [12]:
def k_fold_generator(X, y, k_fold):
    subset_size = int(len(X) / k_fold)
    #print("subset_size:", subset_size)
    for k in range(k_fold):
        X_train = X[:k * subset_size] + X[(k + 1) * subset_size:]
        X_valid = X[k * subset_size:][:subset_size]
        y_train = y[:k * subset_size] + y[(k + 1) * subset_size:]
        y_valid = y[k * subset_size:][:subset_size]

        yield X_train, y_train, X_valid, y_valid

The NaiveBayes model was called five times for various portions of the data, with fold=5 being assumed as the input argument for the above technique. <br> In the end, the accuracy was determined by averaging them.

In [13]:
k_fold=5
counter=0
sumAccuracy = 0
for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in k_fold_generator(X_train, y_train, k_fold):
    #print(len(X_fold_train), len(y_fold_train), len(X_fold_valid), len(y_fold_valid))
    counter += 1
    print("\nIteration : ", counter)
    
    train_features = []
    for i in range(len(X_fold_train)):
        train_features.append((dict([word, True] for word in X_fold_train[i]), y_fold_train[i]))
        
    classifier = NaiveBayesClassifier.train(train_features) 
    classifier.train
    
    test_features = []
    for i in range(len(X_fold_valid)):
        test_features.append((dict([word, True] for word in X_fold_valid[i]), y_fold_valid[i]))  
            
    accuracy = classify.accuracy(classifier, test_features)
    sumAccuracy += accuracy
    print("Accuracy:", accuracy) 

print("\nAverage Accuracy : ", sumAccuracy/k_fold)


Iteration :  1
Accuracy: 0.80625

Iteration :  2
Accuracy: 0.796875

Iteration :  3
Accuracy: 0.734375

Iteration :  4
Accuracy: 0.815625

Iteration :  5
Accuracy: 0.790625

Average Accuracy :  0.7887500000000001


10 features with the most effect were chosen.

In [15]:
print (classifier.show_most_informative_features(10))

Most Informative Features
        ('one', 'worst') = True              neg : pos    =     17.3 : 1.0
             wonderfully = True              pos : neg    =     12.9 : 1.0
     ('batman', 'robin') = True              neg : pos    =     11.7 : 1.0
                ordinary = True              pos : neg    =     10.4 : 1.0
               redeeming = True              neg : pos    =      9.5 : 1.0
             outstanding = True              pos : neg    =      8.9 : 1.0
               fantastic = True              pos : neg    =      8.9 : 1.0
                  arnold = True              neg : pos    =      8.8 : 1.0
       ('waste', 'time') = True              neg : pos    =      8.7 : 1.0
                horribly = True              neg : pos    =      8.1 : 1.0
None


Model evaluation was done using test data.

In [16]:
test_features = []
for i in range(len(X_test)):
    test_features.append((dict([word, True] for word in X_test[i]), y_test[i]))
         
y_pred, gold_labels = defaultdict(set), defaultdict(set)
    
for i, (features, label) in enumerate(test_features):
    y_pred[classifier.classify(features)].add(i)
    gold_labels[label].add(i) 

for label in y_pred:
    print(label, 'Precision:', precision(gold_labels[label], y_pred[label]))
    print(label, 'Recall:', recall(gold_labels[label], y_pred[label]))
    print(label, 'F1-Score:', f_measure(gold_labels[label], y_pred[label]))
accuracy = classify.accuracy(classifier, test_features)
        
print("Accuracy:", accuracy) 

pos Precision: 0.7198275862068966
pos Recall: 0.8434343434343434
pos F1-Score: 0.7767441860465116
neg Precision: 0.8154761904761905
neg Recall: 0.6782178217821783
neg F1-Score: 0.7405405405405405
Accuracy: 0.76
