## Sentiment Analysis for the IMDb Movie Reviews dataset     

### Using BoW (Bag-of-Words) technique based on ngrams

In [27]:
from nltk.corpus import movie_reviews, stopwords
import re,string
from bs4 import BeautifulSoup
from random import shuffle 
from collections import defaultdict

from nltk import sent_tokenize, word_tokenize
from nltk import WordNetLemmatizer

from nltk import classify, ngrams
from nltk import NaiveBayesClassifier
 
from nltk.metrics.scores import precision, recall, f_measure

This corpus has 2000 reviews, with 1000 having the 'pos' label and the remaining 1000 having the 'neg' label.

In [28]:
print ("Total reviews:", len(movie_reviews.fileids())) 
print ("Review categories:", movie_reviews.categories()) 
print ("Total positive reviews:", len(movie_reviews.fileids('pos'))) 
print ("Total negative reviews:", len(movie_reviews.fileids('neg'))) 
positive_review_file = movie_reviews.fileids('neg')[1] 
#print ("Sample review file:", positive_review_file) 
#print("\nContent of this sample review is :", movie_reviews.raw(positive_review_file))

Total reviews: 2000
Review categories: ['neg', 'pos']
Total positive reviews: 1000
Total negative reviews: 1000


#### Preprocessing step

The method below accepts a movie review as an input argument and returns tokens as an output.<br>
Different parts of preprocessing step are: <br>
- removing html tags
- removing sentences between [] or () that were extra explanation
- removing punctuation marks
- removing stop words
- extracting lemma and replace the main words

In [29]:
def clean_text(text):
    # removing the html strips
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # removing the square [] , {} , () , «»
    text = re.sub('\[[^]]*\]', ' ', text)
    text = re.sub('\([^]]*\)', ' ', text)
    text = re.sub('\«[^]]*\»', ' ', text)
    text = re.sub('\{[^]]*\}', ' ', text)
    
    tokens = word_tokenize(text)
    # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    # Lower the tokens
    tokens = [word.lower() for word in tokens]
    return tokens

#### Feature Extraction step

In this step, unigram tokensand bigram tokens, regardless of their places, are assuemd as features.<br>

In [30]:
def generate_ngrams(words, ngram_words, forward = True):
    new_text = []
    index = 0
    if not forward:
        words = list(reversed(words))
    while index < len(words):
        [new_word, new_index] = concatenate_words(index, words, ngram_words, forward)
        new_text.append(new_word)
        index = new_index+1 if index != new_index else index+1
    if not forward:
        return list(reversed(new_text))
    return new_text
 
def concatenate_words(index, text, ngram_words, forward):
    words = text[index]
    if index == len(text)-1:
        return words, index
    if words.split(' ')[0] in ngram_words:
        [new_word, new_index] = concatenate_words(index+1, text, ngram_words, forward)
        if forward:
            words = words + ' ' + new_word
        else:
            words = new_word + ' ' + words
        index = new_index
    return words, index

In [31]:
linking_words = ['and', 'any', 'anyone', 'anything', 'are', 'be', 'best', 'can', 'cannot', 'cant',
                 "can't", 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'done',
                 "don't", 'either', 'else', 'even', 'every', 'for', 'from', 'have', "haven't",
                 "he's", 'is', "isn't", 'it', 'its', "i've", 'just', 'like', 'lots', 'many', 'maybe',
                 'me', 'might', 'more', 'must', 'my', 'never', 'no', 'none', 'not', 'nothing', 'now',
                 'of', 'on', 'once', 'one', 'only', 'or', 'overly', 'perfectly', 'perhaps',
                 'probably', 'seemed', 'seems', "she's", 'should', 'simply', 'so', 'some',
                 'somehow', 'something', 'soon', 'start', 'takes', 'tell', 'thank', "that's",
                 'the', 'their', 'them', 'then', 'there', "there's", 'they', "they're", 'this',
                 'those', 'to', 'too', 'totally', 'tried', 'truly', 'try', 'turns', 'until',
                 'upon', 'use', 'very', 'wait', 'was', 'well', 'went', 'were', 'whether', 'which',
                 'whole', 'why', 'will', 'wish', "won't", 'would',"wouldn't", 'you', "you'll",
                 'your', "you're", 'yourself']

In [32]:
print(generate_ngrams(['film', 'wasn\'t', 'good'], linking_words))
print(generate_ngrams(['film', 'wasn\'t', 'good'], linking_words, False))
print(generate_ngrams(['film', 'was', 'so','good','and','interesting'], linking_words))
print(generate_ngrams(['film', 'was', 'so','good','and','interesting'], linking_words,False))

['film', "wasn't", 'good']
['film', "wasn't", 'good']
['film', 'was so good', 'and interesting']
['film was so', 'good and', 'interesting']


In [33]:
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    fileid_words_clean = clean_text(movie_reviews.raw(fileid))
    pos_reviews.append(fileid_words_clean)

neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    fileid_words_clean = clean_text(movie_reviews.raw(fileid))
    neg_reviews.append(fileid_words_clean)

In [34]:
pos_features = []
for words in pos_reviews:
    pos_features.append((dict([word, True] for word in words), 'pos'))
    pos_features.append((dict([word, True] for word in generate_ngrams(words, linking_words)), 'pos'))
    pos_features.append((dict([word, True] for word in generate_ngrams(words, linking_words, forward = False)), 'pos'))

neg_features = []
for words in neg_reviews:
    neg_features.append((dict([word, True] for word in words), 'neg'))
    neg_features.append((dict([word, True] for word in generate_ngrams(words, linking_words)), 'neg'))
    neg_features.append((dict([word, True] for word in generate_ngrams(words, linking_words, forward = False)), 'neg'))


#### Training step

The first 1600 words were selected for training, while the final 400 words were chosen for testing.

In [36]:
all_features = pos_features + neg_features
shuffle(all_features)

X_all = [list(item[0]) for item in all_features]
y_all = [item[1] for item in all_features]

X_train = X_all[:5400]
y_train = y_all[:5400]
X_test = X_all[5400:]
y_test = y_all[5400:]

#X_train = X_all[:3600]
#y_train = y_all[:3600]
#X_test = X_all[3600:]
#y_test = y_all[3600:]
print(len(X_train), len(X_test))

5400 600


A technique for dividing data into test and train sets.

In [37]:
def k_fold_generator(X, y, k_fold):
    subset_size = int(len(X) / k_fold)
    #print("subset_size:", subset_size)
    for k in range(k_fold):
        X_train = X[:k * subset_size] + X[(k + 1) * subset_size:]
        X_valid = X[k * subset_size:][:subset_size]
        y_train = y[:k * subset_size] + y[(k + 1) * subset_size:]
        y_valid = y[k * subset_size:][:subset_size]

        yield X_train, y_train, X_valid, y_valid

The NaiveBayes model was called five times for various portions of the data, with fold=5 being assumed as the input argument for the above technique. <br> In the end, the accuracy was determined by averaging them.

In [38]:
k_fold=5
counter=0
sumAccuracy = 0
for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in k_fold_generator(X_train, y_train, k_fold):
    #print(len(X_fold_train), len(y_fold_train), len(X_fold_valid), len(y_fold_valid))
    counter += 1
    print("\nIteration : ", counter)
    
    train_features = []
    for i in range(len(X_fold_train)):
        train_features.append((dict([word, True] for word in X_fold_train[i]), y_fold_train[i]))
        
    classifier = NaiveBayesClassifier.train(train_features) 
    classifier.train
    
    test_features = []
    for i in range(len(X_fold_valid)):
        test_features.append((dict([word, True] for word in X_fold_valid[i]), y_fold_valid[i]))  
            
    accuracy = classify.accuracy(classifier, test_features)
    sumAccuracy += accuracy
    print("Accuracy:", accuracy) 

print("\nAverage Accuracy : ", sumAccuracy/k_fold)


Iteration :  1
Accuracy: 0.9277777777777778

Iteration :  2
Accuracy: 0.9287037037037037

Iteration :  3
Accuracy: 0.9416666666666667

Iteration :  4
Accuracy: 0.925

Iteration :  5
Accuracy: 0.9296296296296296

Average Accuracy :  0.9305555555555556


10 features with the most effect were chosen.

In [39]:
print (classifier.show_most_informative_features(20))

Most Informative Features
                   faced = True              pos : neg    =     21.8 : 1.0
             magnificent = True              pos : neg    =     20.5 : 1.0
              uninspired = True              neg : pos    =     18.1 : 1.0
                 italian = True              pos : neg    =     17.9 : 1.0
           unintentional = True              neg : pos    =     16.0 : 1.0
                    skip = True              neg : pos    =     14.7 : 1.0
              incoherent = True              neg : pos    =     14.0 : 1.0
                  tucker = True              pos : neg    =     13.4 : 1.0
                  rehash = True              neg : pos    =     13.3 : 1.0
               stretched = True              neg : pos    =     13.3 : 1.0
                   inane = True              neg : pos    =     13.3 : 1.0
                    bore = True              neg : pos    =     13.0 : 1.0
                 breasts = True              neg : pos    =     12.9 : 1.0

Model evaluation was done using test data.

In [40]:
test_features = []
for i in range(len(X_test)):
    test_features.append((dict([word, True] for word in X_test[i]), y_test[i]))
         
y_pred, gold_labels = defaultdict(set), defaultdict(set)
    
for i, (features, label) in enumerate(test_features):
    y_pred[classifier.classify(features)].add(i)
    gold_labels[label].add(i) 

for label in y_pred:
    print(label, 'Precision:', precision(gold_labels[label], y_pred[label]))
    print(label, 'Recall:', recall(gold_labels[label], y_pred[label]))
    print(label, 'F1-Score:', f_measure(gold_labels[label], y_pred[label]))
accuracy = classify.accuracy(classifier, test_features)
        
print("Accuracy:", accuracy) 

pos Precision: 0.8662420382165605
pos Recall: 0.9645390070921985
pos F1-Score: 0.9127516778523489
neg Precision: 0.965034965034965
neg Recall: 0.8679245283018868
neg F1-Score: 0.9139072847682119
Accuracy: 0.9133333333333333
