## Sentiment Analysis for the IMDb Movie Reviews dataset         

### Using BOW technique based on unigrams

In [13]:
from nltk.corpus import movie_reviews
import re,string
from bs4 import BeautifulSoup
from random import shuffle 
from collections import defaultdict

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

from nltk import FreqDist

from nltk import classify, NaiveBayesClassifier

from nltk.metrics.scores import precision, recall, f_measure

This corpus has 2000 reviews, with 1000 having the 'pos' label and the remaining 1000 having the 'neg' label.

In [2]:
print ("Total reviews:", len(movie_reviews.fileids())) 
print ("Review categories:", movie_reviews.categories()) 
print ("Total positive reviews:", len(movie_reviews.fileids('pos'))) 
print ("Total negative reviews:", len(movie_reviews.fileids('neg'))) 
positive_review_file = movie_reviews.fileids('neg')[1] 
print ("Sample review file:", positive_review_file) 
#print("\nContent of this sample review is :", movie_reviews.raw(positive_review_file))

Total reviews: 2000
Review categories: ['neg', 'pos']
Total positive reviews: 1000
Total negative reviews: 1000
Sample review file: neg/cv001_19502.txt


#### Preprocessing step

The method below accepts a movie review as an input argument and returns tokens as an output.<br>
Different parts of preprocessing step are: <br>
- removing html tags
- removing sentences between [] or () that were extra explanation
- removing punctuation marks
- removing stop words
- extracting lemma and replace the main words

In [3]:
def clean_text(text):
    # removing the html strips
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # removing the square [] , {} , () , «»
    text = re.sub('\[[^]]*\]', ' ', text)
    text = re.sub('\([^]]*\)', ' ', text)
    text = re.sub('\«[^]]*\»', ' ', text)
    text = re.sub('\{[^]]*\}', ' ', text)
    tokens = word_tokenize(text)
    # Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    # Lower the tokens
    tokens = [word.lower() for word in tokens]
    # Remove stopword
    tokens = [word for word in tokens if not word in stopwords.words("english")]
    # Lemmatize
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos = "v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos = "n") for word in tokens]
    return tokens

In [4]:
pos_review_fileids = []
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    pos_review_fileids.append(fileid)
    fileid_words_clean = clean_text(movie_reviews.raw(fileid))
    pos_reviews.append(fileid_words_clean)

neg_review_fileids = []
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    neg_review_fileids.append(fileid)
    fileid_words_clean = clean_text(movie_reviews.raw(fileid))
    neg_reviews.append(fileid_words_clean)


In [5]:
#print("Raw text of sample review:\n", (movie_reviews.raw(pos_review_fileids[0])))
print("\nBOW of sample review:\n", (pos_reviews[0])) 


BOW of sample review:
 ['film', 'adapt', 'comic', 'book', 'plenty', 'success', 'whether', 'superheroes', 'log', 'great', 'support', 'role', 'big', 'surprise', 'graham', 'cringe', 'first', 'time', 'open', 'mouth', 'imagine', 'attempt', 'irish', 'accent', 'actually', 'half', 'bad', 'film', 'however', 'good', 'r', 'strong', 'sexuality', 'language', 'drug', 'content']


#### Feature Extraction step

In this step, tokens regardless of their places, are assuemd as features.

In [6]:
def bag_of_words(words):
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

In [7]:
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_words(words), 'pos'))

neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_words(words), 'neg'))

print ("\nSample positive review:\n",pos_reviews_set[0])
print ("\nSample negative review:\n", neg_reviews_set[0])


Sample positive review:
 ({'film': True, 'adapt': True, 'comic': True, 'book': True, 'plenty': True, 'success': True, 'whether': True, 'superheroes': True, 'log': True, 'great': True, 'support': True, 'role': True, 'big': True, 'surprise': True, 'graham': True, 'cringe': True, 'first': True, 'time': True, 'open': True, 'mouth': True, 'imagine': True, 'attempt': True, 'irish': True, 'accent': True, 'actually': True, 'half': True, 'bad': True, 'however': True, 'good': True, 'r': True, 'strong': True, 'sexuality': True, 'language': True, 'drug': True, 'content': True}, 'pos')

Sample negative review:
 ({'plot': True, 'two': True, 'teen': True, 'couple': True, 'go': True, 'church': True, 'party': True, 'drink': True, 'drive': True, 'get': True, 'accident': True, 'one': True, 'guy': True, 'die': True, 'girlfriend': True, 'continue': True, 'see': True, 'life': True, 'nightmare': True, 'deal': True, 'watch': True, 'movie': True, 'sorta': True, 'find': True, 'critique': True, 'generation': Tr

#### Training step

The first 1600 words were selected for training, while the final 400 words were chosen for testing.

In [8]:
all_features = pos_reviews_set + neg_reviews_set
shuffle(all_features)

X_all = [list(item[0]) for item in all_features]
y_all = [item[1] for item in all_features]

X_train = X_all[:1600]
y_train = y_all[:1600]
X_test = X_all[1600:]
y_test = y_all[1600:]
print("Train Size:", len(X_train), "\nTest Size:", len(X_test))

Train Size: 1600 
Test Size: 400


A technique for dividing data into test and train sets.

In [9]:
def k_fold_generator(X, y, k_fold):
    subset_size = int(len(X) / k_fold)
    #print("subset_size:", subset_size)
    for k in range(k_fold):
        X_train = X[:k * subset_size] + X[(k + 1) * subset_size:]
        X_valid = X[k * subset_size:][:subset_size]
        y_train = y[:k * subset_size] + y[(k + 1) * subset_size:]
        y_valid = y[k * subset_size:][:subset_size]

        yield X_train, y_train, X_valid, y_valid

The NaiveBayes model was called five times for various portions of the data, with fold=5 being assumed as the input argument for the above technique. <br> In the end, the accuracy was determined by averaging them.

In [10]:
k_fold=5
counter=0
sumAccuracy = 0
for X_fold_train, y_fold_train, X_fold_valid, y_fold_valid in k_fold_generator(X_train, y_train, k_fold):
    #print(len(X_fold_train), len(y_fold_train), len(X_fold_valid), len(y_fold_valid))
    counter += 1
    print("\nIteration : ", counter)
    
    train_features = []
    for i in range(len(X_fold_train)):
        train_features.append((dict([word, True] for word in X_fold_train[i]), y_fold_train[i]))
        
    classifier = NaiveBayesClassifier.train(train_features) 
    classifier.train
    
    test_features = []
    for i in range(len(X_fold_valid)):
        test_features.append((dict([word, True] for word in X_fold_valid[i]), y_fold_valid[i]))  
            
    accuracy = classify.accuracy(classifier, test_features)
    sumAccuracy += accuracy
    print("Accuracy:", accuracy) 

print("\nAverage Accuracy : ", sumAccuracy/k_fold)


Iteration :  1
Accuracy: 0.75625

Iteration :  2
Accuracy: 0.76875

Iteration :  3
Accuracy: 0.775

Iteration :  4
Accuracy: 0.725

Iteration :  5
Accuracy: 0.759375

Average Accuracy :  0.756875


10 features with the most effect were chosen.

In [11]:
print (classifier.show_most_informative_features(10))

Most Informative Features
             outstanding = True              pos : neg    =     14.7 : 1.0
                 cameron = True              pos : neg    =     14.1 : 1.0
                  truman = True              pos : neg    =     10.8 : 1.0
            breathtaking = True              pos : neg    =     10.8 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
                  breast = True              neg : pos    =      9.8 : 1.0
                 justice = True              pos : neg    =      9.5 : 1.0
             wonderfully = True              pos : neg    =      8.8 : 1.0
                 italian = True              pos : neg    =      8.2 : 1.0
                  smooth = True              pos : neg    =      8.2 : 1.0
None


Model evaluation was done using test data.

In [14]:
test_features = []
for i in range(len(X_test)):
    test_features.append((dict([word, True] for word in X_test[i]), y_test[i]))
         
y_pred, gold_labels = defaultdict(set), defaultdict(set)
    
for i, (features, label) in enumerate(test_features):
    y_pred[classifier.classify(features)].add(i)
    gold_labels[label].add(i) 

for label in y_pred:
    print(label, 'Precision:', precision(gold_labels[label], y_pred[label]))
    print(label, 'Recall:', recall(gold_labels[label], y_pred[label]))
    print(label, 'F1-Score:', f_measure(gold_labels[label], y_pred[label]))
accuracy = classify.accuracy(classifier, test_features)
        
print("Accuracy:", accuracy) 

pos Precision: 0.683206106870229
pos Recall: 0.9040404040404041
pos F1-Score: 0.7782608695652175
neg Precision: 0.8623188405797102
neg Recall: 0.5891089108910891
neg F1-Score: 0.7
Accuracy: 0.745
