# Sentiment Analysis using NLTK

### The goal here is to predict the sentiments behind movie reviews.

In [71]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import shuffle
from nltk.corpus import movie_reviews as mr
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords

## Preparing Data

We will convert the **CategorizedPlaintextCorpusReader** data into **python list**

In [72]:
reviews = []
for fileid in mr.fileids():
    tag, filename = fileid.split('/')
    reviews.append([mr.raw(fileid), tag])

In [73]:
len(reviews)

2000

In our dataset first 1000 reviews are negative and later are positives. So we will shuffle the data using **shuffle**

In [74]:
shuffle(reviews)

In [75]:
reviews[1768]

['one year has passed since the last time we saw them , and but wayne campbell and garth algar are back but , they\'ve moved out of their parents homes , and now have their own pad in an abandoned warehouse . \nwayne\'s still hopelessly infatuated with his girlfriend , cassandra , and of course garth is still having his own , personal growing pains . \nthey guys are just finishing off one of their shows , and head off to an aerosmith concert where they run across cassandra and her new manager , a stereotypical industry slime not unlike the character rob lowe portrayed in the previous film . \nagain , the promoter is not only after the money that cassandra\'s talent can bring in . \nthe storyline quickly degrades to a parallel of the original , " guy loses chick , guy realizes mistake , and guy gets chick back . " \nfortunately , there is more to this movie than this somewhat sheer plot . \nthe wit and humor in this film are surprisingly intelligent and fresh with many inside jokes on t

Now we divide the dataset into train and test. 

In [76]:
train=reviews[:1500]
test=reviews[1500:]

We will create a vocabulary for each review and use it to get unigram features from the data

In [77]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in train for x in tagged_sent[0].split()]
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
#print(vocabulary)

44131


In [78]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

In [79]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

In [80]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [81]:
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [82]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [83]:
def real_time_test(classifier,vocab):
    print("Enter a sentence: ")
    inp = input()
    feat_vec_uni = get_unigram_features(inp,vocab)
    feat_vec_swn =get_senti_wordnet_features(test_data)
    feat_vec = merge_features(feat_vec_uni, feat_vec_swn)

    predict = classifier.predict(feat_vec)
    if predict[0]==1:
        print("The sentiment expressed is: positive")
    else:
        print("The sentiment expressed is: negative") 

In [84]:
training_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_labels = get_lables(train)

test_features = get_unigram_features(test,vocabulary)
test_gold_labels = get_lables(test)

In [85]:
training_features[0]

[0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [86]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB().fit(training_features,training_labels) #training process

print("Precision of NB classifier is")
predictions = nb_classifier.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = nb_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of NB classifier is
Training data	0.9813333333333333
Test data	0.83
