# Sentiment Analysis using NLTK Part-1

### The goal here is to predict the sentiments behind movie reviews.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import shuffle
from nltk.corpus import movie_reviews as mr
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords

## Preparing Data

We will convert the **CategorizedPlaintextCorpusReader** data into **python list**

In [2]:
reviews = []
for fileid in mr.fileids():
    tag, filename = fileid.split('/')
    reviews.append([mr.raw(fileid), tag])

In [3]:
len(reviews)

2000

In our dataset first 1000 reviews are negative and later are positives. So we will shuffle the data using **shuffle**

In [4]:
shuffle(reviews)

In [5]:
reviews[1768]

['tom dicillo directs this superficial comedy about superficial people in superficial careers , all searching for deeper meaning . \nhowever , they won\'t find much meaning in the real blonde , and not enough real humor , either . \njoe ( matthew modine ) is a struggling actor , or he claims to be one , even though he has no credits under his belt . \nhis girlfriend , mary ( catherine keener ) has no pretensions about her career : she\'s a makeup artist , working for the eccentric fashion photographer blair ( marlo thomas ) and supermodel of the month , sahara ( bridgette wilson ) . \nsahara , who has acquired a new age spirituality from repeated viewing of the little mermaid , has an on-again , off-again relationship with joe\'s best friend , bob ( maxwell caulfield ) . \nhowever , bob has gotten his biggest break yet : a starring role on a soap opera opposite the beautiful kelly ( daryl hannah ) , who may be that illusive woman he\'s always pursued : a real blonde . \nthere are some 

Now we divide the dataset into train and test. 

In [6]:
train=reviews[:1500]
test=reviews[1500:]

We will create a vocabulary for each review and use it to get unigram features from the data

In [7]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in train for x in tagged_sent[0].split()]
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
#print(vocabulary)

44646


In [8]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

In [9]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

In [10]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [11]:
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [12]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [13]:
def real_time_test(classifier,vocab):
    print("Enter a sentence: ")
    inp = input()
    feat_vec_uni = get_unigram_features(inp,vocab)
    feat_vec_swn =get_senti_wordnet_features(test_data)
    feat_vec = merge_features(feat_vec_uni, feat_vec_swn)

    predict = classifier.predict(feat_vec)
    if predict[0]==1:
        print("The sentiment expressed is: positive")
    else:
        print("The sentiment expressed is: negative") 

In [14]:
training_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_labels = get_lables(train)

test_features = get_unigram_features(test,vocabulary)
test_gold_labels = get_lables(test)

In [15]:
#training_features[0]

In [19]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(alpha=0.1).fit(training_features,training_labels) #training process

print("Precision of NB classifier is")
predictions = nb_classifier.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = nb_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of NB classifier is
Training data	0.9946666666666667
Test data	0.8


In [23]:
from sklearn.svm import SVC
clf = SVC(C=0.01, kernel='sigmoid').fit(training_features,training_labels)
print("Precision of SVM classifier is")
predictions = clf.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = clf.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of SVM classifier is
Training data	0.5066666666666667
Test data	0.48


In [24]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, max_features='sqrt', random_state=42).fit(training_features,training_labels)
print("Precision of Decision Tree classifier is")
predictions = clf.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = clf.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of Decision Tree classifier is
Training data	0.64
Test data	0.574


In [25]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42).fit(training_features,training_labels)
print("Precision of Logistic Regression is")
predictions = clf.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = clf.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of Logistic Regression is
Training data	1.0
Test data	0.846


# Sentiment Analysis using NLTK Part-2

### The goal here is to predict the sentiments behind tweets.

In [68]:
from nltk.corpus import twitter_samples as ts
import json

In [69]:
mypath= 'C:/Users/Sociopath/AppData/Roaming/nltk_data/corpora/twitter_samples/'
pos = []
for line in open (mypath + r'positive_tweets.json', 'r'):
    pos.append(json.loads(line))

neg = []
for line in open (mypath + r'negative_tweets.json', 'r'):
    neg.append(json.loads(line))

In [70]:
len(pos)

5000

In [71]:
#colname = ["created_at", "favorite_count", "retweet_count", "id", "place", "coordinates", "geo", "text"]

df0 = []
for dic in pos:
    ll = []
    ll.append(dic['text'])
    ll.append("pos")
    df0.append(ll)

df1 = []
for dic in neg:
    ll = []
    ll.append(dic['text'])
    ll.append("neg")
    df1.append(ll)

In [74]:
df0.extend(df1)

In [77]:
df=df0
len(df)

10000

In [78]:
shuffle(df)

In [80]:
train=df[:7000]
test=df[7000:]

In [81]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in train for x in tagged_sent[0].split()]
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
#print(vocabulary)

19642


In [82]:
training_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_labels = get_lables(train)

test_features = get_unigram_features(test,vocabulary)
test_gold_labels = get_lables(test)

In [83]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(alpha=0.1).fit(training_features,training_labels) #training process

print("Precision of NB classifier is")
predictions = nb_classifier.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = nb_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of NB classifier is
Training data	0.9997142857142857
Test data	0.9893333333333333


In [84]:
from sklearn.svm import SVC
clf = SVC(C=0.01, kernel='sigmoid').fit(training_features,training_labels)
print("Precision of SVM classifier is")
predictions = clf.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = clf.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of SVM classifier is
Training data	0.5007142857142857
Test data	0.49833333333333335


In [85]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, max_features='sqrt', random_state=42).fit(training_features,training_labels)
print("Precision of Decision Tree classifier is")
predictions = clf.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = clf.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of Decision Tree classifier is
Training data	0.5511428571428572
Test data	0.5616666666666666


In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42).fit(training_features,training_labels)
print("Precision of Logistic Regression is")
predictions = clf.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = clf.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

In [None]:
print('''
(Dataset)        (Naive Bayes)    (SVM)   (Decision-tree)   (Logistic-Regression) 

movie_review        {}              {}          {}               {}

twitter_dataset     {}              {}          {}               {}

'''*.format())