# Sentiment Classifier for movie_reviews and twitter_samples dataset

### I have used the bag-of-words method to train the classifiers: a fixed number of words from each dataset are turned into features, and every instance gets a True/False value per word to indicate the presence/absence of that word in it. I chose 3000 words for most of the classifiers after removing stop words and punctuation. Further, I tried to increase the accuracy of the classifiers by using the 3000 most common words to create the features.

In [None]:
import random
from collections import Counter
from statistics import mode

import nltk
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

## movie_reviews Dataset

###### Storing reviews in a list as a list of words

In [None]:
# One (token_list, label) pair per corpus file, grouped by category in the
# order the corpus reader lists categories and file ids.
reviews = [
    (list(movie_reviews.words(doc_id)), label)
    for label in movie_reviews.categories()
    for doc_id in movie_reviews.fileids(label)
]

In [None]:
# Preview the first and last (tokens, label) pairs. The last one is addressed
# with -1 so the cell does not depend on the corpus holding exactly 2000 reviews.
print("First Review")
print(reviews[0])
print("\n\n")
print("Last Review")
print(reviews[-1])

First Review
(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'goo

In [None]:
#Shuffling reviews so that classifier doesn't classify based on order
# `reviews` was built category by category, so without this in-place shuffle
# the later positional train/test slices would not mix the labels.
# NOTE(review): no random seed is set in the visible code, so the split (and
# the reported accuracies) will presumably differ between runs.
random.shuffle(reviews)

In [None]:
#Extracting all words from all the reviews, removing stop words and punctuation
punct = {'.', '!', '(', ')', '[', ']', '{', '}', '<', '>', ':', ';', '-', ',',
         '\'', '"', '?'}

# English stop words plus punctuation tokens, kept as a set so each of the
# corpus-sized membership tests below is O(1) instead of a list scan.
stop_lookup = set(stopwords.words('english')) | punct

# Lower-case every corpus token and keep only those not in the exclusion set.
all_words = [
    w
    for w in (token.lower() for token in movie_reviews.words())
    if w not in stop_lookup
]

# `stop_words` is still left behind as a list, as this cell always did.
stop_words = list(stop_lookup)


In [None]:
#Creating frequency distribution of all_words
# Counts how often each remaining (non-stop-word) token occurs in the corpus.
all_words1 = nltk.FreqDist(all_words)
# Bare expression: shows the distribution as the cell's output.
all_words1

FreqDist({'plot': 1513,
          'two': 1911,
          'teen': 151,
          'couples': 27,
          'go': 1113,
          'church': 69,
          'party': 183,
          'drink': 32,
          'drive': 105,
          'get': 1949,
          'accident': 104,
          'one': 5852,
          'guys': 268,
          'dies': 104,
          'girlfriend': 218,
          'continues': 88,
          'see': 1749,
          'life': 1586,
          'nightmares': 26,
          'deal': 219,
          'watch': 603,
          'movie': 5771,
          'sorta': 10,
          'find': 782,
          'critique': 61,
          'mind': 451,
          'fuck': 17,
          'generation': 96,
          'touches': 55,
          'cool': 208,
          'idea': 386,
          'presents': 78,
          'bad': 1395,
          'package': 30,
          'makes': 992,
          'review': 295,
          'even': 2565,
          'harder': 33,
          'write': 119,
          'since': 768,
          'generally': 103,
   

In [None]:
#Taking first 3000 words from all_words1 for training classifier
# NOTE: this slices the distribution's keys in their stored order, not by
# count -- presumably first-seen order, as the displayed FreqDist suggests.
word_features = list(all_words1.keys())[:3000]

#Function to create features that indicate whether particular word is present in review or not
def find_features(review, vocabulary=None):
    """Map a tokenised review to bag-of-words presence features.

    Args:
        review: Iterable of word tokens making up one review.
        vocabulary: Iterable of words to emit a feature for. When omitted,
            the module-level ``word_features`` is used (looked up at call
            time), which is the original one-argument behaviour.

    Returns:
        dict mapping every vocabulary word to True if it occurs in
        ``review`` and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each per-word membership test O(1).
    present = set(review)
    return {word: word in present for word in vocabulary}

# Creating featuresets using above function: one (feature_dict, label) pair
# per review, in the current (shuffled) order of `reviews`.
featuresets = [(find_features(tokens), label) for tokens, label in reviews]


In [None]:
# Splitting data: the first 1800 feature sets train the models, the rest test them.
training, testing = featuresets[:1800], featuresets[1800:]

In [None]:
#Naive Bayes Classifier

# Fit NLTK's Naive Bayes model on the training feature sets.
nbclassifier = nltk.NaiveBayesClassifier.train(training)
# The printed accuracy is measured on the held-out `testing` slice.
print("Naive Bayes Classifier Accuracy: ", nltk.classify.accuracy(nbclassifier, testing))
# Print the 10 features the model ranks as most informative.
nbclassifier.show_most_informative_features(10)


Naive Bayes Classifier Accuracy:  0.82
Most Informative Features
                  alicia = True              neg : pos    =     11.1 : 1.0
                  regard = True              pos : neg    =     10.3 : 1.0
                   sucks = True              neg : pos    =      9.9 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
                bothered = True              neg : pos    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                   groan = True              neg : pos    =      7.7 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                  crappy = True              neg : pos    =      7.0 : 1.0
                  turkey = True              neg : pos    =      7.0 : 1.0


In [None]:
# Logistic regression wrapped for NLTK; train() hands back the fitted wrapper,
# so construction and fitting are a single expression.
LR_classifier = SklearnClassifier(LogisticRegression()).train(training)
print("LR_classifier accuracy:", nltk.classify.accuracy(LR_classifier, testing))


LR_classifier accuracy: 0.8


In [None]:
# Support vector classifier with scikit-learn's default settings, fitted on
# the training split and scored on the held-out split.
SVC_classifier = SklearnClassifier(SVC()).train(training)
print("SVC_classifier accuracy: ", nltk.classify.accuracy(SVC_classifier, testing))


SVC_classifier accuracy:  0.785


In [None]:
# Linear SVC classifier, fitted on the training split and scored on the
# held-out split.
LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training)
print("LinearSVC_classifier accuracy:", nltk.classify.accuracy(LinearSVC_classifier, testing))


LinearSVC_classifier accuracy: 0.8


In [None]:
# Decision tree with no depth limit, fitted on the training split and scored
# on the held-out split.
DT_classifier = SklearnClassifier(DecisionTreeClassifier()).train(training)
print("DT_classifier accuracy:", nltk.classify.accuracy(DT_classifier, testing))


DT_classifier accuracy: 0.655


In [None]:
# Decision tree capped at depth 3, for comparison with the unrestricted tree.
DT_classifier1 = SklearnClassifier(DecisionTreeClassifier(max_depth=3)).train(training)
print("DT_classifier1 accuracy:", nltk.classify.accuracy(DT_classifier1, testing))

DT_classifier1 accuracy: 0.655


In [None]:
# Decision tree capped at depth 5.
DT_classifier2 = SklearnClassifier(DecisionTreeClassifier(max_depth=5)).train(training)
print("DT_classifier2 accuracy:", nltk.classify.accuracy(DT_classifier2, testing))

DT_classifier2 accuracy: 0.69


In [None]:
# Creating a voted classifier which takes the majority vote of the above
# classifiers to classify reviews

class VoteClassifier(ClassifierI):
    """Ensemble that labels an instance with its members' most common vote.

    Every member classifier casts one vote via ``classify(features)``. When
    several labels tie for the most votes, the tied label that was voted
    first (in constructor order) wins, so a tie never raises.
    """

    def __init__(self, *classifiers):
        """Store the member classifiers.

        Args:
            *classifiers: Trained classifiers exposing ``classify(features)``.

        Raises:
            ValueError: If no classifier is given, because there would be no
                votes to count.
        """
        if not classifiers:
            raise ValueError("VoteClassifier needs at least one classifier")
        self._classifiers = classifiers

    def _tally(self, features):
        """Return a Counter mapping each predicted label to its vote count."""
        return Counter(c.classify(features) for c in self._classifiers)

    def classify(self, features):
        """Return the label that received the most votes for ``features``."""
        winner, _ = self._tally(features).most_common(1)[0]
        return winner

    def confidence(self, features):
        """Return the winning label's share of all votes, in (0, 1]."""
        _, winning_votes = self._tally(features).most_common(1)[0]
        return winning_votes / len(self._classifiers)


In [None]:
#Voted classifier with all of the above classifiers
# Five voters; of the decision trees only the depth-5 one (DT_classifier2) is included.
voted_classifier = VoteClassifier(nbclassifier,
                                  LR_classifier,
                                  SVC_classifier,
                                  LinearSVC_classifier,
                                  DT_classifier2)

# NOTE: despite the "percent" label, the printed accuracy is a fraction
# between 0 and 1 (see the recorded output).
print("voted_classifier accuracy percent:", nltk.classify.accuracy(voted_classifier, testing))
print("Prediction for first instance in testing")
# testing[0][0] is the feature dict of the first held-out instance.
print("Classification:", voted_classifier.classify(testing[0][0]), "Confidence:",voted_classifier.confidence(testing[0][0]))


voted_classifier accuracy percent: 0.83
Prediction for first instance in testing
Classification: neg Confidence: 0.8


In [None]:
#Voted classifier with some of the above classifiers
# Rebinds `voted_classifier` to a three-voter ensemble.
voted_classifier = VoteClassifier(nbclassifier,
                                  LR_classifier,
                                  SVC_classifier)

# NOTE: despite the "percent" label, the printed accuracy is a fraction
# between 0 and 1 (see the recorded output).
print("voted_classifier accuracy percent:", nltk.classify.accuracy(voted_classifier, testing))
print("Prediction for first instance in testing")
# testing[0][0] is the feature dict of the first held-out instance.
print("Classification:", voted_classifier.classify(testing[0][0]), "Confidence:",voted_classifier.confidence(testing[0][0]))


voted_classifier accuracy percent: 0.83
Prediction for first instance in testing
Classification: neg Confidence: 0.6666666666666666


In [None]:
import pandas as pd

In [None]:
# One-row summary of held-out accuracy per model family for movie_reviews.
# The 'SVM' entry reports the LinearSVC model and the 'Decision Tree' entry
# reports DT_classifier2 (the depth-5 tree at this point in the notebook).
accuracy_table = {'Datasets' : 'movie_reviews',
                'Naive Bayes': nltk.classify.accuracy(nbclassifier, testing),
                'SVM': nltk.classify.accuracy(LinearSVC_classifier, testing),
                'Decision Tree': nltk.classify.accuracy(DT_classifier2, testing),
                'Logistic Regression': nltk.classify.accuracy(LR_classifier, testing)}

In [None]:
# Turn the summary dict into a single-row DataFrame, columns in dict order.
accuracy_table = pd.DataFrame([accuracy_table], columns=list(accuracy_table))

In [None]:
# Bare expression: renders the movie_reviews accuracy table as the cell output.
accuracy_table

Unnamed: 0,Datasets,Naive Bayes,SVM,Decision Tree,Logistic Regression
0,movie_reviews,0.82,0.8,0.69,0.8


###### Trying to increase accuracy by taking most common 3000 words as features

In [None]:
# Vocabulary = the 3000 most frequent words, most frequent first.
# most_common() yields (word, count) pairs; only the word is kept.
word_features = [word for word, _count in all_words1.most_common(3000)]

#Function to create features that indicate whether particular word is present in review or not
def find_features(review, vocabulary=None):
    """Map a tokenised review to bag-of-words presence features.

    Args:
        review: Iterable of word tokens making up one review.
        vocabulary: Iterable of words to emit a feature for. When omitted,
            the module-level ``word_features`` is used (looked up at call
            time), which is the original one-argument behaviour.

    Returns:
        dict mapping every vocabulary word to True if it occurs in
        ``review`` and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each per-word membership test O(1).
    present = set(review)
    return {word: word in present for word in vocabulary}

# Creating featuresets using above function, now against the most-common
# vocabulary, then re-splitting at the same 1800 boundary as before.
featuresets = [(find_features(tokens), label) for tokens, label in reviews]

training, testing = featuresets[:1800], featuresets[1800:]

In [None]:
#Naive Bayes Classifier

# Refit Naive Bayes on the feature sets built from the most-common vocabulary.
nbclassifier = nltk.NaiveBayesClassifier.train(training)
print("Naive Bayes Classifier Accuracy: ", nltk.classify.accuracy(nbclassifier, testing))
# Print the 30 features the model ranks as most informative.
nbclassifier.show_most_informative_features(30)


Naive Bayes Classifier Accuracy:  0.79
Most Informative Features
                  seagal = True              neg : pos    =     13.1 : 1.0
                  alicia = True              neg : pos    =     11.1 : 1.0
             outstanding = True              pos : neg    =     10.0 : 1.0
                  finest = True              pos : neg    =      9.8 : 1.0
                   mulan = True              pos : neg    =      7.6 : 1.0
             beautifully = True              pos : neg    =      7.6 : 1.0
            breathtaking = True              pos : neg    =      7.6 : 1.0
                  tucker = True              pos : neg    =      7.4 : 1.0
                  prinze = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.8 : 1.0
                   anger = True              pos : neg    =      6.7 : 1.0
             wonderfully = True    

In [None]:
# Logistic regression refitted on the most-common-word features.
LR_classifier = SklearnClassifier(LogisticRegression()).train(training)
print("LR_classifier accuracy:", nltk.classify.accuracy(LR_classifier, testing))


LR_classifier accuracy: 0.86


In [None]:
# Default-settings SVC refitted on the most-common-word features.
SVC_classifier = SklearnClassifier(SVC()).train(training)
print("SVC_classifier accuracy: ", nltk.classify.accuracy(SVC_classifier, testing))


SVC_classifier accuracy:  0.775


In [None]:
# Linear SVC classifier refitted on the most-common-word features.
LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training)
print("LinearSVC_classifier accuracy:", nltk.classify.accuracy(LinearSVC_classifier, testing))


LinearSVC_classifier accuracy: 0.835


In [None]:
# Unrestricted decision tree refitted on the most-common-word features.
DT_classifier = SklearnClassifier(DecisionTreeClassifier()).train(training)
print("DT_classifier accuracy:", nltk.classify.accuracy(DT_classifier, testing))


DT_classifier accuracy: 0.695


In [None]:
# Decision tree with maximum depth 3.
# NOTE: this rebinds DT_classifier2, the name that earlier held the depth-5 tree.
DT_classifier2 = SklearnClassifier(DecisionTreeClassifier(max_depth=3)).train(training)
print("DT_classifier2 accuracy:", nltk.classify.accuracy(DT_classifier2, testing))

DT_classifier2 accuracy: 0.66


###### The accuracy does not increase greatly, and the most informative features for nbclassifier include quite a few proper nouns, so this classifier won't generalize well. Thus, I am not making changes to the accuracy table for movie_reviews.

In [None]:
# Re-display the table computed before the most-common-words experiment;
# it is not recomputed in this cell.
print("Accuracy Table for movie_reviews Data")
accuracy_table

Accuracy Table for movie_reviews Data


Unnamed: 0,Datasets,Naive Bayes,SVM,Decision Tree,Logistic Regression
0,movie_reviews,0.82,0.8,0.69,0.8


# Twitter Samples

In [None]:
from nltk.corpus import twitter_samples as ts 


In [None]:
#Processing and storing tweets
# Load each labelled file by name instead of slicing the concatenation of
# every corpus file: the labels then no longer depend on file ordering, and
# the unlabelled tweets file is never tokenised.
neg_tweets = ts.tokenized('negative_tweets.json')
pos_tweets = ts.tokenized('positive_tweets.json')

# All labelled tweets, negatives first (same layout as before).
tweets = neg_tweets + pos_tweets


###### Following same procedure as done with movie_reviews to create features

In [None]:
# Unique tokens across all tweets. Iterating the tweets directly (instead of
# range(len(...) - 1)) no longer drops the last tweet or the last token of
# every tweet.
all_words = {token for tweet in tweets for token in tweet}

punct = {'.', '!', '(', ')', '[', ']', '{', '}', '<', '>', ':', ';', '-', ',',
         '\'', '"', '?'}

# English stop words plus punctuation, as a set for O(1) membership tests.
stop_lookup = set(stopwords.words('english')) | punct
all_words = [w for w in all_words if w not in stop_lookup]

# `stop_words` is still left behind as a list, as this cell always did.
stop_words = list(stop_lookup)

# NOTE: the tokens were de-duplicated above, so every count here is 1 and the
# key order simply follows the (arbitrary) iteration order of the set.
all_words1 = nltk.FreqDist(all_words)

word_features = list(all_words1.keys())[:3000]


def find_features(tweet, vocabulary=None):
    """Map a tokenised tweet to bag-of-words presence features.

    Args:
        tweet: Iterable of tokens making up one tweet.
        vocabulary: Iterable of words to emit a feature for. When omitted,
            the module-level ``word_features`` is used (looked up at call
            time), which is the original one-argument behaviour.

    Returns:
        dict mapping every vocabulary word to True if it occurs in ``tweet``
        and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each per-word membership test O(1).
    present = set(tweet)
    return {word: word in present for word in vocabulary}

# Label every negative tweet first, then every positive tweet.
featuresets = [(find_features(tokens), 'neg') for tokens in neg_tweets]
featuresets.extend((find_features(tokens), 'pos') for tokens in pos_tweets)

# Mix the two classes before the positional split.
random.shuffle(featuresets)

# First 7000 feature sets train the models, the rest test them.
training, testing = featuresets[:7000], featuresets[7000:]


In [None]:
#Naive Bayes Classifier
# Fit Naive Bayes on the tweet feature sets and score it on the held-out slice.
nbclassifier = nltk.NaiveBayesClassifier.train(training)
print("Naive Bayes Classifier Accuracy: ", nltk.classify.accuracy(nbclassifier, testing))
# Print the 10 features the model ranks as most informative.
nbclassifier.show_most_informative_features(10)



Naive Bayes Classifier Accuracy:  0.6446666666666667
Most Informative Features
                 arrived = True              pos : neg    =     13.6 : 1.0
               community = True              pos : neg    =     13.0 : 1.0
           @justinbieber = True              neg : pos    =     11.4 : 1.0
                     via = True              pos : neg    =     11.4 : 1.0
              bestfriend = True              pos : neg    =      9.6 : 1.0
                   great = True              pos : neg    =      8.7 : 1.0
                    damn = True              neg : pos    =      8.3 : 1.0
                    Glad = True              pos : neg    =      7.7 : 1.0
                   tired = True              neg : pos    =      6.8 : 1.0
                    WANT = True              neg : pos    =      6.3 : 1.0


In [None]:
# Logistic regression fitted on the tweet feature sets.
LR_classifier = SklearnClassifier(LogisticRegression()).train(training)
print("LR_classifier accuracy:", nltk.classify.accuracy(LR_classifier, testing))


LR_classifier accuracy: 0.645


In [None]:
# Default-settings SVC fitted on the tweet feature sets.
SVC_classifier = SklearnClassifier(SVC()).train(training)
print("SVC_classifier accuracy: ", nltk.classify.accuracy(SVC_classifier, testing))


SVC_classifier accuracy:  0.49933333333333335


In [None]:
# Unrestricted decision tree fitted on the tweet feature sets.
DT_classifier = SklearnClassifier(DecisionTreeClassifier()).train(training)
print("DT_classifier accuracy:", nltk.classify.accuracy(DT_classifier, testing))


DT_classifier accuracy: 0.6403333333333333


###### Accuracies look pretty bad for classifiers trained on first 3000 words

### Trying to improve accuracy of model on twitter_dataset by training on 3000 most common words in the dataset

In [None]:
#Following same procedure as done with movie_reviews to create features

punct = {'.', '!', '(', ')', '[', ']', '{', '}', '<', '>', ':', ';', '-', ',',
         '\'', '"', '?'}

# English stop words plus punctuation, as a set for O(1) membership tests.
stop_lookup = set(stopwords.words('english')) | punct

# Every token of every tweet, minus the excluded ones. Iterating the tweets
# directly (instead of range(len(...) - 1)) no longer drops the last tweet or
# the last token of every tweet, so the counts below cover the full data.
# NOTE(review): emoticon tokens such as ':(' and ':)' are not in `punct`, so
# they stay in the vocabulary -- confirm that this is intended.
all_words = [
    token
    for tweet in tweets
    for token in tweet
    if token not in stop_lookup
]

# `stop_words` is still left behind as a list, as this cell always did.
stop_words = list(stop_lookup)

all_words1 = nltk.FreqDist(all_words)

# most_common() yields (word, count) pairs; keep only the 3000 words.
word_features = [word for word, _count in all_words1.most_common(3000)]

def find_features(tweet, vocabulary=None):
    """Map a tokenised tweet to bag-of-words presence features.

    Args:
        tweet: Iterable of tokens making up one tweet.
        vocabulary: Iterable of words to emit a feature for. When omitted,
            the module-level ``word_features`` is used (looked up at call
            time), which is the original one-argument behaviour.

    Returns:
        dict mapping every vocabulary word to True if it occurs in ``tweet``
        and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each per-word membership test O(1).
    present = set(tweet)
    return {word: word in present for word in vocabulary}

# Label every negative tweet first, then every positive tweet.
featuresets = [(find_features(tokens), 'neg') for tokens in neg_tweets]
featuresets.extend((find_features(tokens), 'pos') for tokens in pos_tweets)

# Mix the two classes before the positional split.
random.shuffle(featuresets)

# First 7000 feature sets train the models, the rest test them.
training, testing = featuresets[:7000], featuresets[7000:]

In [None]:
#Naive Bayes Classifier
nbclassifier = nltk.NaiveBayesClassifier.train(training)
# acc1 is kept so the twitter accuracy table can reuse it later.
acc1 = nltk.classify.accuracy(nbclassifier, testing)
print("Naive Bayes Classifier Accuracy: ", acc1)
# Print the 10 features the model ranks as most informative.
nbclassifier.show_most_informative_features(10)

Naive Bayes Classifier Accuracy:  0.9966666666666667
Most Informative Features
                      :( = True              neg : pos    =   2078.1 : 1.0
                      :) = True              pos : neg    =   1641.7 : 1.0
                     sad = True              neg : pos    =     29.9 : 1.0
                     See = True              pos : neg    =     29.8 : 1.0
                    miss = True              neg : pos    =     28.8 : 1.0
                  THANKS = True              neg : pos    =     24.1 : 1.0
                  FOLLOW = True              neg : pos    =     22.7 : 1.0
                   Thank = True              pos : neg    =     22.2 : 1.0
                 arrived = True              pos : neg    =     19.8 : 1.0
                     x15 = True              neg : pos    =     19.3 : 1.0


In [None]:
# Same ranking as in the previous cell, extended to the top 50 features.
nbclassifier.show_most_informative_features(50)

Most Informative Features
                      :( = True              neg : pos    =   2078.1 : 1.0
                      :) = True              pos : neg    =   1641.7 : 1.0
                     sad = True              neg : pos    =     29.9 : 1.0
                     See = True              pos : neg    =     29.8 : 1.0
                    miss = True              neg : pos    =     28.8 : 1.0
                  THANKS = True              neg : pos    =     24.1 : 1.0
                  FOLLOW = True              neg : pos    =     22.7 : 1.0
                   Thank = True              pos : neg    =     22.2 : 1.0
                 arrived = True              pos : neg    =     19.8 : 1.0
                     x15 = True              neg : pos    =     19.3 : 1.0
                  Thanks = True              pos : neg    =     17.2 : 1.0
                 welcome = True              pos : neg    =     15.5 : 1.0
                   loves = True              pos : neg    =     15.5 : 1.0

In [None]:
# Logistic regression on the most-common-word tweet features; acc2 is kept
# for the twitter accuracy table.
LR_classifier = SklearnClassifier(LogisticRegression()).train(training)
acc2 = nltk.classify.accuracy(LR_classifier, testing)
print("LR_classifier accuracy:", acc2)

LR_classifier accuracy: 0.996


In [None]:
# Default-settings SVC on the most-common-word tweet features; acc3 is kept
# for the twitter accuracy table.
SVC_classifier = SklearnClassifier(SVC()).train(training)
acc3 = nltk.classify.accuracy(SVC_classifier, testing)
print("SVC_classifier accuracy: ", acc3)


SVC_classifier accuracy:  0.9896666666666667


In [None]:
# Unrestricted decision tree on the most-common-word tweet features; acc4 is
# kept for the twitter accuracy table.
DT_classifier = SklearnClassifier(DecisionTreeClassifier()).train(training)
acc4 = nltk.classify.accuracy(DT_classifier, testing)
print("DT_classifier accuracy:", acc4)

DT_classifier accuracy: 0.995


In [None]:
# One-row summary of the twitter accuracies measured above.
# The 'SVM' entry here is the SVC score (acc3).
acc_tab = {
    'Datasets': 'twitter_dataset',
    'Naive Bayes': acc1,
    'SVM': acc3,
    'Decision Tree': acc4,
    'Logistic Regression': acc2,
}

# Single-row DataFrame with columns in dict order.
acc_tab = pd.DataFrame([acc_tab], columns=list(acc_tab))

# Final Output

In [None]:
# Final summary: display the movie_reviews accuracy table built earlier.
print("Accuracy table for movie_reviews data")
accuracy_table

Accuracy table for movie_reviews data


Unnamed: 0,Datasets,Naive Bayes,SVM,Decision Tree,Logistic Regression
0,movie_reviews,0.82,0.8,0.69,0.8


In [None]:
# Final summary: display the twitter accuracy table built from acc1..acc4.
print("Accuracy table for twitter_samples data")
acc_tab

Accuracy table for twitter_samples data


Unnamed: 0,Datasets,Naive Bayes,SVM,Decision Tree,Logistic Regression
0,twitter_dataset,0.996667,0.989667,0.995,0.996
