In [1]:
import nltk
import random
from nltk.corpus import movie_reviews

In [8]:
# defining here a list of tuples with first field as word and second its category 
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# so here we print some momvie review
print(documents[1])

(['renee', 'zellweger', 'stars', 'as', 'sonia', ',', 'a', 'young', 'jewish', 'wife', 'and', 'mother', 'frustrated', 'by', 'the', 'constraints', 'of', 'her', 'hasidic', 'community', 'in', 'brooklyn', '.', 'her', 'husband', '(', 'glenn', 'fitzgerald', ')', 'is', 'a', 'religious', 'scholar', 'whose', 'all', '-', 'in', '-', 'a', '-', 'day', "'", 's', '-', 'work', 'attitude', 'on', 'sex', 'fails', 'to', 'tame', 'the', '"', 'fire', '"', 'she', 'feels', 'within', ',', 'as', 'so', 'she', 'confesses', 'to', 'the', 'rebbe', '(', 'after', 'hearing', 'her', 'fiery', 'confession', ',', 'the', 'rebbe', 'suddenly', 'gets', 'frisky', 'with', 'his', 'pleasantly', 'surprised', 'wife', '--', 'and', 'dies', 'the', 'next', 'morning', ')', '.', 'sensing', 'her', 'frustration', ',', 'her', 'husband', "'", 's', 'brother', '(', 'christopher', 'eccleston', ')', 'gives', 'her', 'a', 'job', 'in', 'his', 'jewelry', 'brokering', 'business', 'in', 'exchange', 'for', 'raw', ',', 'passionless', 'sex', 'that', 'just', 

In [9]:
# all_words just all words in the set and convert to lowercase
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

#stores the words with most frequencies
all_words = nltk.FreqDist(all_words)

#to print how many times top 15 appeared words appear
print(all_words.most_common(15))

#to specifically find occurence of a word
print(all_words["brilliant"])

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
176


In [15]:
# lists out upto the limit given
word_features = list(all_words.keys())[:3000]
#print(word_features)

def find_features(document):
    words = set(document)     #all the unique words listed out here
    features = {}
    
    # here in this loop we basically check wether the words in word_features are there in words and return boolean
    for w in word_features:
        features[w] = (w in words)

    return features

#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
print(featuresets[1])



In [22]:
# naive bayes algo is  a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature. 
#For example, a fruit may be considered to be an apple if it is red, round, and about 3 inches in diameter. 
#Even if these features depend on each other or upon the existence of the other features, all of these properties independently contribute to the probability that this fruit is an apple and that is why it is known as ‘Naive’.

# set that we'll train our classifier with
training_set = featuresets[:1800]

# set that we'll test against.
testing_set = featuresets[1800:]

# since this algo is really basic we can scale this algo to really large proportions
# it is mathematically the bayes theorem for probability

# training the data
classifier = nltk.NaiveBayesClassifier.train(training_set)

# testing the accuracy of trained data
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

# prints 15 most valuable word in the classification
classifier.show_most_informative_features(15)
# basically it means like "sucks" appear 9.4 times in negative review than in positive review
# here accuracy isnt very high as compared to some other algos

Classifier accuracy percent: 79.5
Most Informative Features
                   sucks = True              neg : pos    =      9.4 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.8 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
               pregnancy = True              neg : pos    =      6.3 : 1.0
               atrocious = True              neg : pos    =      6.2 : 1.0
                  turkey = True              neg : pos    =      5.9 : 1.0
                  stinks = True              neg : pos    =      5.8 : 1.0
                  suvari = True              neg : pos    =      5.7 : 1.0
            surveillance = True         

In [None]:
# # saving classifier using pickle
# save_classifier = open("naivebayes.pickle","wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()

# # loading from classifier
# classifier_f = open("naivebayes.pickle", "rb")
# classifier = pickle.load(classifier_f)
# classifier_f.close()

In [23]:
# this one acts like api for using scikit learn in nltk
from nltk.classify.scikitlearn import SklearnClassifier

# MultinomialNB is multinomial distribution not a binary distribution
# BernoulliNB is also another implementation
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

# MultinomialNB
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

# BernoulliNB
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.81
BernoulliNB accuracy percent: 0.79


In [24]:
# this 2 are classifiers
from sklearn.linear_model import LogisticRegression,SGDClassifier

# SVC is support vector classifier, Linear is just Linear and Nu is we can specify number of support vectors 
from sklearn.svm import SVC, LinearSVC, NuSVC


print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 79.5
Most Informative Features
                   sucks = True              neg : pos    =      9.4 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.8 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
               pregnancy = True              neg : pos    =      6.3 : 1.0
               atrocious = True              neg : pos    =      6.2 : 1.0
                  turkey = True              neg : pos    =      5.9 : 1.0
                  stinks = True              neg : pos    =      5.8 : 1.0
                  suvari = True              neg : pos    =      5.7 : 1.0
            surveillance 



SGDClassifier_classifier accuracy percent: 77.5
SVC_classifier accuracy percent: 71.5
LinearSVC_classifier accuracy percent: 77.0
NuSVC_classifier accuracy percent: 79.5


In [26]:
# vote classifier we just saw all these classifier now and this can be used to give a voting to classify a group to get more accuracy

# this is so we can inherit from the nltk classifier class
from nltk.classify import ClassifierI

# this helps us choose the number of votes
from statistics import mode

class VoteClassifier(ClassifierI):
    #passing a list of classifier to this classifier
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    
    def classify(self, features):
        votes = []
        # what happens here is that we go through the classifier and classify by features and append to existing votes
        # then return max occuring one ie mode
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes)) # how much popular that vote is
        conf = choice_votes / len(votes)        #confidence
        return conf

In [27]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent: 81.5
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 71.42857142857143
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
