#### Despite coming packed with some classifiers, NLTK is mainly a toolkit focused on natural language processing, and not machine learning specifically. 
#### A module that is focused on machine learning is scikit-learn, which is packed with a large array of machine learning algorithms which are optimized in C. 

#### Luckily, NLTK has recognized this and comes packaged with a special classifier that wraps around scikit-learn. It lives in the module nltk.classify.scikitlearn; specifically, the class SklearnClassifier is what we're interested in.
#### This allows us to port over any of the scikit-learn classifiers that are compatible, which is most of them.

### Here we save the trained models so that later runs can load them instead of retraining

In [2]:
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
    """Ensemble classifier that labels a sample by majority vote.

    Every member classifier must provide ``classify(features)``. When two
    labels receive the same number of votes, the label that was voted for
    first (in the order the classifiers were passed in) wins, so the result
    is deterministic on every Python version.
    """

    def __init__(self, *classifiers):
        """Store the member classifiers.

        Args:
            *classifiers: one or more trained classifiers to poll.

        Raises:
            ValueError: if no classifier is supplied, because an empty
                ensemble can neither vote nor report a confidence.
        """
        if not classifiers:
            raise ValueError("VoteClassifier requires at least one classifier")
        self._classifiers = classifiers

    def _tally(self, features):
        """Poll every member once.

        Returns:
            A ``(winning_label, winning_votes, total_votes)`` tuple.
        """
        # Imported here so the class works even when only this cell is run.
        from collections import Counter

        votes = Counter(c.classify(features) for c in self._classifiers)
        # most_common keeps first-seen order among equal counts, which gives
        # the tie-breaking rule described in the class docstring.
        label, count = votes.most_common(1)[0]
        return label, count, sum(votes.values())

    def classify(self, features):
        """Return the label chosen by the most member classifiers."""
        return self._tally(features)[0]

    def confidence(self, features):
        """Return the fraction of members (0-1) that voted for the winning label."""
        _, winning_votes, total_votes = self._tally(features)
        return winning_votes / total_votes

# Pair each review's token list with the category it is filed under.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# The list above is grouped category by category, so shuffle it to mix the
# labels before the data is split into training and testing portions.
random.shuffle(documents)

# Frequency distribution over every lower-cased token in the corpus.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())

# The 3000 most frequent words form the feature vocabulary.
word_features = [word for word, _count in all_words.most_common(3000)]

# function to find words which are present in a document
def find_features(document, vocabulary=None):
    """Build a bag-of-words presence feature dict for one document.

    Args:
        document: iterable of word tokens making up the document.
        vocabulary: iterable of words to use as feature keys. When omitted,
            the module-level ``word_features`` list is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        dict mapping every vocabulary word to ``True`` if it occurs in
        ``document`` and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test O(1) instead of scanning the document.
    present = set(document)
    return {word: (word in present) for word in vocabulary}

# Convert every (tokens, label) document into a (feature dict, label) pair.
featureset = [(find_features(tokens), label) for tokens, label in documents]

# The first 1900 shuffled examples train the models; the rest are held out.
training_set = featureset[:1900]
testing_set = featureset[1900:]

# The NLTK Naive Bayes model was trained and pickled in an earlier run with:
#     classifier = nltk.NaiveBayesClassifier.train(training_set)
# NOTE(review): documents are reshuffled on every run, so this testing_set can
# contain reviews the pickled model was trained on; the reported accuracy may
# therefore be optimistic.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open("naivebayes.pickle", 'rb') as classifier_f:
    # The context manager closes the file even if unpickling fails.
    classifier = pickle.load(classifier_f)


print("Original Naive bayes Accuracy is : ", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

def _train_and_save(label, estimator, pickle_path):
    """Train a scikit-learn estimator through NLTK, report accuracy, pickle it.

    Args:
        label: name printed in front of the accuracy figure.
        estimator: unfitted scikit-learn estimator to wrap in SklearnClassifier.
        pickle_path: file the trained wrapper is written to.

    Returns:
        The trained SklearnClassifier wrapper.
    """
    model = SklearnClassifier(estimator)
    model.train(training_set)
    print(f"{label} Accuracy is : ", (nltk.classify.accuracy(model, testing_set))*100)
    # The context manager closes the file even if pickling fails.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(model, pickle_file)
    return model


MNB_classifier = _train_and_save("MNB_classifier", MultinomialNB(), "MultinomialNB.pickle")

# The Bernoulli model keeps the historical file name "BinomialNB.pickle"
# because the loading code further down opens it under that name.
BNB_classifier = _train_and_save("BNB_classifier", BernoulliNB(), "BinomialNB.pickle")

LR_classifier = _train_and_save("LR_classifier", LogisticRegression(), "LogisticRegression.pickle")

SGD_classifier = _train_and_save("SGD_classifier", SGDClassifier(), "StochasticGD.pickle")

SVC_classifier = _train_and_save("SVC_classifier", SVC(), "SVC.pickle")

LinearSVC_classifier = _train_and_save("LinearSVC_classifier", LinearSVC(), "LinearSVC.pickle")



# Combine all seven trained models into one majority-vote ensemble.
voted_classifier = VoteClassifier(
    LinearSVC_classifier, SVC_classifier, SGD_classifier,
    BNB_classifier, MNB_classifier, classifier, LR_classifier,
)
ensemble_accuracy = nltk.classify.accuracy(voted_classifier, testing_set)
print("voted_classifier Accuracy is : ", ensemble_accuracy*100)

# Show the ensemble's label and vote share for the first five held-out reviews.
for sample_index in range(5):
    sample_features = testing_set[sample_index][0]
    print("Classification : ", voted_classifier.classify(sample_features), "Confidence % : ",
          voted_classifier.confidence(sample_features)*100)

Original Naive bayes Accuracy is :  85.0
Most Informative Features
             outstanding = True              pos : neg    =     11.2 : 1.0
                   mulan = True              pos : neg    =      8.4 : 1.0
                   damon = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.8 : 1.0
                  finest = True              pos : neg    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.4 : 1.0
             wonderfully = True              pos : neg    =      7.3 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                   inept = True              neg : pos    =      6.1 : 1.0
                   jolie = True              neg : pos    =      5.8 : 1.0
                 flubber = True              neg : pos    =      5.6 : 1.0
                    lame = True              neg : pos    =      5.6 : 1.0
                   waste = True  



SGD_classifier Accuracy is :  78.0
SVC_classifier Accuracy is :  76.0
LinearSVC_classifier Accuracy is :  82.0
voted_classifier Accuracy is :  81.0
Classification :  neg Confidence % :  85.71428571428571
Classification :  neg Confidence % :  100.0
Classification :  neg Confidence % :  100.0
Classification :  pos Confidence % :  71.42857142857143
Classification :  neg Confidence % :  100.0


#### Now that we have many classifiers, what if we created a new classifier that combined the votes of all of them and then classified the text as whatever the majority vote was?
#### It turns out that doing this is super easy. NLTK has considered this in advance, allowing us to inherit from their ClassifierI class from nltk.classify, which gives us the attributes of a classifier while still letting us write our own custom classifier code.

In [3]:
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
    """Ensemble classifier that labels a sample by majority vote.

    Every member classifier must provide ``classify(features)``. When two
    labels receive the same number of votes, the label that was voted for
    first (in the order the classifiers were passed in) wins, so the result
    is deterministic on every Python version.
    """

    def __init__(self, *classifiers):
        """Store the member classifiers.

        Args:
            *classifiers: one or more trained classifiers to poll.

        Raises:
            ValueError: if no classifier is supplied, because an empty
                ensemble can neither vote nor report a confidence.
        """
        if not classifiers:
            raise ValueError("VoteClassifier requires at least one classifier")
        self._classifiers = classifiers

    def _tally(self, features):
        """Poll every member once.

        Returns:
            A ``(winning_label, winning_votes, total_votes)`` tuple.
        """
        # Imported here so the class works even when only this cell is run.
        from collections import Counter

        votes = Counter(c.classify(features) for c in self._classifiers)
        # most_common keeps first-seen order among equal counts, which gives
        # the tie-breaking rule described in the class docstring.
        label, count = votes.most_common(1)[0]
        return label, count, sum(votes.values())

    def classify(self, features):
        """Return the label chosen by the most member classifiers."""
        return self._tally(features)[0]

    def confidence(self, features):
        """Return the fraction of members (0-1) that voted for the winning label."""
        _, winning_votes, total_votes = self._tally(features)
        return winning_votes / total_votes

# Build one (token list, category) pair per review file in the corpus.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Reviews arrive grouped by category; shuffling mixes the labels so the
# later train/test slices both contain every category.
random.shuffle(documents)

# Count how often each lower-cased token appears across the whole corpus.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())

# Keep only the 3000 most common words as features.
word_features = [word for word, _count in all_words.most_common(3000)]

# function to find words which are present in a document
def find_features(document, vocabulary=None):
    """Build a bag-of-words presence feature dict for one document.

    Args:
        document: iterable of word tokens making up the document.
        vocabulary: iterable of words to use as feature keys. When omitted,
            the module-level ``word_features`` list is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        dict mapping every vocabulary word to ``True`` if it occurs in
        ``document`` and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test O(1) instead of scanning the document.
    present = set(document)
    return {word: (word in present) for word in vocabulary}

# Convert every (tokens, label) document into a (feature dict, label) pair.
featureset = [(find_features(tokens), label) for tokens, label in documents]

# The first 1900 shuffled examples form the training split; the rest are held out.
training_set = featureset[:1900]
testing_set = featureset[1900:]

# The NLTK Naive Bayes model was trained and pickled in an earlier run with:
#     classifier = nltk.NaiveBayesClassifier.train(training_set)
# NOTE(review): documents are reshuffled on every run, so this testing_set can
# contain reviews the pickled model was trained on; the reported accuracy may
# therefore be optimistic.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open("naivebayes.pickle", 'rb') as classifier_f:
    # The context manager closes the file even if unpickling fails.
    classifier = pickle.load(classifier_f)


print("Original Naive bayes Accuracy is : ", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

def _load_and_score(label, pickle_path):
    """Load a pickled classifier and print its accuracy on ``testing_set``.

    Args:
        label: name printed in front of the accuracy figure.
        pickle_path: pickle file holding a previously trained classifier.

    Returns:
        The unpickled classifier.
    """
    # Only unpickle files you created yourself: pickle can execute arbitrary code.
    with open(pickle_path, 'rb') as pickle_file:
        # The context manager closes the file even if unpickling fails.
        model = pickle.load(pickle_file)
    print(f"{label} Accuracy is : ", (nltk.classify.accuracy(model, testing_set))*100)
    return model


# Each model below was produced earlier by
#     SklearnClassifier(<estimator>()).train(training_set)
# with MultinomialNB, BernoulliNB, LogisticRegression, SGDClassifier, SVC and
# LinearSVC respectively; loading the pickles avoids retraining.
# NOTE(review): the data is reshuffled every run, so testing_set may overlap
# the data these pickled models were trained on.
MNB_classifier = _load_and_score("MNB_classifier", "MultinomialNB.pickle")

BNB_classifier = _load_and_score("BNB_classifier", "BinomialNB.pickle")

LR_classifier = _load_and_score("LR_classifier", "LogisticRegression.pickle")

SGD_classifier = _load_and_score("SGD_classifier", "StochasticGD.pickle")

SVC_classifier = _load_and_score("SVC_classifier", "SVC.pickle")

LinearSVC_classifier = _load_and_score("LinearSVC_classifier", "LinearSVC.pickle")



# Build the majority-vote ensemble from the seven loaded models.
voted_classifier = VoteClassifier(
    LinearSVC_classifier, SVC_classifier, SGD_classifier,
    BNB_classifier, MNB_classifier, classifier, LR_classifier,
)
ensemble_accuracy = nltk.classify.accuracy(voted_classifier, testing_set)
print("voted_classifier Accuracy is : ", ensemble_accuracy*100)

# Print the ensemble's label and vote share for held-out reviews 0 through 4.
for sample_index in range(5):
    sample_features = testing_set[sample_index][0]
    print("Classification : ", voted_classifier.classify(sample_features), "Confidence % : ",
          voted_classifier.confidence(sample_features)*100)

Original Naive bayes Accuracy is :  95.0
Most Informative Features
             outstanding = True              pos : neg    =     11.2 : 1.0
                   mulan = True              pos : neg    =      8.4 : 1.0
                   damon = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.8 : 1.0
                  finest = True              pos : neg    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.4 : 1.0
             wonderfully = True              pos : neg    =      7.3 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                   inept = True              neg : pos    =      6.1 : 1.0
                   jolie = True              neg : pos    =      5.8 : 1.0
                 flubber = True              neg : pos    =      5.6 : 1.0
                    lame = True              neg : pos    =      5.6 : 1.0
                   waste = True  