In [None]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize

In [None]:
# Read each corpus file fully into memory. The context managers close the
# file handles as soon as the text has been read, instead of leaving them
# open for the lifetime of the kernel.
with open("short_reviews/positive.txt", 'r') as pos_file:
    short_pos = pos_file.read()
with open("short_reviews/negative.txt", 'r') as neg_file:
    short_neg = neg_file.read()

In [None]:
# Build (review_text, label) pairs, one per line of each corpus text.
# Blank lines are skipped: splitting text that ends in a newline leaves a
# trailing empty string, which would otherwise be labelled as a review.
documents = []
for label, corpus in (("pos", short_pos), ("neg", short_neg)):
    documents.extend(
        (line, label) for line in corpus.split('\n') if line.strip()
    )

In [None]:
# Tokenise both corpora; each result is the token list for the whole text.
short_pos_words, short_neg_words = map(word_tokenize, (short_pos, short_neg))

# Accumulator for the lower-cased tokens, filled in a later cell.
all_words = list()

In [None]:
# Rebuild the lower-cased token list from scratch (positive tokens first,
# then negative) so that re-running this cell does not append the same
# tokens a second time.
all_words = [
    token.lower()
    for token_list in (short_pos_words, short_neg_words)
    for token in token_list
]

In [None]:
# Collapse the token list into a frequency table. Note that this rebinds
# all_words from a list to a FreqDist.
all_words = nltk.FreqDist(all_words)

# Show the ten most frequent tokens with their counts.
top_tokens = all_words.most_common(10)
print(top_tokens)

In [None]:
# Feature vocabulary: the 5000 most frequent tokens.
# Slicing keys() takes the first 5000 keys in the table's own key order,
# which is not ordered by count (FreqDist is a Counter subclass in NLTK 3),
# so most_common() is used to actually select by frequency.
word_features = [token for token, _count in all_words.most_common(5000)]

In [None]:
def find_features(document):
    """Map a raw review string to a bag-of-words presence dictionary.

    Args:
        document: The review text to tokenise.

    Returns:
        A dict with one key per entry of the module-level ``word_features``
        (in that order); the value is True when the token occurs in
        ``document`` and False otherwise.
    """
    # The vocabulary is built from lower-cased tokens, so the document's
    # tokens are lower-cased too; otherwise capitalised words never match.
    # A set makes each of the membership tests below constant-time.
    present = {token.lower() for token in word_tokenize(document)}
    return {feature: (feature in present) for feature in word_features}

In [None]:
# Pair every review's feature dictionary with its sentiment label.
featuresets = []
for review_text, sentiment in documents:
    featuresets.append((find_features(review_text), sentiment))

In [None]:
# Shuffle in place with a dedicated, seeded generator so that a fresh
# Restart & Run All always produces the same train/test split (and leaves
# the global `random` state untouched).
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(featuresets)

In [None]:
# First 10000 shuffled samples train the models; the remainder is held out.
split_at = 10000
training_set, testing_set = featuresets[:split_at], featuresets[split_at:]

In [None]:
# Train NLTK's naive Bayes classifier and report held-out accuracy in percent.
classifier = nltk.NaiveBayesClassifier.train(training_set)
nb_accuracy = nltk.classify.accuracy(classifier, testing_set)
print(nb_accuracy * 100)

In [None]:
# List the 15 features the naive Bayes model found most informative.
classifier.show_most_informative_features(n=15)

In [None]:
#import pickle

#save = open("naivebayes.pkl", "wb")
#pickle.dump(classifier,save)
#save.close()

In [None]:
#class_file = open("naivebayes.pkl", "rb")
#classifier = pickle.load(class_file)
#class_file.close()

In [None]:
# Re-check the naive Bayes accuracy (percent) on the held-out set.
nb_accuracy_check = nltk.classify.accuracy(classifier, testing_set)
print(nb_accuracy_check * 100)

In [None]:
from nltk.classify.scikitlearn import SklearnClassifier

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [None]:
# Multinomial naive Bayes through the scikit-learn wrapper.
mnbclassifier = SklearnClassifier(MultinomialNB())
mnbclassifier.train(training_set)
mnb_accuracy = nltk.classify.accuracy(mnbclassifier, testing_set)
print(mnb_accuracy * 100)

In [None]:
#gaussclassifier = SklearnClassifier(GaussianNB())
#gaussclassifier.train(training_set)
#print((nltk.classify.accuracy(gaussclassifier, testing_set))*100)

In [None]:
# Bernoulli naive Bayes through the scikit-learn wrapper; train() hands back
# the wrapper itself, so construction and training can be chained.
bernclassifier = SklearnClassifier(BernoulliNB()).train(training_set)
bern_accuracy = nltk.classify.accuracy(bernclassifier, testing_set)
print(bern_accuracy * 100)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [None]:
# Logistic regression through the scikit-learn wrapper.
logisticclassifier = SklearnClassifier(LogisticRegression())
logisticclassifier.train(training_set)
logistic_accuracy = nltk.classify.accuracy(logisticclassifier, testing_set)
print(logistic_accuracy * 100)

In [None]:
# Linear model fitted with stochastic gradient descent (scikit-learn wrapper).
sgdclassifier = SklearnClassifier(SGDClassifier())
sgdclassifier.train(training_set)
sgd_accuracy = nltk.classify.accuracy(sgdclassifier, testing_set)
print(sgd_accuracy * 100)

In [None]:
# Support vector classifier with scikit-learn's default settings.
svcclassifier = SklearnClassifier(SVC())
svcclassifier.train(training_set)
svc_accuracy = nltk.classify.accuracy(svcclassifier, testing_set)
print(svc_accuracy * 100)

In [None]:
# Linear support vector classifier through the scikit-learn wrapper.
linearsvcclassifier = SklearnClassifier(LinearSVC())
linearsvcclassifier.train(training_set)
linearsvc_accuracy = nltk.classify.accuracy(linearsvcclassifier, testing_set)
print(linearsvc_accuracy * 100)

In [None]:
# Nu-parameterised support vector classifier through the scikit-learn wrapper.
nusvcclassifier = SklearnClassifier(NuSVC())
nusvcclassifier.train(training_set)
nusvc_accuracy = nltk.classify.accuracy(nusvcclassifier, testing_set)
print(nusvc_accuracy * 100)

In [None]:
from nltk.classify import ClassifierI
from statistics import mode

In [None]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed group of classifiers.

    Every wrapped classifier must expose ``classify(features)``. The label
    chosen by the most members wins; when several labels tie, the one that
    was voted first (in constructor order) is returned.
    """

    def __init__(self, *classifiers):
        """Store the classifiers whose votes will be combined."""
        self._classifiers = classifiers

    def _tally(self, features):
        """Return ``(winning_label, winning_votes, total_votes)`` for one sample.

        Raises:
            statistics.StatisticsError: If the ensemble has no classifiers.
        """
        # Local imports keep this cell self-contained.
        from collections import Counter
        from statistics import StatisticsError

        votes = [member.classify(features) for member in self._classifiers]
        if not votes:
            # Same exception type statistics.mode raises for empty data.
            raise StatisticsError("no classifiers to collect votes from")
        # Counter.most_common breaks ties by first occurrence, so a tied
        # vote yields a label on every Python version; statistics.mode
        # raises on ties before Python 3.8.
        label, count = Counter(votes).most_common(1)[0]
        return label, count, len(votes)

    def classify(self, features):
        """Return the label that received the most votes for ``features``."""
        return self._tally(features)[0]

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        _label, count, total = self._tally(features)
        return count / total

In [None]:
# Ensemble members, handed to VoteClassifier positionally in this order.
ensemble_members = (
    classifier,
    nusvcclassifier,
    linearsvcclassifier,
    svcclassifier,
    sgdclassifier,
    logisticclassifier,
    bernclassifier,
    mnbclassifier,
)
voted_classifier = VoteClassifier(*ensemble_members)

In [None]:
# Held-out accuracy of the voting ensemble, in percent.
voted_accuracy = nltk.classify.accuracy(voted_classifier, testing_set)
print(voted_accuracy * 100)

In [None]:
# Ensemble label and vote share for held-out sample 0.
sample_features = testing_set[0][0]
print(voted_classifier.classify(sample_features), voted_classifier.confidence(sample_features))

In [None]:
# Ensemble label and vote share for held-out sample 5.
sample_features = testing_set[5][0]
print(voted_classifier.classify(sample_features), voted_classifier.confidence(sample_features))

In [None]:
# Ensemble label and vote share for held-out sample 10.
sample_features = testing_set[10][0]
print(voted_classifier.classify(sample_features), voted_classifier.confidence(sample_features))

In [None]:
# Ensemble label and vote share for held-out sample 6.
sample_features = testing_set[6][0]
print(voted_classifier.classify(sample_features), voted_classifier.confidence(sample_features))