In [1]:
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
import random
import pickle
from statistics import mode
from nltk.classify import ClassifierI
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC

In [2]:
class VotedClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for classifier in self._classifiers:
            v = classifier.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for classifier in self._classifiers:
            v = classifier.classify(features)
            votes.append(v)
        majority = votes.count(mode(votes))
        return majority/len(votes)

In [3]:
positive = open('positive.txt', 'r').read()
negative = open('negative.txt', 'r').read()

documents = []
all_words = []
allowed_word_types = ['J']

for r in positive.split('\n'):
    documents.append((r, 'pos'))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
            
for r in negative.split('\n'):
    documents.append((r, 'neg'))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

In [4]:
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

feature_sets = [(find_features(review), category) for (review, category) in documents]
random.shuffle(feature_sets)

train_set = feature_sets[10000:]
test_set = feature_sets[:10000]

In [5]:
# positive = open('positive.txt', 'r').read()
# negative = open('negative.txt', 'r').read()

# documents = []
# for review in positive.split('\n'):
#     documents.append((review, 'pos'))

# for review in negative.split('\n'):
#     documents.append((review, 'neg'))

# all_words = []
# positive_words = word_tokenize(positive)
# negative_words = word_tokenize(negative)

# for word in positive_words:
#     all_words.append(word.lower())

# for word in negative_words:
#     all_words.append(word.lower())

# all_words = nltk.FreqDist(all_words)

In [6]:
# word_features = list(all_words.keys())[:5000]

# def find_features(document):
#     words = word_tokenize(document)
#     features = {}
#     for w in word_features:
#         features[w] = (w in words)
#     return features

# feature_sets = [(find_features(review), category) for (review, category) in documents]
# random.shuffle(feature_sets)

# train_set = feature_sets[10000:]
# test_set = feature_sets[:10000]

In [7]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(f"Original Naive Bayes Accuracy: {nltk.classify.accuracy(classifier, test_set)*100}")

Original Naive Bayes Accuracy: 62.12


In [8]:
mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(train_set)
print(f"Multinomial Naive Bayes Accuracy: {nltk.classify.accuracy(mnb_classifier, test_set)*100}")

Multinomial Naive Bayes Accuracy: 62.08


In [9]:
bnb_classifier = SklearnClassifier(BernoulliNB())
bnb_classifier.train(train_set)
print(f"Bernoulli Naive Bayes Accuracy: {nltk.classify.accuracy(bnb_classifier, test_set)*100}")

Bernoulli Naive Bayes Accuracy: 58.39


In [10]:
lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(train_set)
print(f"Logistic Regression Accuracy: {nltk.classify.accuracy(lr_classifier, test_set)*100}")

Logistic Regression Accuracy: 61.870000000000005


In [11]:
sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(train_set)
print(f"SGD Classifier Accuracy: {nltk.classify.accuracy(sgd_classifier, test_set)*100}")

SGD Classifier Accuracy: 59.01


In [12]:
svm_classifier = SklearnClassifier(LinearSVC())
svm_classifier.train(train_set)
print(f"Linear SVC Accuracy: {nltk.classify.accuracy(svm_classifier, test_set)*100}")

Linear SVC Accuracy: 59.93000000000001


In [13]:
voted_classifier = VotedClassifier(classifier,
                                   mnb_classifier,
                                   bnb_classifier,
                                   lr_classifier,
                                   svm_classifier)

print(f"Voted Classifier Accuracy: {nltk.classify.accuracy(voted_classifier, test_set)*100}")

Voted Classifier Accuracy: 61.809999999999995


In [14]:
save_docs = open('documents.pickle', 'wb')
pickle.dump(documents, save_docs)
save_docs.close()

In [15]:
save_features = open('features.pickle', 'wb')
pickle.dump(word_features, save_features)
save_features.close()

In [16]:
save_features = open('feature_sets.pickle', 'wb')
pickle.dump(feature_sets, save_features)
save_features.close()

In [17]:
save_classifier = open('original_nb.pickle', 'wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [18]:
save_classifier = open('mnb_classifier.pickle', 'wb')
pickle.dump(mnb_classifier, save_classifier)
save_classifier.close()

In [19]:
save_classifier = open('bnb_classifier.pickle', 'wb')
pickle.dump(bnb_classifier, save_classifier)
save_classifier.close()

In [20]:
save_classifier = open('lr_classifier.pickle', 'wb')
pickle.dump(lr_classifier, save_classifier)
save_classifier.close()

In [21]:
save_classifier = open('sgd_classifier.pickle', 'wb')
pickle.dump(sgd_classifier, save_classifier)
save_classifier.close()

In [22]:
save_classifier = open('svm_classifier.pickle', 'wb')
pickle.dump(svm_classifier, save_classifier)
save_classifier.close()