In [1]:
import nltk
from nltk import word_tokenize
import random
from nltk.corpus import movie_reviews
import pickle



In [2]:
documents = [(list(movie_reviews.words(fileid)), category) 
             for category in movie_reviews.categories() 
             for fileid in movie_reviews.fileids(category)]

In [3]:
random.shuffle(documents)

In [4]:
documents[0][1]

'neg'

In [5]:
all_words = [w.lower() for w in movie_reviews.words()]

In [6]:
print(len(all_words))
all_words = nltk.FreqDist(all_words)
print(len(all_words))

1583820
39768


In [7]:
all_words.most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [8]:
word_features = list(all_words.keys())[:3000]

In [9]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    
    return features

In [10]:
find_features(movie_reviews.words("neg/cv000_29416.txt"))
featureset = [(find_features(rev), category) for (rev, category) in documents]

In [11]:
training_set = featureset[:1900]
testing_set = featureset[1900:]

In [12]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [13]:
print("Accuracy: ",nltk.classify.accuracy(classifier, testing_set)*100)

Accuracy:  83.0


In [14]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =     16.4 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                 frances = True              pos : neg    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                    mena = True              neg : pos    =      6.4 : 1.0
               pregnancy = True              neg : pos    =      6.4 : 1.0
                  shoddy = True              neg : pos    =      6.4 : 1.0
                  suvari = True              neg : pos    =      6.4 : 1.0
                 cunning = True              pos : neg    =      6.3 : 1.0

In [15]:
save_classifier = open("models/naivebayes.pickle", mode="wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [16]:
classifier_f = open("models/naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
print("Accuracy: ",nltk.classify.accuracy(classifier, testing_set)*100)

Accuracy:  83.0


In [17]:
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB


In [18]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(MNB_classifier, testing_set) * 100)

Accuracy:  82.0


In [19]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(BernoulliNB_classifier, testing_set) * 100)

Accuracy:  82.0


In [20]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [21]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(max_iter=10000))
LogisticRegression_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(LogisticRegression_classifier, testing_set) * 100)

Accuracy:  81.0


In [22]:
"""
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(SGDClassifier_classifier, testing_set) * 100)
"""

'\nSGDClassifier_classifier = SklearnClassifier(SGDClassifier())\nSGDClassifier_classifier.train(training_set)\nprint("Accuracy: ", nltk.classify.accuracy(SGDClassifier_classifier, testing_set) * 100)\n'

In [23]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(SVC_classifier, testing_set) * 100)

Accuracy:  83.0


In [24]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(LinearSVC_classifier, testing_set) * 100)

Accuracy:  80.0


In [25]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(NuSVC_classifier, testing_set) * 100)

Accuracy:  83.0


In [26]:
from nltk.classify import ClassifierI
from statistics import mode

In [27]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        
        for c in self._classifiers:
            
            v = c.classify(features)
            votes.append(v)
            
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            
        choice_votes = votes.count(mode(votes))
        conf = choice_votes/len(votes)
        return conf
        

In [28]:
voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, 
                                    SVC_classifier, LinearSVC_classifier, NuSVC_classifier)

In [29]:
print(voted_classifier.classify(testing_set[0][0]), voted_classifier.confidence(testing_set[0][0]))
print(voted_classifier.classify(testing_set[1][0]), voted_classifier.confidence(testing_set[1][0]))
print(voted_classifier.classify(testing_set[2][0]), voted_classifier.confidence(testing_set[2][0]))
print(voted_classifier.classify(testing_set[3][0]), voted_classifier.confidence(testing_set[3][0]))
print(voted_classifier.classify(testing_set[4][0]), voted_classifier.confidence(testing_set[4][0]))

neg 0.7142857142857143
pos 1.0
neg 1.0
pos 1.0
neg 1.0


In [30]:
print("Accuracy: ", nltk.classify.accuracy(voted_classifier, testing_set))

Accuracy:  0.85


In [31]:
print(testing_set[0][1])
print(testing_set[1][1])
print(testing_set[2][1])
print(testing_set[3][1])
print(testing_set[4][1])

neg
pos
pos
pos
neg


In [32]:
import io

In [33]:
short_pos = open("short_reviews/positive.txt","r").read()
short_neg = open("short_reviews/negative.txt","r").read()

In [34]:
documents = []

for i in short_pos.split("\n"):
    documents.append((i,"pos"))
    
for i in short_neg.split("\n"):
    documents.append((i,"neg"))
    
len(documents)

10664

In [35]:
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

In [36]:
all_words = [i.lower() for i in short_pos_words]
all_words = all_words + [i.lower() for i in short_neg_words]

In [37]:
all_words = nltk.FreqDist(all_words)

In [38]:
word_features = list(all_words.keys())[:5000]

In [39]:
word_features[1]

'rock'

In [40]:
def find_features(documents):
    words = word_tokenize(documents)
    features = {}
    for i in words:
        features[i] = (i in word_features)
        
    return features

In [41]:
featureset = [(find_features(rev), category) for (rev, category) in documents]

In [42]:
training_set = featureset[:10000]
testing_set = featureset[10000:]

In [43]:
len(documents)

10664

In [44]:
 len(featureset)

10664

In [45]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [46]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(MNB_classifier, testing_set) * 100)

Accuracy:  72.28915662650603


In [47]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(BernoulliNB_classifier, testing_set) * 100)

Accuracy:  78.3132530120482


In [48]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [49]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(max_iter=10000))
LogisticRegression_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(LogisticRegression_classifier, testing_set) * 100)

Accuracy:  78.01204819277109


In [50]:
"""
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(SGDClassifier_classifier, testing_set) * 100)
"""

'\nSGDClassifier_classifier = SklearnClassifier(SGDClassifier())\nSGDClassifier_classifier.train(training_set)\nprint("Accuracy: ", nltk.classify.accuracy(SGDClassifier_classifier, testing_set) * 100)\n'

In [51]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(SVC_classifier, testing_set) * 100)

Accuracy:  72.89156626506023


In [52]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(LinearSVC_classifier, testing_set) * 100)

Accuracy:  76.95783132530121


In [53]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(NuSVC_classifier, testing_set) * 100)

Accuracy:  76.05421686746988


In [54]:
voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, 
                                    SVC_classifier, LinearSVC_classifier, NuSVC_classifier)

In [55]:
print(voted_classifier.classify(testing_set[0][0]), voted_classifier.confidence(testing_set[0][0]))
print(voted_classifier.classify(testing_set[1][0]), voted_classifier.confidence(testing_set[1][0]))
print(voted_classifier.classify(testing_set[2][0]), voted_classifier.confidence(testing_set[2][0]))
print(voted_classifier.classify(testing_set[3][0]), voted_classifier.confidence(testing_set[3][0]))
print(voted_classifier.classify(testing_set[4][0]), voted_classifier.confidence(testing_set[4][0]))

pos 1.0
neg 0.7142857142857143
neg 1.0
pos 1.0
neg 0.5714285714285714


In [56]:
save_classifier = open("models/voted_classifier_naivebayes.pickle", mode="wb")
pickle.dump(voted_classifier, save_classifier)
save_classifier.close()

In [54]:
documents = []
all_words = []

In [50]:
allowed_word_types = ["J"]

for p in short_pos.split("\n"):
    documents.append((p,"pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

In [51]:
allowed_word_types = ["J"]

for p in short_neg.split("\n"):
    documents.append((p,"neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

In [66]:
def sentiments(text):
    fets = find_features(text)
    
    return voted_classifier.classify(fets)

In [53]:
nltk.pos_tag(all_words[:10])

[('21st', 'CD'),
 ('new', 'JJ'),
 ('conan', 'NN'),
 ('greater', 'JJR'),
 ('jean-claud', 'NN'),
 ('steven', 'NN'),
 ('elaborate', 'VBP'),
 ('huge', 'JJ'),
 ('expanded', 'VBD'),
 ('effective', 'JJ')]