In [12]:
import nltk
import random
from nltk.corpus import movie_reviews

In [13]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [14]:
random.shuffle(documents)

In [15]:
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

In [16]:
all_words = nltk.FreqDist(all_words)

In [17]:
word_features = list(all_words.keys())[:3000]

In [18]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [19]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [20]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [21]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [22]:
# First we just simply are invoking the Naive Bayes classifier, then we go ahead
# and use .train() to train it all in one line.

# Easy enough, now it is trained. Next, we can test it:

print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)


Classifier accuracy percent: 76.0


In [23]:
# Next, we can take it a step further to see what the most valuable words are 
# when it comes to positive or negative reviews:

classifier.show_most_informative_features(15)

Most Informative Features
                 idiotic = True              neg : pos    =     13.1 : 1.0
                   sucks = True              neg : pos    =     10.7 : 1.0
                  justin = True              neg : pos    =      9.1 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                  alicia = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
              schumacher = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                  turkey = True              neg : pos    =      6.8 : 1.0
               atrocious = True              neg : pos    =      6.7 : 1.0