In [4]:
#Import libraries & movie_review data
import nltk
from nltk.corpus import movie_reviews
import random

In [6]:
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [8]:
#This dataset contains 2000 movie reviews manually labeled by `pos' (positive review) or
`neg' (negative review).labels = [x[1] for x in documents]
print ("Number of positive reviews: " +
        str(len([x for x in labels if x == 'pos'])))
print ("Number of negative reviews: " +
        str(len([x for x in labels if x == 'neg'])))

Number of positive reviews: 1000
Number of negative reviews: 1000


In [9]:
#Now, We choose the 2000 most frequent words and define the function that generates binary
#feature vector x based on these 2000 words for each document.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

In [10]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [12]:
from itertools import islice
feature = document_features(['batmans', 'inquires', 'rags'])
for key in islice(feature, 10):
    print( key + ': ' + str(feature[key]))

contains(plot): False
contains(:): False
contains(two): False
contains(teen): False
contains(couples): False
contains(go): False
contains(to): False
contains(a): False
contains(church): False
contains(party): False


In [14]:
#Now split the data into train and test sets. Lets allocate 1900 documents for training set 
#and 100 documents to the test set.
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
print("Size of train set is: " + str(len(train_set)))
print("Size of test set is: " + str(len(test_set)))

Size of train set is: 1900
Size of test set is: 100


In [15]:
#Training the model using Naive Bayes and calculating the accuracy
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Accuracy: " + str(nltk.classify.accuracy(classifier, test_set)))


Accuracy: 0.83


In [16]:
#We can also get the most informative features
classifier.show_most_informative_features(10)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      8.3 : 1.0
          contains(mena) = True              neg : pos    =      7.0 : 1.0
     contains(atrocious) = True              neg : pos    =      7.0 : 1.0
        contains(suvari) = True              neg : pos    =      7.0 : 1.0
    contains(schumacher) = True              neg : pos    =      7.0 : 1.0
        contains(turkey) = True              neg : pos    =      6.5 : 1.0
       contains(singers) = True              pos : neg    =      6.4 : 1.0
        contains(shoddy) = True              neg : pos    =      6.3 : 1.0
        contains(justin) = True              neg : pos    =      5.8 : 1.0
           contains(ugh) = True              neg : pos    =      5.8 : 1.0
