In [33]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random

In [34]:
def create_features(words):
    useful_words=[word for word in words if word not in stopwords.words('english')]
    feature_words=dict([(word,True)for word in useful_words])
    return feature_words

neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words=movie_reviews.words(fileid)
    neg_reviews.append((create_features(words), "negative"))
pos_reviews = []
for fileis in movie_reviews.fileids('pos'):
    words=movie_reviews.words(fileid)
    pos_reviews.append((create_features(words), "positive"))


In [12]:
#first tokenized negative review 
print(neg_reviews[0])
#first tokenized positive review
print(pos_reviews[0])

({'two': True, 'party': True, 'guys': True, 'bob': True, 'heads': True, 'haddaway': True, "'": True, 'dance': True, 'hit': True, '"': True, 'love': True, '?': True, 'getting': True, 'trouble': True, 'nightclub': True, '.': True, 'barely': True, 'enough': True, 'sustain': True, 'three': True, '-': True, 'minute': True, '_saturday_night_live_': True, 'skit': True, ',': True, '_snl_': True, 'producer': True, 'lorne': True, 'michaels': True, '_clueless_': True, 'creator': True, 'amy': True, 'heckerling': True, 'paramount': True, 'pictures': True, 'saw': True, 'something': True, 'late': True, 'night': True, 'television': True, 'institution': True, 'recurring': True, 'roxbury': True, 'sketch': True, 'would': True, 'presumably': True, 'make': True, 'good': True, 'feature': True, 'emphasis': True, 'word': True, '_a_night_at_the_roxbury_': True, 'takes': True, 'already': True, 'thin': True, 'concept': True, 'tediously': True, 'stretches': True, 'far': True, 'beyond': True, 'breaking': True, 'po

In [46]:
#training set is first 750 reviews from both categories
train_set = neg_reviews[:900] + pos_reviews[:900]
#testing set is rest 250 reviews from both categories
test_set =  neg_reviews[900:] + pos_reviews[900:]
#shuffling to avoid bias


In [47]:
#Naive Bayes Classifier
classifier=NaiveBayesClassifier.train(train_set)
accuracy=nltk.classify.util.accuracy(classifier,test_set)


In [48]:
accuracy

0.995

In [50]:
review_spirit = '''-'''
featurized_doc = {c:True for c in review_spirit.split()}
print(featurized_doc)
classifier.classify(featurized_doc)
#words = word_tokenize(review_spirit)
#words = create_word_features(words)
#classifier.classify(words)


{'Spirited': True, "Away'": True, 'is': True, 'the': True, 'first': True, 'Miyazaki': True, 'I': True, 'have': True, 'seen,': True, 'but': True, 'from': True, 'this': True, 'stupendous': True, 'film': True, 'can': True, 'tell': True, 'he': True, 'a': True, 'master': True, 'storyteller.': True, 'A': True, 'hallmark': True, 'of': True, 'good': True, 'storyteller': True, 'making': True, 'audience': True, 'empathise': True, 'or': True, 'pull': True, 'them': True, 'into': True, 'shoes': True, 'central': True, 'character.': True, 'does': True, 'brilliantly': True, 'in': True, "'Spirited": True, "Away'.": True, 'During': True, 'fifteen': True, 'minutes': True, 'we': True, 'no': True, 'idea': True, 'what': True, 'going': True, 'on.': True, 'Neither': True, 'main': True, 'character': True, 'Chihiro.': True, 'We': True, 'discover': True, 'world': True, 'as': True, 'Chihiro': True, 'and': True, "it's": True, 'truly': True, 'amazing': True, 'to': True, 'watch.': True, 'But': True, "doesn't": True,

'negative'

In [1]:
import nltk
import random
import pickle
from nltk.corpus import movie_reviews
from os.path import exists
from nltk.classify import apply_features
from nltk.tokenize import word_tokenize, sent_tokenize

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]



In [2]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())
#print(word_features)

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features



In [3]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]
numtrain = int(len(documents) * 90 / 100)
training_set = apply_features(find_features, documents[:numtrain])
testing_set = apply_features(find_features, documents[numtrain:])

classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier.show_most_informative_features(15)

Most Informative Features
               insulting = True              neg : pos    =     14.7 : 1.0
                  avoids = True              pos : neg    =     14.6 : 1.0
              astounding = True              pos : neg    =     14.6 : 1.0
             fascination = True              pos : neg    =     13.7 : 1.0
               ludicrous = True              neg : pos    =     12.3 : 1.0
                   mulan = True              pos : neg    =     11.2 : 1.0
                  annual = True              pos : neg    =     11.2 : 1.0
                  hatred = True              pos : neg    =     11.2 : 1.0
             outstanding = True              pos : neg    =     11.0 : 1.0
             uninvolving = True              neg : pos    =     10.9 : 1.0
                  seagal = True              neg : pos    =     10.9 : 1.0
                 idiotic = True              neg : pos    =     10.7 : 1.0
                 studies = True              pos : neg    =     10.4 : 1.0

In [None]:
-