## Movie reviews corpus from nltk corpora

In [3]:
import nltk

from nltk.corpus import movie_reviews

# Show the README bundled with the corpus. The parenthesised single-argument
# form prints the same text on Python 2 and is also valid on Python 3.
print(movie_reviews.readme())

Sentiment Polarity Dataset Version 2.0
Bo Pang and Lillian Lee

http://www.cs.cornell.edu/people/pabo/movie-review-data/

Distributed with NLTK with permission from the authors.


Introduction

This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .


What's New -- June, 2004

This dataset represents an enhancement of the review corpus v1.0
described in README v1.1: it contains more reviews, and labels were
created with an improved rating-extraction system.


Citation Info 

This data was first used in Bo Pang and Lillian Lee,
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts'',  Proceedings of the ACL, 2004.

@InProceedings{Pang+Lee:04a,
  author =       {Bo Pang and Lillian Lee},
  title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle =    "Proceedings of the ACL",
  year =      

## Overview of corpus and files

In [4]:
# Category labels defined by the corpus
movie_reviews.categories()

[u'neg', u'pos']

In [6]:
# Every file id in the corpus; the slice displays the entries at indices 1-5
# (the entry at index 0 is not shown).
files = movie_reviews.fileids()
files[1:6]

[u'neg/cv001_19502.txt',
 u'neg/cv002_17424.txt',
 u'neg/cv003_12683.txt',
 u'neg/cv004_12641.txt',
 u'neg/cv005_29357.txt']

In [8]:
# Pick one file id (index 20) to inspect
files[20]

u'neg/cv020_9234.txt'

In [9]:
# Category label(s) attached to that single file
movie_reviews.categories(files[20])

[u'neg']

In [12]:
# File ids in the 'neg' category, and how many there are
bad_movies = movie_reviews.fileids('neg')
len(bad_movies)

1000

In [13]:
# File ids in the 'pos' category, and how many there are
good_movies = movie_reviews.fileids('pos')
len(good_movies)

1000

## Contents of documents

In [14]:
# Tokens of the file at index 1 of `files`
doc = movie_reviews.words(files[1])

In [15]:
# Re-join the tokens with single spaces to read the review as text. The
# parenthesised form behaves the same on Python 2 and is valid on Python 3.
print(" ".join(doc))

the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . we don ' t know why the crew was really out in the middle of nowhere , we don ' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don ' t know why donald sutherland is stumbling around drunkenly throughout . here , it ' s just " hey , let ' s chase these people around with some robots " . the acting is below average , even from the likes of curtis . you ' re more likely to get a kick out of her work i

## Corpus frequency distribution

In [17]:
# Frequency of every corpus token after lower-casing
all_words = nltk.FreqDist(
    token.lower() for token in movie_reviews.words()
)
# Full (token, count) ranking
most = all_words.most_common()
# NOTE(review): the slice starts at index 1, so the entry at index 0 of the
# ranking is not displayed — confirm this is intended.
most[1:10]

[(u'the', 76529),
 (u'.', 65876),
 (u'a', 38106),
 (u'and', 35576),
 (u'of', 34123),
 (u'to', 31937),
 (u"'", 30585),
 (u'is', 25195),
 (u'in', 21822)]

## Using a Naive Bayes classifier

In [20]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

In [22]:
# Bag-of-words features: each token that occurs is mapped to True
def word_feats(words):
    """Return a dict with one ``token: True`` entry per distinct item of `words`."""
    return dict.fromkeys(words, True)
word_feats(['How', 'are', 'you','?'])

{'?': True, 'How': True, 'are': True, 'you': True}

## Document representation as feature vectors (binary maps)

In [24]:
# File ids of each polarity class, fetched label by label
negids, posids = (
    movie_reviews.fileids(polarity) for polarity in ('neg', 'pos')
)

In [25]:
# Labelled feature vectors: one (feature dict, label) pair per document
def _labelled_feats(fileids, label):
    """Pair the word features of every file in `fileids` with `label`."""
    pairs = []
    for fid in fileids:
        tokens = movie_reviews.words(fileids=[fid])
        pairs.append((word_feats(tokens), label))
    return pairs

negfeatures = _labelled_feats(negids, 'neg')
posfeatures = _labelled_feats(posids, 'pos')

In [31]:
# Train and test sets: the first 3/4 of each class is used for training and
# the remaining 1/4 for testing.
# Floor division keeps the cutoffs integers on Python 3 as well; with plain
# `/` they become floats there and the slices below raise TypeError.
negcutoff = len(negfeatures) * 3 // 4
poscutoff = len(posfeatures) * 3 // 4

trainfeats = negfeatures[:negcutoff] + posfeatures[:poscutoff]
testfeats = negfeatures[negcutoff:] + posfeatures[poscutoff:]
# A single parenthesised argument prints identically on Python 2 and 3.
print('train on %d instances, test on %d instances' % (len(trainfeats),
                                                       len(testfeats)))

train on 1500 instances, test on 500 instances


In [34]:
# Uncomment to inspect one training instance, a (feature dict, label) pair:
# print(trainfeats[24])

## Train the classifier

In [36]:
# Fit a Naive Bayes classifier on the labelled training feature sets
classifier = NaiveBayesClassifier.train(trainfeats)

## Classification evaluation

In [38]:
# Accuracy of the classifier on the held-out test set. Formatting with %s
# yields the same text as the Python 2 print statement and also runs on
# Python 3.
print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))

accuracy: 0.728


## Analysis of the model

In [41]:
# Ask the trained classifier to print its most informative features (n=20)
classifier.show_most_informative_features(20)

Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
               affecting = True              pos : neg    =      9.7 : 1.0
                  symbol = True              pos : neg    =      9.7 : 1.0
               animators = True              pos : neg    =      9.0 : 1.0

In [51]:
import collections

# precision/recall/f_measure are called below but were not imported anywhere
# in the notebook; import them here so the cell runs on a fresh kernel.
from nltk.metrics import precision, recall, f_measure

# For each label, collect the indices of the test instances that truly carry
# it (refsets) and of those the classifier assigned to it (testsets).
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

# Report per-class scores in the same order and text format as before
# ("pos precision: ...", "pos recall: ...", ...). Building each line with %s
# keeps the output identical on Python 2 and makes the cell valid on Python 3.
for label in ('pos', 'neg'):
    for title, metric in (('precision', precision),
                          ('recall', recall),
                          ('F-measure', f_measure)):
        score = metric(refsets[label], testsets[label])
        print('%s %s: %s' % (label, title, score))

pos precision: 0.651595744681
pos recall: 0.98
pos F-measure: 0.782747603834
neg precision: 0.959677419355
neg recall: 0.476
neg F-measure: 0.636363636364
