In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids():
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [3]:
import random
random.shuffle(documents)
documents[0:10]

[(['steve', 'martin', 'is', 'one', 'of', 'the', ...], 'neg'),
 (['"', 'lake', 'placid', '"', 'marks', 'yet', 'another', ...], 'neg'),
 (['my', 'son', 'and', 'i', 'share', 'a', 'perverse', ...], 'neg'),
 (['everyone', 'knows', 'that', 'old', 'rule', ',', "'", ...], 'pos'),
 (['alien', '3', 'is', 'the', 'only', 'alien', 'film', ...], 'neg'),
 (['"', 'you', 'can', "'", 't', 'have', 'any', 'of', ...], 'pos'),
 (['plot', ':', 'a', 'group', 'of', 'asbestos', ...], 'neg'),
 (['"', 'there', 'will', 'be', 'another', ',', '"', ...], 'neg'),
 (['we', "'", 're', 'back', 'in', 'blade', 'runner', ...], 'neg'),
 (['a', 'big', ',', 'busy', 'boxing', 'satire', 'with', ...], 'neg')]

In [4]:
from nltk import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_simple_pos(tag):
    if tag.startswith('J' or 'R'):
        return 'a'
    elif tag.startswith('S'):
        return 'n'
    elif tag.startswith('V'):
        return 'v'
    else:
        return 'v'


In [5]:
stops = stopwords.words('english')
import string
punctuations = list(string.punctuation)
stops += punctuations
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
from nltk import pos_tag
def clean_reviews(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [7]:
documents = [(clean_reviews(document), category) for document, category in documents]

In [8]:
documents[0]

(['steve',
  'martin',
  'one',
  'funniest',
  'men',
  'alive',
  'take',
  'true',
  'statement',
  'disappointment',
  'film',
  'equal',
  'mine',
  'martin',
  'hilarious',
  'create',
  'best',
  'laugh',
  'loud',
  'experience',
  'ever',
  'take',
  'place',
  'movie',
  'theaters',
  'find',
  'old',
  'television',
  'series',
  'base',
  'moments',
  'humor',
  'wit',
  'bilko',
  'name',
  'accident',
  'head',
  'army',
  'motor',
  'pool',
  'group',
  'passion',
  'scheme',
  'every',
  'episode',
  'involve',
  'sergeant',
  'men',
  'one',
  'another',
  'hair',
  'brain',
  'plan',
  'get',
  'rich',
  'quick',
  'outwit',
  'officer',
  'base',
  'mchale',
  'navy',
  'granddaddy',
  'idea',
  'behind',
  'movie',
  'difference',
  'far',
  'fetch',
  'usually',
  'goofy',
  'television',
  'series',
  'funny',
  'one',
  'laugh',
  'film',
  'make',
  'retain',
  'goofiness',
  'entertainment',
  'everything',
  'clean',
  'obviously',
  'make',
  'hollywood',
  '

In [37]:
#split the data into training and testing parts
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [38]:
#forming an array of all the words present in our training data -> all_words
all_words = []  
for doc in training_documents:
    all_words += doc[0]
all_words

['steve',
 'martin',
 'one',
 'funniest',
 'men',
 'alive',
 'take',
 'true',
 'statement',
 'disappointment',
 'film',
 'equal',
 'mine',
 'martin',
 'hilarious',
 'create',
 'best',
 'laugh',
 'loud',
 'experience',
 'ever',
 'take',
 'place',
 'movie',
 'theaters',
 'find',
 'old',
 'television',
 'series',
 'base',
 'moments',
 'humor',
 'wit',
 'bilko',
 'name',
 'accident',
 'head',
 'army',
 'motor',
 'pool',
 'group',
 'passion',
 'scheme',
 'every',
 'episode',
 'involve',
 'sergeant',
 'men',
 'one',
 'another',
 'hair',
 'brain',
 'plan',
 'get',
 'rich',
 'quick',
 'outwit',
 'officer',
 'base',
 'mchale',
 'navy',
 'granddaddy',
 'idea',
 'behind',
 'movie',
 'difference',
 'far',
 'fetch',
 'usually',
 'goofy',
 'television',
 'series',
 'funny',
 'one',
 'laugh',
 'film',
 'make',
 'retain',
 'goofiness',
 'entertainment',
 'everything',
 'clean',
 'obviously',
 'make',
 'hollywood',
 'back',
 'lot',
 'look',
 'every',
 'bite',
 'like',
 'look',
 'brand',
 'new',
 'even'

In [75]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(5000) # top 3000 features
print(common[0:20])
features = [i[0] for i in common] # as common is an array of tuples and we only want to add the words
features

[('film', 8073), ('one', 4275), ('movie', 4203), ('make', 3174), ('like', 2897), ('character', 2792), ('get', 2749), ('see', 2200), ('go', 2197), ('time', 2154), ('even', 1856), ('good', 1770), ('play', 1711), ('would', 1617), ('take', 1608), ('much', 1530), ('story', 1480), ('come', 1459), ('also', 1436), ('know', 1426)]


['film',
 'one',
 'movie',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'even',
 'good',
 'play',
 'would',
 'take',
 'much',
 'story',
 'come',
 'also',
 'know',
 'seem',
 '--',
 'well',
 'bad',
 'first',
 'look',
 'give',
 'two',
 'end',
 'work',
 'way',
 'plot',
 'find',
 'life',
 'little',
 'say',
 'think',
 'really',
 'show',
 'could',
 'people',
 'man',
 'love',
 'star',
 'best',
 'never',
 'scene',
 'great',
 'try',
 'new',
 'become',
 'big',
 'director',
 'many',
 'movies',
 'watch',
 'action',
 'scenes',
 'use',
 'back',
 'want',
 'act',
 'turn',
 'another',
 'set',
 'something',
 'still',
 'world',
 'us',
 'however',
 'leave',
 'live',
 'old',
 'guy',
 'enough',
 'every',
 'feel',
 'part',
 'year',
 'run',
 'audience',
 'cast',
 'around',
 'better',
 'funny',
 'real',
 'young',
 'write',
 'begin',
 'interest',
 'point',
 'name',
 'long',
 'last',
 'performance',
 'role',
 'lot',
 'may',
 'though',
 'things',
 'comedy',
 'tell',
 'nothing',
 'right',
 'scre

In [57]:
# a function for getting the dictionary of the words and their occurance
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features: #features is an array containing the top 3000 words
        current_features[w] = w in words_set
    return current_features

In [76]:
get_feature_dict(documents[0][0]) # we get the dictionary for the first document

{'film': True,
 'one': True,
 'movie': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'see': False,
 'go': False,
 'time': False,
 'even': True,
 'good': False,
 'play': True,
 'would': False,
 'take': True,
 'much': True,
 'story': False,
 'come': False,
 'also': False,
 'know': False,
 'seem': False,
 '--': False,
 'well': False,
 'bad': False,
 'first': False,
 'look': True,
 'give': False,
 'two': False,
 'end': False,
 'work': False,
 'way': False,
 'plot': False,
 'find': True,
 'life': True,
 'little': False,
 'say': True,
 'think': False,
 'really': False,
 'show': False,
 'could': False,
 'people': False,
 'man': False,
 'love': False,
 'star': False,
 'best': True,
 'never': False,
 'scene': False,
 'great': False,
 'try': False,
 'new': True,
 'become': False,
 'big': True,
 'director': False,
 'many': False,
 'movies': False,
 'watch': False,
 'action': False,
 'scenes': False,
 'use': False,
 'back': True,
 'want': False,
 'act': False,
 'turn': Fal

In [81]:
training_data = [(get_feature_dict(document), category) for document, category in training_documents]
testing_data = [(get_feature_dict(document), category) for document, category in training_documents]
training_data[1]

({'film': True,
  'one': True,
  'movie': True,
  'make': True,
  'like': True,
  'character': True,
  'get': False,
  'see': False,
  'go': True,
  'time': True,
  'even': True,
  'good': False,
  'play': True,
  'would': True,
  'take': False,
  'much': True,
  'story': False,
  'come': False,
  'also': True,
  'know': True,
  'seem': True,
  '--': False,
  'well': True,
  'bad': False,
  'first': False,
  'look': False,
  'give': True,
  'two': True,
  'end': True,
  'work': False,
  'way': True,
  'plot': True,
  'find': False,
  'life': False,
  'little': True,
  'say': False,
  'think': True,
  'really': True,
  'show': False,
  'could': False,
  'people': False,
  'man': False,
  'love': False,
  'star': False,
  'best': True,
  'never': False,
  'scene': False,
  'great': False,
  'try': False,
  'new': False,
  'become': False,
  'big': False,
  'director': True,
  'many': False,
  'movies': False,
  'watch': False,
  'action': False,
  'scenes': False,
  'use': False,
  'back

In [82]:
#training the data using naiveBayesClassifier
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)

In [83]:
accuracy = nltk.classify.accuracy(classifier, testing_data)
accuracy

0.792

In [84]:
classifier.show_most_informative_features(15)

Most Informative Features
                   byrne = True              neg : pos    =      5.4 : 1.0
                   jacob = True              neg : pos    =      4.8 : 1.0
                   tango = True              neg : pos    =      4.2 : 1.0
                    luis = True              neg : pos    =      4.2 : 1.0
                  cooper = True              pos : neg    =      4.0 : 1.0
                    hoot = True              pos : neg    =      4.0 : 1.0
                    _the = True              pos : neg    =      3.8 : 1.0
                  ripley = True              pos : neg    =      3.8 : 1.0
                  bening = True              pos : neg    =      3.8 : 1.0
                  picard = True              pos : neg    =      3.8 : 1.0
                 factory = True              neg : pos    =      3.6 : 1.0
                      li = True              neg : pos    =      3.6 : 1.0
           uncomfortable = True              neg : pos    =      3.5 : 1.0