In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids():
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [3]:
import random
random.shuffle(documents)
documents[0:10]

[(['"', 'idle', 'hands', '"', 'is', 'distasteful', ',', ...], 'neg'),
 (['synopsis', ':', 'melissa', ',', 'a', 'mentally', '-', ...], 'pos'),
 (['stallone', 'attempts', 'to', "'", 'act', "'", 'in', ...], 'neg'),
 (['hey', ',', 'i', "'", 've', 'got', 'a', 'great', ...], 'pos'),
 (['natural', 'born', 'killers', 'is', 'really', 'a', ...], 'neg'),
 (['review', ':', 'ghost', 'dog', ':', 'the', 'way', ...], 'neg'),
 (['let', "'", 's', 'say', 'you', 'live', 'at', 'the', ...], 'pos'),
 (['"', 'oh', 'my', 'god', ',', 'i', 'sounded', 'just', ...], 'pos'),
 (['this', 'movie', 'is', 'based', 'off', 'the', ...], 'neg'),
 (['this', 'is', 'the', 'worst', 'movie', 'i', "'", 've', ...], 'pos')]

In [4]:
from nltk import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_simple_pos(tag):
    if tag.startswith('J' or 'R'):
        return 'a'
    elif tag.startswith('S'):
        return 'n'
    elif tag.startswith('V'):
        return 'v'
    else:
        return 'v'


In [5]:
stops = stopwords.words('english')
import string
punctuations = list(string.punctuation)
stops += punctuations
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
from nltk import pos_tag
def clean_reviews(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [7]:
documents = [(clean_reviews(document), category) for document, category in documents]

In [8]:
documents[0]

(['idle',
  'hand',
  'distasteful',
  'crass',
  'derivative',
  'original',
  'think',
  'find',
  'way',
  'horror',
  'comedy',
  'would',
  'die',
  'loneliness',
  'plus',
  'question',
  'judgment',
  'sensitivity',
  'studio',
  'executives',
  'would',
  'green',
  'light',
  'release',
  'movie',
  'deal',
  'slaughter',
  'innocent',
  'teen',
  'agers',
  'week',
  'tragedy',
  'littleton',
  'colo',
  'movie',
  'insult',
  'horror',
  'film',
  'fan',
  'teen',
  'agers',
  'plot',
  'little',
  'deal',
  'anton',
  'devon',
  'sawa',
  'high',
  'school',
  'slacker',
  'hand',
  'go',
  'murderous',
  'rampage',
  'become',
  'possess',
  'along',
  'way',
  'hand',
  'kill',
  'anton',
  'parent',
  'two',
  'best',
  'friends',
  'friends',
  'also',
  'slackers',
  'lazy',
  'return',
  'dead',
  'walk',
  'light',
  'far',
  'level',
  'atrocity',
  'humor',
  'movie',
  'treat',
  'death',
  'joke',
  'idle',
  'hand',
  'gruesome',
  'morbid',
  'performances',
  

In [9]:
#split the data into training and testing parts
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [10]:
#forming an array of all the words present in our training data -> all_words
all_words = []  
for doc in training_documents:
    all_words += doc[0]
all_words

['idle',
 'hand',
 'distasteful',
 'crass',
 'derivative',
 'original',
 'think',
 'find',
 'way',
 'horror',
 'comedy',
 'would',
 'die',
 'loneliness',
 'plus',
 'question',
 'judgment',
 'sensitivity',
 'studio',
 'executives',
 'would',
 'green',
 'light',
 'release',
 'movie',
 'deal',
 'slaughter',
 'innocent',
 'teen',
 'agers',
 'week',
 'tragedy',
 'littleton',
 'colo',
 'movie',
 'insult',
 'horror',
 'film',
 'fan',
 'teen',
 'agers',
 'plot',
 'little',
 'deal',
 'anton',
 'devon',
 'sawa',
 'high',
 'school',
 'slacker',
 'hand',
 'go',
 'murderous',
 'rampage',
 'become',
 'possess',
 'along',
 'way',
 'hand',
 'kill',
 'anton',
 'parent',
 'two',
 'best',
 'friends',
 'friends',
 'also',
 'slackers',
 'lazy',
 'return',
 'dead',
 'walk',
 'light',
 'far',
 'level',
 'atrocity',
 'humor',
 'movie',
 'treat',
 'death',
 'joke',
 'idle',
 'hand',
 'gruesome',
 'morbid',
 'performances',
 'stereotypical',
 'cartoonish',
 'realize',
 'suppose',
 'pass',
 'comedy',
 'recent',


In [11]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(5000) # top 3000 features
print(common[0:20])
features = [i[0] for i in common] # as common is an array of tuples and we only want to add the words
features

[('film', 8291), ('movie', 4412), ('one', 4343), ('make', 3208), ('like', 2965), ('character', 2871), ('get', 2870), ('see', 2444), ('go', 2385), ('time', 2225), ('even', 1961), ('play', 1761), ('good', 1752), ('take', 1676), ('story', 1635), ('would', 1536), ('well', 1533), ('much', 1514), ('know', 1501), ('two', 1475)]


['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'even',
 'play',
 'good',
 'take',
 'story',
 'would',
 'well',
 'much',
 'know',
 'two',
 'bad',
 'come',
 'first',
 'also',
 'look',
 'give',
 'seem',
 'end',
 'way',
 'work',
 '--',
 'plot',
 'really',
 'say',
 'life',
 'think',
 'find',
 'show',
 'people',
 'little',
 'star',
 'love',
 'man',
 'scene',
 'best',
 'could',
 'try',
 'never',
 'action',
 'new',
 'great',
 'big',
 'scenes',
 'want',
 'watch',
 'many',
 'become',
 'director',
 'movies',
 'act',
 'use',
 'another',
 'world',
 'us',
 'back',
 'something',
 'feel',
 'turn',
 'still',
 'every',
 'leave',
 'however',
 'though',
 'set',
 'begin',
 'old',
 'part',
 'run',
 'better',
 'tell',
 'enough',
 'cast',
 'point',
 'young',
 'live',
 'audience',
 'performance',
 'right',
 'interest',
 'around',
 'long',
 'guy',
 'things',
 'write',
 'actually',
 'may',
 'script',
 'ever',
 'real',
 'years',
 'john',
 'year',
 'last',
 'funny',
 'l

In [12]:
# a function for getting the dictionary of the words and their occurance
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features: #features is an array containing the top 3000 words
        current_features[w] = w in words_set
    return current_features

In [13]:
get_feature_dict(documents[0][0]) # we get the dictionary for the first document

{'film': True,
 'movie': True,
 'one': False,
 'make': True,
 'like': False,
 'character': False,
 'get': False,
 'see': False,
 'go': True,
 'time': True,
 'even': False,
 'play': False,
 'good': False,
 'take': False,
 'story': False,
 'would': True,
 'well': False,
 'much': False,
 'know': False,
 'two': True,
 'bad': True,
 'come': False,
 'first': False,
 'also': True,
 'look': False,
 'give': True,
 'seem': False,
 'end': False,
 'way': True,
 'work': False,
 '--': False,
 'plot': True,
 'really': False,
 'say': False,
 'life': True,
 'think': True,
 'find': True,
 'show': False,
 'people': True,
 'little': True,
 'star': False,
 'love': False,
 'man': False,
 'scene': False,
 'best': True,
 'could': False,
 'try': True,
 'never': True,
 'action': False,
 'new': False,
 'great': False,
 'big': False,
 'scenes': False,
 'want': False,
 'watch': False,
 'many': False,
 'become': True,
 'director': False,
 'movies': False,
 'act': False,
 'use': False,
 'another': False,
 'world': T

In [14]:
training_data = [(get_feature_dict(document), category) for document, category in training_documents]
testing_data = [(get_feature_dict(document), category) for document, category in training_documents]
training_data[1]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': True,
  'go': True,
  'time': True,
  'even': False,
  'play': True,
  'good': True,
  'take': True,
  'story': False,
  'would': False,
  'well': False,
  'much': False,
  'know': False,
  'two': False,
  'bad': True,
  'come': True,
  'first': False,
  'also': False,
  'look': False,
  'give': False,
  'seem': True,
  'end': True,
  'way': False,
  'work': True,
  '--': True,
  'plot': True,
  'really': True,
  'say': True,
  'life': False,
  'think': False,
  'find': False,
  'show': False,
  'people': False,
  'little': False,
  'star': False,
  'love': False,
  'man': False,
  'scene': True,
  'best': False,
  'could': False,
  'try': True,
  'never': False,
  'action': False,
  'new': False,
  'great': False,
  'big': False,
  'scenes': True,
  'want': True,
  'watch': True,
  'many': True,
  'become': False,
  'director': False,
  'movies': True,
  'act': 

In [15]:
#training the data using naiveBayesClassifier
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(training_data)

In [16]:
accuracy = nltk.classify.accuracy(classifier, testing_data)
accuracy

0.778

In [17]:
classifier.show_most_informative_features(15)

Most Informative Features
                atlantic = True              neg : pos    =      7.0 : 1.0
                  irving = True              pos : neg    =      5.7 : 1.0
                   heche = True              neg : pos    =      5.7 : 1.0
             bruckheimer = True              pos : neg    =      5.0 : 1.0
                    jinn = True              pos : neg    =      5.0 : 1.0
                    caan = True              neg : pos    =      3.8 : 1.0
                  donnie = True              neg : pos    =      3.8 : 1.0
                  conrad = True              pos : neg    =      3.7 : 1.0
                 hammond = True              pos : neg    =      3.7 : 1.0
                 forster = True              pos : neg    =      3.7 : 1.0
            untouchables = True              neg : pos    =      3.7 : 1.0
                     wcw = True              neg : pos    =      3.7 : 1.0
                 fincher = True              pos : neg    =      3.0 : 1.0

In [18]:
#Using sklearn classifiers on nltk datasets!
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [19]:
rfc = RandomForestClassifier()
sklearn_classifier = SklearnClassifier(rfc)

In [20]:
sklearn_classifier.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [21]:
nltk.classify.accuracy(sklearn_classifier, testing_data)

0.7986666666666666