In [2]:
from nltk.corpus import movie_reviews

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [5]:
# reading words of a single review
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
# Creating a list of documents where each index will be a tuple consisting of words in that document and the type of review
documents = []
for category in movie_reviews.categories():
    # Get fileids of each category
    for fileid in movie_reviews.fileids(category):
        # Add to list
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
# Shuffling the list of documents to create random order for negs and pos
import random
random.shuffle(documents)
documents[0:5]

[(['jerry', 'springer', 'has', 'got', 'nothing', 'on', ...], 'pos'),
 (['what', 'were', 'they', 'thinking', '?', 'nostalgia', ...], 'neg'),
 (['forget', 'get', 'carter', '.', 'instead', '.', '.', ...], 'neg'),
 (['the', 'ring', 'is', 'probably', 'one', 'of', 'the', ...], 'pos'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg')]

In [8]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.corpus import wordnet
# Map pos tag list to simple pos for lemmatizer
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
from nltk import pos_tag
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
puncs = list(string.punctuation)
stops.update(puncs)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [11]:
# Cleaning the documents
def clean_review(words):
    output_words = []
    for w in words:
        # Remove stop words
        if w.lower() not in stops:
            # Lemmatize each word
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [12]:
# Clean each documnet using utility function
documents = [(clean_review(document), category) for document, category in documents]

In [13]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]

In [14]:
# Building the feature set
# First make array of all words and take the top K words
# Then make the feature set for all docs where for each doc
# Feature set includes freqs of top K words and the category
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [15]:
len(all_words)

526008

In [16]:
import nltk

In [17]:
# Frequency distributions
freqs = nltk.FreqDist(all_words)
common = freqs.most_common(3000)
# First 3000 words as features
features = [i[0] for i in common]

In [18]:
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'much',
 'would',
 'also',
 'give',
 'come',
 'life',
 'bad',
 'two',
 'seem',
 'way',
 'look',
 '--',
 'first',
 'end',
 'know',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'play',
 'little',
 'show',
 'really',
 'people',
 'could',
 'never',
 'love',
 'man',
 'director',
 'try',
 'best',
 'star',
 'great',
 'performance',
 'big',
 'actor',
 'new',
 'watch',
 'want',
 'find',
 'action',
 'many',
 'u',
 'role',
 'think',
 'another',
 'act',
 'turn',
 'back',
 'still',
 'audience',
 'something',
 'world',
 'set',
 'old',
 'day',
 'every',
 'however',
 'use',
 'cast',
 'part',
 'begin',
 'real',
 'guy',
 'enough',
 'though',
 'feel',
 'comedy',
 'may',
 'last',
 'young',
 'point',
 'interest',
 'john',
 'fact',
 'long',
 'around',
 'right',
 'write',
 'minute',
 'woman',
 'run',
 'nothing',
 'funny',
 'name',
 'script',
 'actually',
 'scree

In [19]:
# Now create a dictionary for each document
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        #  If current feature in the document
        current_features[w] = w in words_set
    return current_features

In [20]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]

In [21]:
training_data[0]

({'film': True,
  'movie': True,
  'one': False,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': True,
  'go': True,
  'time': True,
  'well': True,
  'scene': True,
  'even': True,
  'good': True,
  'story': False,
  'take': True,
  'much': True,
  'would': True,
  'also': True,
  'give': True,
  'come': True,
  'life': False,
  'bad': False,
  'two': True,
  'seem': True,
  'way': False,
  'look': False,
  '--': True,
  'first': False,
  'end': True,
  'know': False,
  'year': False,
  'work': True,
  'thing': True,
  'plot': True,
  'say': False,
  'play': True,
  'little': False,
  'show': True,
  'really': True,
  'people': False,
  'could': True,
  'never': True,
  'love': False,
  'man': False,
  'director': False,
  'try': True,
  'best': True,
  'star': False,
  'great': True,
  'performance': False,
  'big': False,
  'actor': True,
  'new': True,
  'watch': True,
  'want': True,
  'find': True,
  'action': False,
  'many': False,
  'u': False,
  '

In [22]:
testing_data = [(get_feature_dict(doc), category) for doc, category in training_documents]

In [23]:
from nltk import NaiveBayesClassifier

In [24]:
classifier = NaiveBayesClassifier.train(training_data)

In [25]:
nltk.classify.accuracy(classifier, testing_data)

0.872

In [26]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     10.7 : 1.0
                   anger = True              pos : neg    =      8.3 : 1.0
              schumacher = True              neg : pos    =      8.1 : 1.0
             outstanding = True              pos : neg    =      8.1 : 1.0
            breathtaking = True              pos : neg    =      8.0 : 1.0
                  turkey = True              neg : pos    =      7.4 : 1.0
                   crowe = True              pos : neg    =      7.2 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                   jolie = True              neg : pos    =      6.8 : 1.0
                  welles = True              neg : pos    =      6.8 : 1.0
                   awful = True              neg : pos    =      6.7 : 1.0
                  seagal = True              neg : pos    =      6.4 : 1.0
                   damme = True              neg : pos    =      6.4 : 1.0

In [27]:
testing_data == training_data

True

## Using NLTK with Sklearn

In [30]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [32]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [33]:
classifier_sklearn.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [34]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.7973333333333333

## Getting data into sklearn format

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
train_set = {"the sky sky is blue", "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [41]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [43]:
# Combining each document into a single sentence and labels 
categories = [category for document, category in documents]

In [44]:
text_documents = [" ".join(document) for document, category in documents]

In [45]:
text_documents[0]

'jerry springer get nothing wild thing john mcnaughton new thriller tackle tawdry theme less two hour springer notoriously sleazy talk show broadcast two week -- bisexuality threesome poolside catfights slutty rich bimbo even redneck gator wrestling part movie raucous complex storyline even trash tv topicality drag wild thing -- crazy campfest play like something find late night usa network infinitely palatable solid ensemble cast despite smatter needle scene sexual nature wicked fun wild thing would guilty pleasure guilty feel involve good time high school guidance counselor sam lombardo matt dillon well like town blue bay especially pretty popular kelly van ryan denise richards whose family name among florida yacht enclave financially prominent hop take crush physical level kelly seductively slinks lombardo house wash jeep fundraiser next day tearfully admits trollop mother theresa russell rap long blue bay detective ray duquette kevin bacon gloria perez daphne rubin vega listen simi

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [52]:
count_vec = CountVectorizer(max_features=2000)

In [53]:
# this is our x_train feature set
a = count_vec.fit_transform(x_train)
a.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 2, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 3, 0, 0]], dtype=int64)

In [54]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1995',
 '1997',
 '1998',
 '1999',
 '20',
 '2001',
 '30',
 '50',
 '54',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accidentally',
 'accompany',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anna',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'a

In [55]:
x_test_features = count_vec.transform(x_test)

In [56]:
x_test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)