In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
len(movie_reviews.fileids('neg'))

1000

In [4]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
import random
random.shuffle(documents)

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
from nltk import pos_tag
w="better"
pos_tag([w])

[('better', 'RBR')]

In [10]:
from nltk.corpus import stopwords
stop=stopwords.words('english')
import string
punctuations=list(string.punctuation)
stop=stop + punctuations

In [11]:
from nltk.stem import WordNetLemmatizer
lm=WordNetLemmatizer()

In [12]:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stop:
            pos=pos_tag([w])
            clean_word=lm.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words
 
            

In [13]:
documents=[(clean_review(document),category) for document,category in documents]

In [14]:
training_documents=documents[0:1500]
testing_documents=documents[1500:]

In [15]:
all_words=[]
for doc in training_documents:
    all_words+= doc[0]

In [16]:
import nltk
freq=nltk.FreqDist(all_words)
common=freq.most_common(1000)
features=[i[0] for i in common]

In [17]:
def get_feature_dict(words):
    current_features={}
    words_set=set(words)
    for w in features:
        current_features[w]=w in words_set
    return current_features

In [18]:
get_feature_dict(training_documents[0][0])

{'film': True,
 'movie': False,
 'one': True,
 'make': False,
 'like': True,
 'character': True,
 'get': True,
 'see': False,
 'go': True,
 'time': True,
 'well': True,
 'scene': False,
 'even': True,
 'good': True,
 'story': False,
 'take': True,
 'would': True,
 'much': False,
 'come': True,
 'two': False,
 'bad': True,
 'look': False,
 'also': True,
 'give': False,
 'first': False,
 'know': True,
 'life': True,
 'way': False,
 'seem': True,
 'end': False,
 '--': False,
 'year': True,
 'work': False,
 'thing': True,
 'plot': False,
 'play': False,
 'really': False,
 'little': True,
 'people': True,
 'say': False,
 'show': False,
 'could': False,
 'love': False,
 'man': True,
 'never': False,
 'director': True,
 'best': False,
 'new': True,
 'star': False,
 'try': False,
 'performance': False,
 'big': True,
 'great': False,
 'many': False,
 'action': False,
 'actor': False,
 'find': False,
 'want': True,
 'u': False,
 'watch': True,
 'role': False,
 'think': True,
 'act': False,
 'ano

In [19]:
training_data=[(get_feature_dict(doc),category)for doc,category in training_documents]

In [20]:
testing_data=[(get_feature_dict(doc),category)for doc,category in testing_documents]

In [21]:
from nltk import NaiveBayesClassifier

In [22]:
classifier=NaiveBayesClassifier.train(training_data)

In [23]:
nltk.classify.accuracy(classifier,testing_data)

0.764

In [24]:
classifier.show_most_informative_features(15)

Most Informative Features
                   awful = True              neg : pos    =      6.0 : 1.0
              ridiculous = True              neg : pos    =      5.9 : 1.0
                   waste = True              neg : pos    =      5.2 : 1.0
               memorable = True              pos : neg    =      3.9 : 1.0
                    mess = True              neg : pos    =      3.7 : 1.0
                  stupid = True              neg : pos    =      3.6 : 1.0
                  truman = True              pos : neg    =      3.4 : 1.0
                  boring = True              neg : pos    =      3.4 : 1.0
                terrible = True              neg : pos    =      3.2 : 1.0
               spielberg = True              pos : neg    =      3.0 : 1.0
                  subtle = True              pos : neg    =      3.0 : 1.0
               perfectly = True              pos : neg    =      3.0 : 1.0
                 cameron = True              pos : neg    =      3.0 : 1.0

In [25]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [26]:
svc=SVC()
classifier=SklearnClassifier(svc)

In [27]:
classifier.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [28]:
nltk.classify.accuracy(classifier,testing_data)

0.806

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
rfc=RandomForestClassifier()
classifier=SklearnClassifier(rfc)

In [31]:
classifier.train(training_data)



<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [32]:
nltk.classify.accuracy(classifier,testing_data)

0.66