In [None]:
import nltk
from nltk.corpus import movie_reviews

In [3]:
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [4]:
len(movie_reviews.words())

1583820

In [5]:
#types of reviews 
movie_reviews.categories()

['neg', 'pos']

In [6]:
#15 most common words in reviews
nltk.FreqDist(movie_reviews.words()).most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [8]:
#adding all words to an empty list 
# in lower case
allwords = []
for i in movie_reviews.words():
    allwords.append(i.lower())

In [11]:
#list of all unique words in reviews
allwords = list(nltk.FreqDist(allwords))

In [13]:
len(allwords)

39768

In [14]:
#creating a word vector with top 1000 words
word_vector = allwords[:1000]

In [22]:
#comparing words in a random review with
#the word vector
def find_feature(word_list):
    feature = {}
    for i in word_vector:
        feature[i] = i in word_list
    return feature

In [17]:
#adding all reviews to a document with its tag ie, positive or negative
document = []
for file_id in movie_reviews.fileids():
    for category in movie_reviews.categories(file_id):
        document.append((movie_reviews.words(file_id),category))

In [19]:
len(document)

2000

In [28]:
#checking if the top 1000 words are found in each review and building feature sets
feature_sets = [(find_feature(word_list),category) for (word_list,category) in document]

In [29]:
# svc
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection

train_set,test_set = model_selection.train_test_split(feature_sets,test_size = 0.20)

In [30]:
print(len(train_set))
print(len(test_set))

# 1600
# 400

1600
400


In [31]:
model = SklearnClassifier(SVC(kernel = "rbf"))
model.train(train_set)

<SklearnClassifier(SVC())>

In [41]:
accuracy = nltk.classify.accuracy(model, test_set)
print("SVC Accuracy : {}".format(accuracy))

# SVC Accuracy : 0.8275

SVC Accuracy : 0.8275


In [34]:
#naives bayes
nb = nltk.NaiveBayesClassifier.train(train_set)

In [36]:
accuracy_nb = nltk.classify.accuracy(nb,test_set)
print("NB Accuracy : {}".format(accuracy_nb))

# NB Accuracy : 0.815

NB Accuracy : 0.815


In [40]:
nb.show_most_informative_features(15)

Most Informative Features
               memorable = True              pos : neg    =      5.5 : 1.0
                   worst = True              neg : pos    =      4.9 : 1.0
                  truman = True              pos : neg    =      4.4 : 1.0
                    mess = True              neg : pos    =      3.5 : 1.0
                  boring = True              neg : pos    =      3.5 : 1.0
                  stupid = True              neg : pos    =      3.4 : 1.0
               brilliant = True              pos : neg    =      3.1 : 1.0
               perfectly = True              pos : neg    =      3.0 : 1.0
               excellent = True              pos : neg    =      3.0 : 1.0
               effective = True              pos : neg    =      2.7 : 1.0
                   fails = True              neg : pos    =      2.6 : 1.0
                   worse = True              neg : pos    =      2.6 : 1.0
                  minute = True              neg : pos    =      2.5 : 1.0

In [43]:
#saving featuresets as pickle
import pickle

save_featuresets = open("featuresets.pickle","wb")
pickle.dump(feature_sets,save_featuresets)
save_featuresets.close()