In [1]:
import nltk
from nltk.corpus import movie_reviews

In [2]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
len(movie_reviews.words())

1583820

In [4]:
movie_reviews.categories()

['neg', 'pos']

In [5]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [6]:
text = " ".join(movie_reviews.words())
import string
text_filtered = text.translate(str.maketrans('','', string.punctuation))

In [7]:
text_filtered[:1001]

'plot  two teen couples go to a church party  drink and then drive  they get into an accident  one of the guys dies  but his girlfriend continues to see him in her life  and has nightmares  what  s the deal  watch the movie and  sorta  find out    critique  a mind  fuck movie for the teen generation that touches on a very cool idea  but presents it in a very bad package  which is what makes this review an even harder one to write  since i generally applaud films which attempt to break the mold  mess with your head and such  lost highway  memento   but there are good and bad ways of making all types of films  and these folks just didn  t snag this one correctly  they seem to have taken this pretty neat concept  but executed it terribly  so what are the problems with the movie  well  its main problem is that it  s simply too jumbled  it starts off  normal  but then downshifts into this  fantasy  world in which you  as an audience member  have no idea what  s going on  there are dreams  t

In [8]:
from nltk import word_tokenize
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
tokens = word_tokenize(text_filtered)
word_filtered = [w.lower() for w in tokens if w not in stopwords]

In [9]:
word_filtered[:10]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get']

In [10]:
counter_dict = nltk.FreqDist(word_filtered)

In [11]:
print(counter_dict.most_common(15))

[('film', 9519), ('one', 5853), ('movie', 5774), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2170), ('would', 2110), ('much', 2050), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1912), ('well', 1906)]


In [12]:
docs = [(list(movie_reviews.words(fileid)), category)
       for category in movie_reviews.categories()
       for fileid in movie_reviews.fileids(category)]

In [13]:
print(docs[1])

(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', 'it', "'", 's', 'got', 'a', 'head', 'start', 'in', 'this', 'movie', 'starring', 'jamie', 'lee', 'curtis', 'and', 'another', 'baldwin', 'brother', '(', 'william', 'this', 'time', ')', 'in', 'a', 'story', 'regarding', 'a', 'crew', 'of', 'a', 'tugboat', 'that', 'comes', 'across', 'a', 'deserted', 'russian', 'tech', 'ship', 'that', 'has', 'a', 'strangeness', 'to', 'it', 'when', 'they', 'kick', 'the', 'power', 'back', 'on', '.', 'little', 'do', 'they', 'know', 'the', 'power', 'within', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', 'don', "'", 't', 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', 'we', 'don', "'", 't'

In [14]:
print(docs[0])

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'b

##### Feature extraction

In [108]:
word_features = [w[0] for w in counter_dict.most_common(300)]

In [109]:
word_features[:10]

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much']

In [110]:
len(word_features)

300

In [111]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [112]:
search_features(word_features[:4])

{'film': True,
 'one': True,
 'movie': True,
 'like': True,
 'even': False,
 'good': False,
 'time': False,
 'story': False,
 'would': False,
 'much': False,
 'character': False,
 'also': False,
 'get': False,
 'two': False,
 'well': False,
 'characters': False,
 'first': False,
 'see': False,
 'way': False,
 'make': False,
 'life': False,
 'really': False,
 'films': False,
 'plot': False,
 'little': False,
 'people': False,
 'could': False,
 'scene': False,
 'bad': False,
 'man': False,
 'never': False,
 'best': False,
 'new': False,
 'scenes': False,
 'many': False,
 'director': False,
 'know': False,
 'movies': False,
 'action': False,
 'great': False,
 'another': False,
 'love': False,
 'go': False,
 'made': False,
 'us': False,
 'big': False,
 'end': False,
 'something': False,
 'back': False,
 'still': False,
 'world': False,
 'seems': False,
 'work': False,
 'makes': False,
 'however': False,
 'every': False,
 'though': False,
 'better': False,
 'real': False,
 'audience': False

In [113]:
print(search_features(docs[0][0]))

{'film': True, 'one': True, 'movie': True, 'like': True, 'even': True, 'good': True, 'time': False, 'story': False, 'would': True, 'much': False, 'character': True, 'also': True, 'get': True, 'two': True, 'well': True, 'characters': True, 'first': False, 'see': True, 'way': True, 'make': True, 'life': True, 'really': True, 'films': True, 'plot': True, 'little': True, 'people': True, 'could': False, 'scene': False, 'bad': True, 'man': False, 'never': False, 'best': False, 'new': True, 'scenes': True, 'many': False, 'director': True, 'know': True, 'movies': True, 'action': False, 'great': False, 'another': False, 'love': False, 'go': True, 'made': False, 'us': True, 'big': True, 'end': False, 'something': False, 'back': True, 'still': True, 'world': True, 'seems': True, 'work': False, 'makes': True, 'however': False, 'every': True, 'though': False, 'better': False, 'real': False, 'audience': True, 'enough': False, 'seen': False, 'take': False, 'around': False, 'going': True, 'year': Fals

In [114]:
print(search_features(docs[1][1]))

{'film': False, 'one': False, 'movie': False, 'like': False, 'even': False, 'good': False, 'time': False, 'story': False, 'would': False, 'much': False, 'character': False, 'also': False, 'get': False, 'two': False, 'well': False, 'characters': False, 'first': False, 'see': False, 'way': False, 'make': False, 'life': False, 'really': False, 'films': False, 'plot': False, 'little': False, 'people': False, 'could': False, 'scene': False, 'bad': False, 'man': False, 'never': False, 'best': False, 'new': False, 'scenes': False, 'many': False, 'director': False, 'know': False, 'movies': False, 'action': False, 'great': False, 'another': False, 'love': False, 'go': False, 'made': False, 'us': False, 'big': False, 'end': False, 'something': False, 'back': False, 'still': False, 'world': False, 'seems': False, 'work': False, 'makes': False, 'however': False, 'every': False, 'though': False, 'better': False, 'real': False, 'audience': False, 'enough': False, 'seen': False, 'take': False, 'aroun

In [115]:
len(search_features(docs[0][0]))

300

In [116]:
count_t = 0
count_f = 0
for each in search_features(docs[0][0]).items():
    if each[1] == True: 
        count_t += 1
    else: 
        count_f += 1
        
print("True:", count_t)
print("False:", count_f)

True: 91
False: 209


In [117]:
max_index = 0
min_index = 0
for i in range(len(docs)):
    length = len(docs[i][0])
    if length > len(docs[max_index][0]):
        max_index = i
    if length < len(docs[min_index][0]):
        min_index = i
print(max_index)
print(min_index)

1552
506


In [118]:
len(docs[max_index][0])

2879

In [119]:
featureset = [(search_features(doc), category) for (doc, category) in docs]

In [120]:
featureset[:1]

[({'film': True,
   'one': True,
   'movie': True,
   'like': True,
   'even': True,
   'good': True,
   'time': False,
   'story': False,
   'would': True,
   'much': False,
   'character': True,
   'also': True,
   'get': True,
   'two': True,
   'well': True,
   'characters': True,
   'first': False,
   'see': True,
   'way': True,
   'make': True,
   'life': True,
   'really': True,
   'films': True,
   'plot': True,
   'little': True,
   'people': True,
   'could': False,
   'scene': False,
   'bad': True,
   'man': False,
   'never': False,
   'best': False,
   'new': True,
   'scenes': True,
   'many': False,
   'director': True,
   'know': True,
   'movies': True,
   'action': False,
   'great': False,
   'another': False,
   'love': False,
   'go': True,
   'made': False,
   'us': True,
   'big': True,
   'end': False,
   'something': False,
   'back': True,
   'still': True,
   'world': True,
   'seems': True,
   'work': False,
   'makes': True,
   'however': False,
   'every

In [121]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [135]:
training_set[0]

({'film': True,
  'one': True,
  'movie': True,
  'like': True,
  'even': True,
  'good': True,
  'time': False,
  'story': False,
  'would': True,
  'much': False,
  'character': True,
  'also': True,
  'get': True,
  'two': True,
  'well': True,
  'characters': True,
  'first': False,
  'see': True,
  'way': True,
  'make': True,
  'life': True,
  'really': True,
  'films': True,
  'plot': True,
  'little': True,
  'people': True,
  'could': False,
  'scene': False,
  'bad': True,
  'man': False,
  'never': False,
  'best': False,
  'new': True,
  'scenes': True,
  'many': False,
  'director': True,
  'know': True,
  'movies': True,
  'action': False,
  'great': False,
  'another': False,
  'love': False,
  'go': True,
  'made': False,
  'us': True,
  'big': True,
  'end': False,
  'something': False,
  'back': True,
  'still': True,
  'world': True,
  'seems': True,
  'work': False,
  'makes': True,
  'however': False,
  'every': True,
  'though': False,
  'better': False,
  'real':

In [122]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [123]:
print("classifier's accuracy is: {}". format(nltk.classify.accuracy(classifier, testing_set)*100))

classifier's accuracy is: 70.5


In [124]:
print("classifier's accuracy in training set is: {}". format(nltk.classify.accuracy(classifier, training_set)*100))

classifier's accuracy in training set is: 78.75


In [125]:
(90.25-73.75)/73.75

0.22372881355932203

In [126]:
classifier.show_most_informative_features(10)

Most Informative Features
                 perfect = True              pos : neg    =      2.2 : 1.0
                     bad = True              neg : pos    =      1.9 : 1.0
                     war = True              pos : neg    =      1.8 : 1.0
           unfortunately = True              neg : pos    =      1.8 : 1.0
                    true = True              pos : neg    =      1.8 : 1.0
            performances = True              pos : neg    =      1.8 : 1.0
                american = True              pos : neg    =      1.7 : 1.0
                  reason = True              neg : pos    =      1.7 : 1.0
                   maybe = True              neg : pos    =      1.7 : 1.0
                  family = True              pos : neg    =      1.7 : 1.0


In [127]:
import pickle
save_classifier = open("naive_bayes_model.pkl", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [129]:
import joblib

# save
joblib.dump(classifier, "naive_bayes_model_2.pkl") 

['naive_bayes_model_2.pkl']

In [130]:
classifier_f = open("naive_bayes_model.pkl", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [164]:
custom_review = "I hated the restaurant. It was a disaster eating there. Poor service, arrogant waiters."
# custom_review = "I  love the restaurant. I love the food. I like the staff and their gesture toward customers"

In [165]:
from nltk import word_tokenize
import re

custom_review = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`\{|}~]','', custom_review)
custom_review_tokens = word_tokenize(custom_review)
custom_review_tokens

['I',
 'hated',
 'the',
 'restaurant',
 'It',
 'was',
 'a',
 'disaster',
 'eating',
 'there',
 'Poor',
 'service',
 'arrogant',
 'waiters']

In [166]:
def document_features(tokens_):
    word_filtered = [w.lower() for w in tokens_ if w not in stopwords]
    featureset = search_features(word_filtered)
    return featureset

In [167]:
custom_review_set = document_features((custom_review_tokens))
custom_review_set

{'film': False,
 'one': False,
 'movie': False,
 'like': False,
 'even': False,
 'good': False,
 'time': False,
 'story': False,
 'would': False,
 'much': False,
 'character': False,
 'also': False,
 'get': False,
 'two': False,
 'well': False,
 'characters': False,
 'first': False,
 'see': False,
 'way': False,
 'make': False,
 'life': False,
 'really': False,
 'films': False,
 'plot': False,
 'little': False,
 'people': False,
 'could': False,
 'scene': False,
 'bad': False,
 'man': False,
 'never': False,
 'best': False,
 'new': False,
 'scenes': False,
 'many': False,
 'director': False,
 'know': False,
 'movies': False,
 'action': False,
 'great': False,
 'another': False,
 'love': False,
 'go': False,
 'made': False,
 'us': False,
 'big': False,
 'end': False,
 'something': False,
 'back': False,
 'still': False,
 'world': False,
 'seems': False,
 'work': False,
 'makes': False,
 'however': False,
 'every': False,
 'though': False,
 'better': False,
 'real': False,
 'audience': F

In [168]:
print(classifier.classify(custom_review_set))

neg


In [170]:
prob_result = classifier.prob_classify(custom_review_set)
print(prob_result.max())
print(prob_result.prob('pos'))
print(prob_result.prob('neg'))

neg
0.024027833029862212
0.9759721669701329


# Using SVM

In [174]:
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier

classifier = SklearnClassifier(LinearSVC())
classifier.train(training_set)



<SklearnClassifier(LinearSVC())>

In [175]:
print("classifier's accuracy is: {}". format(nltk.classify.accuracy(classifier, testing_set)*100))

classifier's accuracy is: 65.75


In [176]:
print("classifier's accuracy in training set is: {}". format(nltk.classify.accuracy(classifier, training_set)*100))

classifier's accuracy in training set is: 85.5
