# Movie Review Classification


In [35]:
from nltk.corpus import movie_reviews
 
# Total reviews
print (len(movie_reviews.fileids())) # Output: 2000
 
# Review categories
print (movie_reviews.categories()) # Output: [u'neg', u'pos']
 
# Total positive reviews
print (len(movie_reviews.fileids('pos'))) # Output: 1000
 
# Total negative reviews
print (len(movie_reviews.fileids('neg'))) # Output: 1000
 


2000
['neg', 'pos']
1000
1000


# Data cleaning and Feature selection

In [36]:
from nltk import ngrams
from nltk.corpus import stopwords 
import string
 
stopwords_english = stopwords.words('english')
 
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    words_clean = []
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_clean.append(word)    
    return words_clean 
 
# feature extractor function for unigram
def bag_of_words(words):    
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary
 
# feature extractor function for ngrams (bigram)
def bag_of_ngrams(words, n=2):
    words_ng = []
    for item in iter(ngrams(words, n)):
        words_ng.append(item)
    words_dictionary = dict([word, True] for word in words_ng)    
    return words_dictionary

# Combination of Unigrams and Bigrams Feature

In [44]:
 # let's define a new function that extracts all features
# i.e. that extracts both unigram and bigrams features
def bag_of_all_words(words, n=2):
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)
 
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_ngrams(words_clean_for_bigrams)
 
    all_features = unigram_features.copy()
    all_features.update(bigram_features)
 
    return all_features
 
print (bag_of_all_words(words))

{'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}


In [45]:
from nltk.corpus import movie_reviews 
 
pos_reviews = []
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    pos_reviews.append(words)
    
neg_reviews = []
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    neg_reviews.append(words)
 


In [46]:
# positive reviews feature set
pos_reviews_set = []
for words in pos_reviews:
    pos_reviews_set.append((bag_of_all_words(words), 'pos'))
    
    
# negative reviews feature set
neg_reviews_set = []
for words in neg_reviews:
    neg_reviews_set.append((bag_of_all_words(words), 'neg'))


In [47]:
print (len(pos_reviews_set), len(neg_reviews_set)) # Output: (1000, 1000)
 
# radomize pos_reviews_set and neg_reviews_set
# doing so will output different accuracy result everytime we run the program
from random import shuffle 
shuffle(pos_reviews_set)
shuffle(neg_reviews_set)
 
test_set = pos_reviews_set[:100] + neg_reviews_set[:100]
train_set = pos_reviews_set[100:] + neg_reviews_set[100:]
 
print(len(test_set),  len(train_set)) 



1000 1000
200 1800


# Training Classifier

In [48]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
classifier = NaiveBayesClassifier.train(train_set)
 
accuracy = classify.accuracy(classifier, test_set)
print(accuracy) 
 
#print (classifier.show_most_informative_features(10))

0.81


In [54]:
from sklearn.metrics import confusion_matrix


testing_set_content=[i[0] for i in test_set]
golden_label=[i[1] for i in test_set]
tested_label=classifier.classify_many(testing_set_content)
#print (golden_label)
cm = confusion_matrix(golden_label,tested_label) 
print ("Confusion matrix",cm)


Confusion matrix [[67 33]
 [ 5 95]]


# Evaluation

In [55]:
from sklearn.metrics import classification_report

print(classification_report(golden_label, tested_label))

              precision    recall  f1-score   support

         neg       0.93      0.67      0.78       100
         pos       0.74      0.95      0.83       100

    accuracy                           0.81       200
   macro avg       0.84      0.81      0.81       200
weighted avg       0.84      0.81      0.81       200



# Testing

In [56]:
from nltk.tokenize import word_tokenize
 
custom_review = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)
print (classifier.classify(custom_review_set)) # Output: neg
# Negative review correctly classified as negative
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: neg
print (prob_result.prob("neg")) # Output: 0.770612685688
print (prob_result.prob("pos")) # Output: 0.229387314312


custom_review = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = bag_of_all_words(custom_review_tokens)
 
print (classifier.classify(custom_review_set)) # Output: pos
# Positive review correctly classified as positive
 
# probability result
prob_result = classifier.prob_classify(custom_review_set)
print (prob_result) # Output: <ProbDist with 2 samples>
print (prob_result.max()) # Output: pos
print (prob_result.prob("neg")) # Output: 0.00677736186354
print (prob_result.prob("pos")) # Output: 0.993222638136
 
 


neg
<ProbDist with 2 samples>
neg
0.9236562814579945
0.07634371854200557
pos
<ProbDist with 2 samples>
pos
0.009277879420519029
0.9907221205794827
