In [1]:
# Import required Python libraries
import nltk
from nltk.corpus import movie_reviews

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

In [2]:
# Map the original corpus labels to binary class ids
tag_mapping = dict([('neg', 0), ('pos', 1)])

# Map binary class ids back to human-readable labels
inv_tag_mapping = dict(enumerate(['Negative', 'Positive']))

In [3]:
# Build the bag-of-words vectorizer that turns raw review text into a
# term-frequency matrix, normalizing the text along the way.
vectorizer = CountVectorizer(
    lowercase=True,        # fold all text to lowercase before tokenizing
    stop_words='english',  # drop the built-in English stop words
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.95,           # ignore terms found in more than 95% of documents
    max_features=5000,     # cap the vocabulary at the 5000 most frequent terms
)

In [4]:
# Collect the raw text of every review in the corpus together with its
# binary label. The first three characters of each file id are looked up
# in tag_mapping to obtain the label.
file_ids = movie_reviews.fileids()

labels = [tag_mapping[file_id[:3]] for file_id in file_ids]
reviews = [movie_reviews.raw(fileids=[file_id]) for file_id in file_ids]

In [5]:
# Preview the first 500 characters of the first movie review
first_review = reviews[0]
first_review[:500]

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt'

In [6]:
# Binary label of the first movie review (see tag_mapping for its meaning)
first_label = labels[0]
first_label

0

In [7]:
# Learn the vocabulary from all reviews and encode them as a
# document-term count matrix in a single pass
X_train = vectorizer.fit_transform(raw_documents=reviews)

# Display the matrix dimensions
X_train.shape

(2000, 5000)

In [8]:
# Create a multinomial Naive Bayes classifier and fit it on the
# vectorized reviews; the trailing fit call also displays the estimator
clf = MultinomialNB()
clf.fit(X=X_train, y=labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
# Predict labels for the training set and compute the F1 score on it.
# NOTE: the score is measured on the same data the classifier was fitted on,
# so it is an optimistic estimate rather than a measure of generalization.
predictions = clf.predict(X_train)

# f1_score takes the ground-truth labels first and the predictions second;
# keyword arguments make the order explicit.
f1_score(y_true=labels, y_pred=predictions)

0.9016811003565971

In [10]:
# Classify a couple of unseen reviews with the fitted vectorizer and model
new_reviews = ['The movie was realy awesome!', 'Complete waste of time. Nothing speciall.']

# Reuse the vocabulary learned earlier: transform only, never re-fit here
X_test = vectorizer.transform(new_reviews)
new_predictions = clf.predict(X_test)

# Print each review next to its human-readable predicted label
line_template = '{0} :: label - {1}'
for text, predicted in zip(new_reviews, new_predictions):
    print(line_template.format(text, inv_tag_mapping[predicted]))

The movie was realy awesome! :: label - Positive
Complete waste of time. Nothing speciall. :: label - Negative
