In [1]:
# Import dependencies

In [24]:
import pickle

from nltk import NaiveBayesClassifier
from nltk.classify import apply_features

In [3]:
# Data import

In [4]:
# Read both polarity corpora as Latin-1, one review per list entry.
# Iterating the file handle yields the same lines as readlines(), so every
# entry keeps its trailing newline.
with open('data/rt-polarity.pos', encoding='latin-1') as positive_file:
    positive_reviews = list(positive_file)

with open('data/rt-polarity.neg', encoding='latin-1') as negative_file:
    negative_reviews = list(negative_file)

# Sanity check: show the first three reviews of each class.
print(positive_reviews[:3], negative_reviews[:3])

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n', 'effective but too-tepid biopic\n'] ['simplistic , silly and tedious . \n', "it's so laddish and juvenile , only teenage boys could possibly find it funny . \n", 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . \n']


In [5]:
# Split into training and test sets, and build the training vocabulary

In [9]:
# Fraction of each class used for training; the remainder is held out for testing.
TRAIN_FRACTION = 0.7

# NOTE: despite its name, `train_ratio` is a count (the number of positive
# training reviews), not a ratio. The name is kept so existing references work.
train_ratio = round(len(positive_reviews) * TRAIN_FRACTION)
# Size the negative split from the negative corpus itself, so both classes are
# split at the same fraction even when the two files differ in length.
negative_train_count = round(len(negative_reviews) * TRAIN_FRACTION)

train_positive_reviews = positive_reviews[:train_ratio]
train_negative_reviews = negative_reviews[:negative_train_count]

test_positive_reviews = positive_reviews[train_ratio:]
test_negative_reviews = negative_reviews[negative_train_count:]

# Every whitespace-separated token seen in the training reviews; the test
# reviews do not contribute to the vocabulary.
train_vocabulary = {
    word
    for phrase in train_positive_reviews + train_negative_reviews
    for word in phrase.split()
}

In [12]:
# Convert the training reviews into (tokens, sentiment) tuples

In [18]:
def get_tagged_tuple(line, sentiment):
    """Pair a review's whitespace-separated tokens with its sentiment label.

    Args:
        line: Raw review text.
        sentiment: Label to attach to the review.

    Returns:
        A ``(tokens, sentiment)`` tuple, where ``tokens`` is ``line.split()``.
    """
    tokens = line.split()
    return tokens, sentiment

# Labelled training examples: all positive reviews first, then all negative
# ones, each as a (tokens, sentiment) tuple.
train_data = [
    get_tagged_tuple(line, sentiment)
    for sentiment, labelled_reviews in (
        ('positive', train_positive_reviews),
        ('negative', train_negative_reviews),
    )
    for line in labelled_reviews
]

In [25]:
# Create a feature extraction callback and helpers

In [39]:
def feature_extractor(review, vocabulary=None):
    """Turn a tokenized review into a bag-of-words presence feature dict.

    Args:
        review: Iterable of tokens from a single review.
        vocabulary: Optional iterable of words to emit features for. When
            omitted, the module-level ``train_vocabulary`` is used, which is
            the original single-argument behaviour.

    Returns:
        A dict mapping every vocabulary word to ``True`` if it occurs in
        ``review`` and ``False`` otherwise.
    """
    if vocabulary is None:
        # Resolved at call time, so the function picks up the vocabulary built
        # by the split cell without binding it at definition time.
        vocabulary = train_vocabulary
    # A set makes each per-word membership check constant time.
    review_set = set(review)
    return {word: word in review_set for word in vocabulary}

def classify_review(review, classifier):
    """Classify one raw review string.

    Args:
        review: Review text; it is split on whitespace before feature extraction.
        classifier: Object exposing ``classify(featureset)``.

    Returns:
        Whatever ``classifier.classify`` returns for the review's feature dict.
    """
    tokens = review.split()
    return classifier.classify(feature_extractor(tokens))

In [29]:
# Train model

In [31]:
# Build the classifier's training input by running feature_extractor over train_data.
training_features = apply_features(feature_extractor, train_data)
# Fit the Naive Bayes classifier on the extracted training features.
classifier = NaiveBayesClassifier.train(training_features) # long, 2-5 minutes

In [47]:
# Spot-check the classifier on one held-out positive review.
sample_review = test_positive_reviews[10]
print(classify_review(sample_review, classifier), sample_review)

positive  . . . fuses the events of her life with the imagery in her paintings so vividly that the artist's work may take on a striking new significance for anyone who sees the film . 



In [49]:
# Per-class accuracy on the held-out test reviews

In [56]:
def _label_accuracy(reviews, expected_label):
    """Return the fraction of ``reviews`` that ``classifier`` labels as ``expected_label``."""
    # Booleans sum as 0/1, so this counts matching predictions without
    # building an intermediate list.
    hits = sum(classify_review(review, classifier) == expected_label for review in reviews)
    return hits / len(reviews)


positive_accuracy = _label_accuracy(test_positive_reviews, 'positive')
negative_accuracy = _label_accuracy(test_negative_reviews, 'negative')

print(positive_accuracy)
print(negative_accuracy)


0.7498436522826767
0.7792370231394622


In [57]:
# Save the trained classifier

In [58]:
# Persist the trained classifier so it can be reused without retraining.
# `pickle` is imported in the notebook's top import cell.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open('naive_classifier.pickle', 'wb') as file:
    pickle.dump(classifier, file)