In [3]:
import numpy as np
import nltk

In [8]:
def get_review(path, positive=True):
    label = 1 if positive else 0
    
    with open(path, 'r', encoding='latin-1') as f:
        review_txt = f.readlines()
    
    reviews = [(text, label) for text in review_txt]
    return reviews

In [6]:
def extract_reviews():
    positive_reviews = get_review('movie_data/rt-polarity.pos', positive=True)
    negative_reviews = get_review('movie_data/rt-polarity.neg', positive=False)
    
    return positive_reviews, negative_reviews

In [9]:
positive_reviews, negative_reviews = extract_reviews()
print(len(positive_reviews), len(negative_reviews))

5331 5331


In [11]:
positive_reviews[:2]

[('the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n',
  1),
 ('the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n',
  1)]

In [13]:
TRAIN_DATA = 5000
TOTAL_DATA = len(positive_reviews)

train_reviews = positive_reviews[:TRAIN_DATA] + negative_reviews[:TRAIN_DATA]
test_positive_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA] 
test_negative_reviews = negative_reviews[TRAIN_DATA:TOTAL_DATA]


In [14]:
def get_vocabulary(train_reviews):
    words_set = set()
    for review, _ in train_reviews:
        words_set.update(review.split())
    
    return list(words_set)

In [15]:
vocabulary = get_vocabulary(train_reviews)
len(vocabulary)

20704

In [16]:
vocabulary[:5]

['underrated', 're-creation', 'lunar', 'plaintiveness', 'grand-scale']

In [17]:
def extract_features(review_text):
    review_words = set(review_text.split())
    features = {}
    for word in vocabulary:
        features[word] = (word in review_words)
    
    return features

In [18]:
train_features = nltk.classify.apply_features(extract_features, train_reviews)
trained_classifier = nltk.NaiveBayesClassifier.train(train_features) 

In [19]:
def sentiment_calculator(review_text):
    features = extract_features(review_text)
    return trained_classifier.classify(features)

In [20]:
sentiment_calculator('What an amazing movie!')

1

In [21]:
sentiment_calculator('What a terrible movie!')

0

In [22]:
def classify_test_reviews(pos_reviews, neg_reviews, sentiment_calculator):
    pos = [sentiment_calculator(review) for review, _ in pos_reviews]
    neg = [sentiment_calculator(review) for review, _ in neg_reviews]
    true_pos = sum(x > 0 for x in pos)
    true_neg = sum(x == 0 for x in neg)
    
    percent_true_pos = float(true_pos) / len(pos)
    percent_true_neg = float(true_neg) / len(neg)
    total_accurate = true_pos + true_neg
    total = len(pos) + len(neg)
    
    print("Accuracy on positive reviews = " + "%.2f" % (percent_true_pos * 100) + "%")
    print("Accuracy on negative reviews = " + "%.2f" % (percent_true_neg * 100) + "%")
    print("Overall accuracy = " + "%.2f" % (total_accurate * 100 / total) + "%")

In [23]:
classify_test_reviews(test_positive_reviews,test_negative_reviews,sentiment_calculator)

Accuracy on positive reviews = 78.25%
Accuracy on negative reviews = 80.66%
Overall accuracy = 79.46%


In [32]:
# IMDB dataset
import os
import tarfile
from six.moves import urllib
DOWNLODED_FILE_NAME = 'ImdbReviews.tar.gz'
def download_imdb_dataset(url):
    if not os.path.isfile(DOWNLODED_FILE_NAME):
        filename, _ = urllib.request.urlretrieve(url, DOWNLODED_FILE_NAME)

In [26]:
import re
TOKEN_REGEX = re.compile("[^A-Za-z0-9 ]+")
def get_reviews(data_dir, positive=True):
    label = 1 if positive else 0
    reviews = []
    
    for file in os.listdir(data_dir):
        if file.endswith(".txt"):
            with open(data_dir + file, 'r+') as f:
                review = f.read().decode('utf-8')
                review = review.lower().replace("<br />", " ")
                review = re.sub(TOKEN_REGEX, '', review)
                reviews.append((review, label))
    return reviews

def extract_reviews():
    if not os.path.exists('aclImdb'):
        with tarfile.open(DOWNLODED_FILE_NAME) as tar:
            tar.extractall()
            tar.close()
    
    pos = get_reviews('aclImdb/train/pos/', positive=True)
    neg = get_reviews('aclImdb/train/neg/', positive=False)
    
    return pos, neg


In [34]:
URL_PATH = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
download_imdb_dataset(URL_PATH)

In [35]:
positive_reviews, negative_reviews = extract_reviews()

KeyboardInterrupt: 