In [1]:
# Import dependencies

In [2]:
import operator


from nltk.corpus import stopwords, sentiwordnet as swn  # requires nltk.download('vader_lexicon'), nltk.download('wordnet') and nltk.download('stopwords')
from string import punctuation

In [3]:
# Data import

In [4]:
with open('data/rt-polarity.pos', encoding='latin-1') as positive_file:
    positive_reviews = positive_file.readlines()
    
with open('data/rt-polarity.neg', encoding='latin-1') as negative_file:
    negative_reviews = negative_file.readlines()

print(positive_reviews[:3], negative_reviews[:3])

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n', 'effective but too-tepid biopic\n'] ['simplistic , silly and tedious . \n', "it's so laddish and juvenile , only teenage boys could possibly find it funny . \n", 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . \n']


In [5]:
# Naive classifier

In [6]:
def naive_score(word):
    meaning = next(swn.senti_synsets(word), None)  # grab the most common meaning
    return meaning.pos_score() - meaning.neg_score() if meaning else 0
    
def naive_classifier(sentence):
    word_list = sentence.lower().split()
    return sum(naive_score(word) for word in word_list)

# helpers
def get_accuracy(score_list, comparator):
    return len([x for x in score_list if comparator(x, 0)]) / len(score_list)

def get_highest_n(reviews, scoring_function=naive_classifier, n=10):
    scored_reviews = ((scoring_function(review), review) for review in reviews)
    return sorted(scored_reviews, key=lambda x: x[0], reverse=True)[:n]

In [7]:
positive_review_scores = [naive_classifier(review) for review in positive_reviews]
negative_review_scores = [naive_classifier(review) for review in negative_reviews]

print(positive_review_scores[:5])
print(negative_review_scores[:5])

[1.25, 0.75, 0.125, 1.125, 1.125]
[-0.375, 1.0, -1.0, 0.125, -1.125]


In [8]:
print(get_accuracy(positive_review_scores, operator.gt))
print(get_accuracy(negative_review_scores, operator.lt))

0.639654848996436
0.43275182892515474


In [9]:
for review in get_highest_n(negative_reviews):
    print(review)

(3.0, "it's frustrating to see these guys -- who are obviously pretty clever -- waste their talent on parodies of things they probably thought were funniest when they were high . \n")
(3.0, 'a film that will be best appreciated by those willing to endure its extremely languorous rhythms , waiting for happiness is ultimately thoughtful without having much dramatic impact . \n')
(3.0, 'the movie is so resolutely cobbled together out of older movies that it even uses a totally unnecessary prologue , just because it seems obligatory . \n')
(2.875, 'this dramatically shaky contest of wills only reiterates the old hollywood saw : evil is interesting and good is boring . \n')
(2.875, "the plot's clearly mythic structure may owe more to disney's strong sense of formula than to the original story . but while the highly predictable narrative falls short , treasure planet is truly gorgeous to behold . \n")
(2.75, 'i wish windtalkers had had more faith in the dramatic potential of this true story 

In [10]:
# (Not) Improved classifier: no stopwords, consider all meanings

In [11]:
def improved_score(word):
    return sum(meaning.pos_score() - meaning.neg_score() for meaning in swn.senti_synsets(word))
    
def improved_classifier(sentence, blacklist):
    word_list = sentence.lower().split()
    return sum(naive_score(word) for word in word_list if word not in blacklist)

In [12]:
blacklist = set(stopwords.words('english') + list(punctuation))

In [13]:
improved_positive_review_scores = [improved_classifier(review, blacklist) for review in positive_reviews]
improved_negative_review_scores = [improved_classifier(review, blacklist) for review in negative_reviews]

print(improved_positive_review_scores[:5])
print(improved_negative_review_scores[:5])

[1.125, 0.625, 0.125, 1.0, 1.25]
[-0.375, 1.0, -0.875, 0.125, -1.125]


In [14]:
print(get_accuracy(improved_positive_review_scores, operator.gt))
print(get_accuracy(improved_negative_review_scores, operator.lt))
# this was not really a good idea...

0.6430313262052147
0.4211217407615832
