In [1]:
# Initialise analyzer and demo

In [2]:
from nltk.sentiment import vader

In [3]:
# Single shared VADER analyzer instance; the demo cell and get_sentiment() both use it.
si_analyzer = vader.SentimentIntensityAnalyzer()  # requires nltk.download('vader_lexicon')

In [4]:
# Hand-picked probes: two emoticons, a plain statement, the same statement with
# an intensifier ("really"), three "but ..." variants (plain, "so", "not"),
# and one where a comma replaces "but".
test_strings = [
    ':D',
    ':/',
    'I like you',
    'I really like you',
    'I really like you but you are stupid',
    'I really like you but you are so stupid',
    'I really like you but you are not stupid',
    'I really like you, you are stupid',
]
# Print the full polarity dict (neg / neu / pos / compound) for every probe.
for statement in test_strings:
    print(si_analyzer.polarity_scores(statement))

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.5106}
{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.34}
{'neg': 0.0, 'neu': 0.286, 'pos': 0.714, 'compound': 0.3612}
{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.4201}
{'neg': 0.4, 'neu': 0.435, 'pos': 0.165, 'compound': -0.5724}
{'neg': 0.434, 'neu': 0.43, 'pos': 0.136, 'compound': -0.7313}
{'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'compound': 0.6768}
{'neg': 0.334, 'neu': 0.392, 'pos': 0.274, 'compound': -0.1548}


In [5]:
# Data import

In [6]:
# Read each corpus file into a list with one entry per line. Trailing
# newlines are kept, exactly as the file iterator yields them.
with open('data/rt-polarity.pos', encoding='latin-1') as positive_file:
    positive_reviews = list(positive_file)

with open('data/rt-polarity.neg', encoding='latin-1') as negative_file:
    negative_reviews = list(negative_file)

# Quick sanity check: peek at the first three entries of each list.
print(positive_reviews[:3], negative_reviews[:3])

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n', 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . \n', 'effective but too-tepid biopic\n'] ['simplistic , silly and tedious . \n', "it's so laddish and juvenile , only teenage boys could possibly find it funny . \n", 'exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . \n']


In [7]:
# Helper functions and some testing

In [8]:
def get_sentiment(statement):
    """Return the 'compound' entry of the VADER polarity scores for ``statement``.

    Uses the module-level ``si_analyzer``; the other entries of the polarity
    dict are discarded.
    """
    polarity = si_analyzer.polarity_scores(statement)
    return polarity['compound']

def get_accuracy(score_list, operator):
    """Return the fraction of scores for which ``operator(score, 0)`` is truthy.

    Parameters:
        score_list: sized collection of numeric scores.
        operator: two-argument callable (e.g. ``operator.gt``) invoked as
            ``operator(score, 0)``. Note that inside this function the
            parameter shadows the standard-library ``operator`` module.

    Raises:
        ZeroDivisionError: if ``score_list`` is empty.
    """
    # Count matches lazily instead of materialising a filtered list.
    hits = sum(1 for score in score_list if operator(score, 0))
    return hits / len(score_list)

In [9]:
# Score every review once up front; list order matches the source lists.
positive_review_scores = list(map(get_sentiment, positive_reviews))
negative_review_scores = list(map(get_sentiment, negative_reviews))

In [10]:
import operator
# A positive review counts as correct when its compound score is > 0, a
# negative review when it is < 0; a score of exactly 0 is a miss either way.
print(get_accuracy(positive_review_scores, operator.gt))
print(get_accuracy(negative_review_scores, operator.lt))

0.6946163946726693
0.40105045957606456


In [11]:
def get_highest_n(reviews, scoring_function=get_sentiment, n=10):
    """Return the ``n`` highest-scoring reviews as ``(score, review)`` tuples.

    Parameters:
        reviews: iterable of review texts.
        scoring_function: callable applied to each review to obtain its score.
        n: number of entries to keep from the top of the ranking.

    The result is ordered by score, highest first. Reviews with equal scores
    keep their relative input order because the sort is stable.
    """
    ranked = [(scoring_function(text), text) for text in reviews]
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked[:n]

# Show the negative-file reviews with the highest scores, using the
# get_highest_n defaults (get_sentiment scoring, n=10).
# Note: each `review` here is a (score, text) tuple, not just the text.
for review in get_highest_n(negative_reviews):
    print(review)

(0.9674, "it's inoffensive , cheerful , built to inspire the young people , set to an unending soundtrack of beach party pop numbers and aside from its remarkable camerawork and awesome scenery , it's about as exciting as a sunburn . \n")
(0.9511, "the plot's clearly mythic structure may owe more to disney's strong sense of formula than to the original story . but while the highly predictable narrative falls short , treasure planet is truly gorgeous to behold . \n")
(0.9501, 'the makers of divine secrets of the ya-ya sisterhood should offer a free ticket ( second prize , of course , two free tickets ) to anyone who can locate a genuinely honest moment in their movie . \n')
(0.9431, 'but buying into sham truths and routine " indie " filmmaking , freundlich has made just another safe movie . it\'s not horrible , just horribly mediocre . \n')
(0.9403, "it's mindless junk like this that makes you appreciate original romantic comedies like punch-drunk love . \n")
(0.9398, "the messages of c