In [3]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
#Download a corpus of pre-classified movie reviews

In [4]:
positiveReviewsFileName = "rt-polaritydata/rt-polaritydata/rt-polarity.pos"

In [5]:
with open(positiveReviewsFileName, 'r') as f: positiveReviews = f.readlines()

In [6]:
positiveReviews[0] 

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n'

In [7]:
len(positiveReviews)

5331

In [8]:
negativeReviewsFileName = "rt-polaritydata/rt-polaritydata/rt-polarity.neg"

In [9]:
with open(negativeReviewsFileName, 'r') as f: negativeReviews = f.readlines()

In [20]:
negativeReviews[0]

'simplistic , silly and tedious . \n'

In [21]:
len(negativeReviews)

5331

In [22]:
#Corpus of reviews is downloaded, let us apply vader to these reviews

In [10]:
from nltk.sentiment import vader

In [24]:
sia = vader.SentimentIntensityAnalyzer()

In [25]:
def VaderSentiment(review):
    return sia.polarity_scores(review)['compound']

In [27]:
review = "this is the best restaurant in the city."  #running a trial on the review

VaderSentiment(review)

0.6369

In [11]:
#Creating a function object to get score for reviews for all movies in the list at a stretch
def getReviewSentiments(sentimentCalculator):
    negReviewResult = [sentimentCalculator(oneNegativeReview) for oneNegativeReview in negativeReviews]
    posReviewResult = [sentimentCalculator(onePositiveReview) for onePositiveReview in positiveReviews]
    return {'results-on-positive': posReviewResult, 'result-on-negative': negReviewResult}

In [31]:
 VaderResults = getReviewSentiments(VaderSentiment)

In [33]:
len(VaderResults['result-on-negative'])

5331

In [34]:
VaderResults['result-on-negative'][0]

0.0258

In [12]:
def runDiagnostics(reviewResult):
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['result-on-negative']
    pctTruePositive = float(sum(x>0 for x in positiveReviewsResult))/len(positiveReviewsResult)
    pctTrueNegative = float(sum(x<0 for x in negativeReviewsResult))/len(negativeReviewsResult)
    totalAccurate = float(sum(x>0 for x in positiveReviewsResult)) + float(sum(x<0 for x in negativeReviewsResult))
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    Accuracy = float(totalAccurate)/total
    print 'Accuracy on positive reviews = '+'%.2f'% (pctTruePositive*100)+'%'
    print 'Accuracy on negative reviews = '+'%.2f'% (pctTrueNegative*100)+'%'
    print 'Overall Accuracy = '+'%.2f'% (Accuracy*100)+'%'

In [43]:
runDiagnostics(getReviewSentiments(VaderSentiment))

Accuracy on positive reviews = 69.44%
Accuracy on negative reviews = 40.09%
Overall Accuracy = 54.76%


In [45]:
#Overall accuracy in categorization of movie reviews using Vader is 54.76%
#now, we are going forward to build a similar rule-based sentiment analysis engine 
#using a Setinment lexicon - Sentiwordnet

In [13]:
from nltk.corpus import sentiwordnet as swn

In [47]:
#Checking the functioning of the sentiwordnet using a simple word dog

In [48]:
swn.senti_synsets('dog') #using senti_synsets method

[SentiSynset('dog.n.01'),
 SentiSynset('frump.n.01'),
 SentiSynset('dog.n.03'),
 SentiSynset('cad.n.01'),
 SentiSynset('frank.n.02'),
 SentiSynset('pawl.n.01'),
 SentiSynset('andiron.n.01'),
 SentiSynset('chase.v.01')]

In [49]:
swn.senti_synsets('dog')[3]

SentiSynset('cad.n.01')

In [51]:
swn.senti_synsets('dog')[3].pos_score() #finding the polarity using .pos_score() method

0.0

In [52]:
swn.senti_synsets('dog')[3].neg_score() #finding the polarity using .neg_score() method

1.0

In [53]:
#Using Sentiwordnet to implement a rule-based analyzer for movie reviews

In [74]:
def superNaiveSentiment(review):
    reviewPolarity = 0.0
    numExceptions = 0
    for word in review.lower().split():
        weight = 0.0
        try:
            common_meaning = swn.senti_synsets(word)[0]
            if common_meaning.pos_score() > common_meaning.neg_score():
                weight = weight + common_meaning.pos_score()
            elif common_meaning.pos_score() < common_meaning.neg_score():
                weight = weight - common_meaning.neg_score()
        except:
            numExceptions = numExceptions + 1
        #print "Word:" + word + "weight:" + str(weight)
        
        reviewPolarity = reviewPolarity + weight
        return reviewPolarity
        

In [75]:
runDiagnostics(getReviewSentiments(superNaiveSentiment))

Accuracy on positive reviews = 7.58%
Accuracy on negative reviews = 9.14%
Overall Accuracy = 8.36%


In [14]:
from string import punctuation
from nltk.corpus import stopwords

In [65]:
stopwords = set(stopwords.words('english') + list(punctuation))

In [66]:
list(punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [67]:
stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 u'a',
 u'about',
 u'above',
 u'after',
 u'again',
 u'against',
 u'ain',
 u'all',
 u'am',
 u'an',
 u'and',
 u'any',
 u'are',
 u'aren',
 u"aren't",
 u'as',
 u'at',
 u'be',
 u'because',
 u'been',
 u'before',
 u'being',
 u'below',
 u'between',
 u'both',
 u'but',
 u'by',
 u'can',
 u'couldn',
 u"couldn't",
 u'd',
 u'did',
 u'didn',
 u"didn't",
 u'do',
 u'does',
 u'doesn',
 u"doesn't",
 u'doing',
 u'don',
 u"don't",
 u'down',
 u'during',
 u'each',
 u'few',
 u'for',
 u'from',
 u'further',
 u'had',
 u'hadn',
 u"hadn't",
 u'has',
 u'hasn',
 u"hasn't",
 u'have',
 u'haven',
 u"haven't",
 u'having',
 u'he',
 u'her',
 u'here',
 u'hers',
 u'herself',
 u'him',
 u'himself',
 u'his',
 u'how',
 u'i',
 u'if',
 u'in',
 u'into',
 u'is',
 u'isn',
 u"isn't",
 u'it',
 u"it's",
 u'its',
 u'itself',
 u'just',
 u'll',
 u'm',
 u'ma',
 u'me',
 u'mi

In [70]:
def NaiveSentiment(review):
    reviewPolarity = 0.0
    numExceptions = 0
    for word in review.lower().split():
        numMeanings = 0
        if word in stopwords:
            continue
        weight = 0.0
        try:
            for meaning in swn.senti_synsets(word):
                if meaning.pos_score() > meaning.neg_score():
                    weight = weight + (meaning.pos_score() - meaning.neg_score())
                    numMeanings = numMeanings + 1
                elif common_meaning.pos_score() < common_meaning.neg_score():
                    weight = weight - (meaning.neg_score() - meaning.pos_score())
                    numMeanings = numMeanings + 1
        except:
            numExceptions = numExceptions + 1
        if numMeanings > 0:
            reviewPolarity = reviewPolarity + (weight/numMeanings)
    return reviewPolarity

In [76]:
runDiagnostics(getReviewSentiments(NaiveSentiment))

Accuracy on positive reviews = 87.04%
Accuracy on negative reviews = 0.00%
Overall Accuracy = 43.52%


In [77]:
#As we can see, this rule-based engine clearly does not work well for negative reviews.
#Now, we would use ML-based classifier to rectify this issue

In [15]:
def Sentiment(review):
    reviewPolarity = 0.0
    numExceptions = 0
    for word in review.lower().split():
        numMeanings = 0
        weight = 0.0
        try:
            for meaning in swn.senti_synsets(word):
                if meaning.pos_score() > meaning.neg_score():
                    weight = weight + (meaning.pos_score() - meaning.neg_score())
                    numMeanings = numMeanings + 1
                elif common_meaning.pos_score() < common_meaning.neg_score():
                    weight = weight - (meaning.neg_score() - meaning.pos_score())
                    numMeanings = numMeanings + 1
        except:
            numExceptions = numExceptions + 1
        if numMeanings > 0:
            reviewPolarity = reviewPolarity + (weight/numMeanings)
    return reviewPolarity

In [16]:
runDiagnostics(getReviewSentiments(Sentiment))

Accuracy on positive reviews = 90.98%
Accuracy on negative reviews = 0.00%
Overall Accuracy = 45.49%
