In [49]:
import nltk
from nltk.sentiment import vader

### Using vader library for the sentiment analysis

In [50]:
sia = vader.SentimentIntensityAnalyzer()

In [51]:
sia.polarity_scores("What a terrible restaurant!!")

{'neg': 0.648, 'neu': 0.352, 'pos': 0.0, 'compound': -0.5696}

In [52]:
sia.polarity_scores("terrible")

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.4767}

In [53]:
sia.polarity_scores(":)")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4588}

In [54]:
sia.polarity_scores("the cumin was the kiss of death")

{'neg': 0.5, 'neu': 0.5, 'pos': 0.0, 'compound': -0.6124}

In [55]:
sia.polarity_scores("the food was good")

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [56]:
sia.polarity_scores("the food was good!!")

{'neg': 0.0, 'neu': 0.463, 'pos': 0.537, 'compound': 0.5399}

In [57]:
sia.polarity_scores("the food was not good!!")

{'neg': 0.428, 'neu': 0.572, 'pos': 0.0, 'compound': -0.457}

In [58]:
sia.polarity_scores("the food was not the worst!!")

{'neg': 0.0, 'neu': 0.563, 'pos': 0.437, 'compound': 0.5964}

In [59]:
sia.polarity_scores("the food was GOOD ")

{'neg': 0.0, 'neu': 0.452, 'pos': 0.548, 'compound': 0.5622}

In [60]:
sia.polarity_scores("I usually hate seafood but I liked this")

{'neg': 0.234, 'neu': 0.398, 'pos': 0.368, 'compound': 0.3291}

In [61]:
sia.polarity_scores("I usually hate seafood and I liked this")

{'neg': 0.352, 'neu': 0.381, 'pos': 0.267, 'compound': -0.2263}

### Loading positive and negative reviews

In [62]:
positiveReviewsFileName = "rt-polarity.pos"
with open (positiveReviewsFileName, 'r') as f:
    positiveReviews = f.readlines()

In [63]:
negativeReviewsFileName = "rt-polarity.neg"
with open (negativeReviewsFileName, 'r') as f:
    negativeReviews = f.readlines()

In [64]:
len(positiveReviews)

5331

In [65]:
len(negativeReviews)

5331

### Function for the sentiment analysis using vader

In [66]:
sia = vader.SentimentIntensityAnalyzer()
def vaderSentiment(review):
    return sia.polarity_scores(review)['compound']

In [67]:
review = "this is the best restaurant in the city"
vaderSentiment(review)

0.6369

### Printing negative reviews

In [68]:
[vaderSentiment(oneNegativeReview) for oneNegativeReview in negativeReviews]

[0.0258,
 0.4404,
 0.0,
 -0.25,
 0.0,
 0.4939,
 0.0,
 0.0,
 -0.34,
 -0.3612,
 -0.3678,
 0.397,
 -0.0384,
 -0.836,
 0.3818,
 -0.2565,
 0.4404,
 0.4199,
 0.0772,
 0.0,
 0.7346,
 -0.3559,
 0.2732,
 -0.0516,
 0.4939,
 0.4019,
 -0.5423,
 -0.8887,
 0.6068,
 -0.296,
 0.0772,
 0.0,
 0.5267,
 0.4939,
 -0.7845,
 -0.5865,
 0.0258,
 -0.2457,
 -0.5789,
 0.0,
 -0.25,
 -0.6808,
 0.4588,
 0.5574,
 0.802,
 -0.4767,
 0.6124,
 -0.4767,
 -0.7579,
 0.0,
 -0.5562,
 0.0516,
 0.6369,
 -0.4767,
 -0.5574,
 0.4404,
 0.8658,
 0.0,
 0.3477,
 0.5574,
 -0.8591,
 -0.5574,
 -0.5994,
 0.128,
 0.1154,
 0.34,
 0.2509,
 0.4404,
 -0.4767,
 0.0,
 0.0,
 -0.144,
 0.4215,
 0.0,
 0.2846,
 -0.5267,
 0.0,
 -0.0258,
 -0.2235,
 -0.4824,
 0.5095,
 -0.4215,
 0.4402,
 0.4019,
 -0.7269,
 0.0,
 0.875,
 0.4767,
 -0.3239,
 0.1779,
 0.8497,
 0.4404,
 0.0,
 0.3612,
 0.802,
 -0.1263,
 -0.3612,
 0.5994,
 0.0,
 0.0,
 -0.4391,
 -0.6369,
 0.34,
 0.3612,
 0.0,
 0.7715,
 -0.5096,
 -0.1263,
 -0.3612,
 -0.835,
 0.2732,
 -0.4019,
 -0.8201,
 -0.6486,


### Function that gives a dictionary with negatives and positives reviews

In [69]:
def getReviewSentiments(sentimentCalculator):
    negReviewResult = [sentimentCalculator(oneNegativeReview) for oneNegativeReview in negativeReviews]
    posReviewResult = [sentimentCalculator(onePositiveReview) for onePositiveReview in positiveReviews]
    return{'results-on-positive': posReviewResult, 'results-on-negative':negReviewResult}

In [70]:
vaderResults = getReviewSentiments(vaderSentiment)

In [71]:
vaderResults.keys()

dict_keys(['results-on-positive', 'results-on-negative'])

### Percentage of positives over negatives

In [72]:
pctTruePositive = float(sum(x > 0 for x in vaderResults['results-on-positive']))/len(vaderResults['results-on-positive'])

### Percentage of negatives over positives

In [73]:
pctTrueNegative = float(sum(x > 0 for x in vaderResults['results-on-negative']))/len(vaderResults['results-on-negative'])

In [74]:
positiveReviewsResult = vaderResults['results-on-positive']
negativeReviewsResult = vaderResults['results-on-negative']

### Printing overall accuracy

In [75]:
total_Accurate = float(sum(x > 0 for x in positiveReviewsResult)) + float(sum(x < 0 for x in negativeReviewsResult))
total = len(positiveReviewsResult) + len(negativeReviewsResult)
print("Overall accuracy = " + "%2.f" % (total_Accurate * 100/total) + "%")

Overall accuracy = 55%


### Function that gives us the accuracies of our model using vader

In [76]:
def runDiagnostics(reviewSentiments):
    positiveReviewsResult = reviewSentiments['results-on-positive']
    negativeReviewsResult = reviewSentiments['results-on-negative']
    
    pctTruePositive = float(sum(x > 0 for x in positiveReviewsResult))/len(positiveReviewsResult)
    pctTrueNegative = float(sum(x > 0 for x in negativeReviewsResult))/len(negativeReviewsResult)
    
    totalAccurate = float(sum(x > 0 for x in positiveReviewsResult)) + float(sum(x > 0 for x in negativeReviewsResult))
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    
    print("Accuracy on positive reviews = " + "%.2f" % (pctTruePositive*100) + "%")
    print("Accuracy on negative reviews = " + "%.2f" % (pctTrueNegative*100) + "%")
    print("Overall accuracy = " + "%.2f" % (totalAccurate*100/total) + "%")

In [77]:
runDiagnostics(getReviewSentiments(vaderSentiment))

Accuracy on positive reviews = 69.44%
Accuracy on negative reviews = 42.26%
Overall accuracy = 55.85%


### Sentiwordnet for different meanings of a word (negative/positive) we are going to use the most common in our function ("[0]")

In [78]:
from nltk.corpus import sentiwordnet as swn

In [79]:
swn.senti_synsets('dog')

<filter at 0x1944e82d700>

In [80]:
list(swn.senti_synsets('dog'))[3]

SentiSynset('cad.n.01')

In [81]:
list(swn.senti_synsets('dog'))[3].neg_score()

1.0

In [82]:
from nltk.corpus import sentiwordnet as swn

### Function that uses the score of the most common meaning of the word. It is going to depend of the positive or negative score, the one that is higher, is the one we are going to use.

In [94]:
def superNaiveSentiment(review):
    reviewPolarity = 0.0
    numExceptions = 0
    for word in review.lower().split():
        weight = 0.0
        try:
            common_meaning = list(swn.senti_synsets(word))[0]
            if common_meaning.pos_score() > common_meaning.neg_score():
                weight = weight + common_meaning.pos_score()
            elif common_meaning.pos_score() < common_meaning.neg_score():
                weight = weight - common_meaning.neg_score()
        except:
            numExceptions = numExceptions + 1
        reviewPolarity = reviewPolarity + weight
    return reviewPolarity          

In [95]:
runDiagnostics(getReviewSentiments(superNaiveSentiment))

Accuracy on positive reviews = 65.13%
Accuracy on negative reviews = 48.30%
Overall accuracy = 56.72%


### Defining our own stopwords.

In [96]:
from string import punctuation
from nltk.corpus import stopwords

In [97]:
stopwords = set(stopwords.words('english') + list(punctuation))

In [98]:
list(punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [88]:
stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

### Improved function that selects the best meaning of the word that could be positive or negative.

In [89]:
def naiveSentiment(review):
    reviewPolarity = 0.0
    numExceptions = 0
    for word in review.lower().split():
        numMeanings = 0
        if word in stopwords:
            continue
        weight = 0.0
        try:
            for meaning in swn.senti_synsets(word):                  
                if meaning.pos_score() > meaning.neg_score():
                    weight = weight + (meaning.pos_score() - meaning.neg_score())
                    numMeanings += 1
                elif meaning.pos_score() < meaning.neg_score():
                    weight = weight - (meaning.neg_score() - meaning.pos_score())
                    numMeanings += 1
        except:
            numExceptions += 1        
        if numMeanings > 0:            
            reviewPolarity += (weight/numMeanings)
    return reviewPolarity

In [90]:
runDiagnostics(getReviewSentiments(naiveSentiment))

Accuracy on positive reviews = 75.56%
Accuracy on negative reviews = 56.05%
Overall accuracy = 65.80%
