# Set up the VADER sentiment analyzer

In [1]:
# NLTK plus the VADER analyzer class; both are needed before any scoring.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon resource through NLTK's downloader.
nltk.download('vader_lexicon')

# Single analyzer instance shared by the scoring cells in this notebook.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\samen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Score a mildly positive sentence; the result is a dict with
# 'neg', 'neu', 'pos' and 'compound' entries.
msg = 'This was a good movie.'
msg_scores = sid.polarity_scores(msg)
msg_scores


{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [3]:
# Same kind of sentence, but with superlatives, capitals and exclamation
# marks, to compare against the score of the plainer sentence.
msg = 'This was the best, most awesome movie EVER MADE!!!'
msg_scores = sid.polarity_scores(msg)
msg_scores

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

# Use VADER to analyze Reviews

In [1]:
import numpy as np
import pandas as pd

# Google Drive sharing link for the reviews CSV.
url = 'https://drive.google.com/file/d/1kvoXXSuCz3LfyX9SpBlsZs-5nDO5Eypz/view?usp=sharing'

# The file id is the second-to-last path segment of the sharing link;
# plug it into the direct-download URL form that read_csv can fetch.
file_id = url.split('/')[-2]
path = f'https://drive.google.com/uc?export=download&id={file_id}'

df = pd.read_csv(path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
# Number of rows and columns in the loaded frame, shown as a (rows, columns) tuple.
n_rows, n_cols = df.shape
n_rows, n_cols

(4999, 2)

In [5]:
# How many reviews carry each sentiment label.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

negative    2531
positive    2468
Name: sentiment, dtype: int64

# Clean the data

In [6]:
# Drop rows with missing values. Rebinding `df` (instead of inplace=True)
# keeps this cell safe to re-run.
df = df.dropna()

# Flag reviews that are strings made up only of whitespace.
# The check is done on the 'review' column by name: df.itertuples() yields
# (index, review, sentiment) for this frame, so a positional three-name
# unpacking that takes the last field tests the sentiment label rather than
# the review text and never finds a blank review.
is_blank = (
    df['review']
    .map(lambda text: isinstance(text, str) and text.isspace())
    .astype(bool)
)

# Index labels of the blank reviews, kept under the original name `blanks`.
blanks = df.index[is_blank.to_numpy()].tolist()

df = df.drop(index=blanks)

# Adding Scores and Labels to the DataFrame

In [7]:
# Run the analyzer over every review; each cell of 'Scores' holds the
# dict returned by polarity_scores for that review.
df['Scores'] = df['review'].apply(sid.polarity_scores)

df.head()

Unnamed: 0,review,sentiment,Scores
0,One of the other reviewers has mentioned that ...,positive,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co..."
1,A wonderful little production. <br /><br />The...,positive,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co..."
2,I thought this was a wonderful way to spend ti...,positive,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co..."
3,Basically there's a family where a little boy ...,negative,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co..."


In [8]:
# Pull the 'compound' entry out of each score dict into its own column.
df['compound'] = [review_scores['compound'] for review_scores in df['Scores']]

df.head()

Unnamed: 0,review,sentiment,Scores,compound
0,One of the other reviewers has mentioned that ...,positive,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co...",-0.9951
1,A wonderful little production. <br /><br />The...,positive,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co...",0.9641
2,I thought this was a wonderful way to spend ti...,positive,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co...",0.9605
3,Basically there's a family where a little boy ...,negative,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co...",-0.9213
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co...",0.9744


In [9]:
# Label each review from its compound score: 'pos' when the score is
# greater than or equal to zero, 'neg' otherwise.
is_non_negative = df['compound'] >= 0
df['comp_score'] = is_non_negative.map({True: 'pos', False: 'neg'})

df.head()

Unnamed: 0,review,sentiment,Scores,compound,comp_score
0,One of the other reviewers has mentioned that ...,positive,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co...",-0.9951,neg
1,A wonderful little production. <br /><br />The...,positive,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co...",0.9641,pos
2,I thought this was a wonderful way to spend ti...,positive,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co...",0.9605,pos
3,Basically there's a family where a little boy ...,negative,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co...",-0.9213,neg
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co...",0.9744,pos


# Testing the model

In [10]:
# A hand-written review, given as one string (several sentences would also work).
review = 'The shoes I brought were amazing.'

# Score the review with the shared analyzer and display the result.
review_scores = sid.polarity_scores(review)
review_scores


{'neg': 0.0, 'neu': 0.513, 'pos': 0.487, 'compound': 0.5859}

In [11]:
# A hand-written negative review, with the negative words in capitals.
review = 'The mobile phone I bought was the WORST and very BAD'

# Score the review with the shared analyzer and display the result.
review_scores = sid.polarity_scores(review)
review_scores

{'neg': 0.539, 'neu': 0.461, 'pos': 0.0, 'compound': -0.8849}