# Experiments with VADER

This notebook documents the initial experiments done with the VADER sentiment analysis tool. 
 
It also contains the code used to predict sentiment labels for the SemEval datasets.

In [1]:
import sys
sys.path.append('/home/rafael/Projects/vaderSentiment-master/')

import re
import os
import pandas as pd
import numpy as np

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import normalize

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [15]:
analyser = SentimentIntensityAnalyzer()

def compound_threshold(x, threshold=0.05):
    """Map a compound score to a sentiment label.

    Args:
        x: Score to classify.
        threshold: Width of the neutral band around zero.

    Returns:
        2 (positive) when ``x >= threshold``, 0 (negative) when
        ``x <= -threshold``, and 1 (neutral) for anything in between.
    """
    # The positive check comes first, so it wins if both bounds are satisfied.
    if x >= threshold:
        return 2
    if x <= -threshold:
        return 0
    return 1
    
def score_compound(text, threshold=0.05):
    """Predict a sentiment label for every sentence in ``text``.

    Each sentence is scored with the module-level ``analyser`` and its
    'compound' score is converted to a label by ``compound_threshold``.

    Args:
        text: Iterable of sentences (strings).
        threshold: Neutral-band width forwarded to ``compound_threshold``.

    Returns:
        A list of labels (0 negative, 1 neutral, 2 positive), one per sentence,
        in input order. Note that these are labels, not raw compound scores.
    """
    return [
        # Forward the caller's threshold; previously the default was always used.
        compound_threshold(analyser.polarity_scores(sentence)['compound'], threshold)
        for sentence in text
    ]

## Initial testing

In [3]:
# Probe: repeated "no" interjections placed before a positive statement.
sentence = "No, no, no. This guy is a good president."
analyser.polarity_scores(sentence)

{'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'compound': 0.4404}

In [4]:
# Probe: "no" used as a negator directly before "good".
sentence = "This guy is no good president."
analyser.polarity_scores(sentence)

{'neg': 0.325, 'neu': 0.675, 'pos': 0.0, 'compound': -0.3412}

In [5]:
# Probe: one positive clause followed by one negative clause, joined by "and".
sentence = "This guy is a good president and a bad person"
analyser.polarity_scores(sentence)

{'neg': 0.243, 'neu': 0.556, 'pos': 0.201, 'compound': -0.1531}

In [6]:
# Probe: the same two clauses as the previous cell in reverse order,
# to check whether clause order affects the scores.
sentence = "This guy is a bad person and a good president"
analyser.polarity_scores(sentence)

{'neg': 0.243, 'neu': 0.556, 'pos': 0.201, 'compound': -0.1531}

In [7]:
# Probe: one negative clause and two positive clauses, all joined by "and".
sentence = "This guy is a bad person and a good president and a good father"
analyser.polarity_scores(sentence)

{'neg': 0.172, 'neu': 0.542, 'pos': 0.286, 'compound': 0.3182}

In [8]:
# Probe: the same clauses as the previous cell, with "but" replacing the first "and".
sentence = "This guy is a bad person but a good president and a good father"
analyser.polarity_scores(sentence)

{'neg': 0.107, 'neu': 0.525, 'pos': 0.368, 'compound': 0.7543}

In [9]:
# Probe: positive wording inside a question (possibly meant sarcastically).
sentence = "This is what you call a good person?"
analyser.polarity_scores(sentence)

{'neg': 0.0, 'neu': 0.707, 'pos': 0.293, 'compound': 0.4404}

In [10]:
# Probe: a descriptive sentence that states no explicit opinion.
sentence = "In this rainy day I went to the pool."
analyser.polarity_scores(sentence)

{'neg': 0.14, 'neu': 0.86, 'pos': 0.0, 'compound': -0.0772}

## Scoring vader ground-truth dataset

In [12]:
# Load the tab-separated, header-less ground-truth file.
# NOTE(review): only two column names are given, yet df.head() shows an extra
# leading column used as the index, so the file presumably has three fields
# (id, rating, text) -- verify against the data file.
ground_truth_dataset_path = "~/Datasets/vader_data/tweets_GroundTruth.txt"
loaded_df = pd.read_csv(ground_truth_dataset_path, 
                            sep='\t',
                            header=None,
                            names=['sentiment', 'text']
                           )

In [13]:
# `df` is a second name for the same DataFrame object (no copy is made),
# so any in-place change to `df` also changes `loaded_df`.
df = loaded_df
df.head()

Unnamed: 0,sentiment,text
1,2.726316,Somehow I was blessed with some really amazing...
2,1.443299,Yay. Another good phone interview.
3,2.873684,We were 17 deep last night &amp; the love was ...
4,2.857143,"LMAO, AMAZING!"
5,-2.154639,Two words that should die this year: Sexting a...


In [14]:
df.shape

(4200, 2)

In [27]:
vader_text_ds = np.array(df['text'])

In [29]:
# Ground-truth sentiment values, bucketed into 0/1/2 labels with the default cut-off.
# NOTE(review): the cut-off (0.05) is the one used for compound scores, while the
# values shown by df.head() are on a wider scale -- confirm this is intended.
vader_labels_ds = np.array(df['sentiment'])
target = np.array([compound_threshold(x) for x in vader_labels_ds])

In [33]:
# Predicted label (0 negative, 1 neutral, 2 positive) for every tweet.
# score_compound already returns labels, so its output is compared with the
# ground truth directly; thresholding it a second time would remap the labels
# (0 -> 1, 1 -> 2) instead of classifying compound scores.
vader_scores_compound = score_compound(vader_text_ds)
label_score = vader_scores_compound

# Accuracy computation
accuracy_compound = accuracy_score(target, label_score)
print(accuracy_compound)

0.9264285714285714


In [40]:
# Per-class error breakdown for labels 0 (negative), 1 (neutral), 2 (positive).
# With (y_true, y_pred) argument order, rows are true labels and columns are predictions.
cm = confusion_matrix(target, label_score)
print('----------------Confusion matrix')
print(cm)
print()
# The section printed under this header is sklearn's classification report
# (per-class precision, recall, F1 and support).
print('----------------Confusion matrix report')
print(classification_report(target, label_score))

----------------Confusion matrix
[[1167   72   48]
 [   5   31    4]
 [  31  149 2693]]

----------------Confusion matrix report
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      1287
           1       0.12      0.78      0.21        40
           2       0.98      0.94      0.96      2873

    accuracy                           0.93      4200
   macro avg       0.69      0.87      0.70      4200
weighted avg       0.97      0.93      0.95      4200



## Classification of movie reviews with VADER

In [3]:
imdb_path = "/home/rafael/Datasets/IMDB"


def load_reviews(path):
    """Read a UTF-8 text file with one review per line.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line, stripped of surrounding whitespace.
    """
    # The context manager closes the file handle even if reading fails.
    with open(path, 'r', encoding='utf8') as reviews_file:
        return [line.strip() for line in reviews_file]


reviews_train = load_reviews(imdb_path + '/movie_data/full_train.txt')
reviews_test = load_reviews(imdb_path + '/movie_data/full_test.txt')


In [4]:
reviews_train[5]

"This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7.7/10 from...<br />

## 1. Simple data cleanup using regex

In [5]:
# Raw strings keep the regex escapes (\., \d, \s, ...) out of Python's own
# string-escape processing, which warns about unknown escapes such as "\.".
# Deleted outright: punctuation, quotes, brackets and runs of digits.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
# Replaced by one space: doubled <br /> tags, hyphens and slashes.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case each review and strip punctuation, digits and line-break tags.

    The deletion pattern is applied first and the space-substitution pattern
    second. Only doubled ``<br />`` tags are matched as tags; in a lone
    ``<br />`` just the slash is replaced by a space.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list of cleaned strings, in input order.
    """
    return [
        REPLACE_WITH_SPACE.sub(SPACE, REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()))
        for line in reviews
    ]

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [10]:
reviews_train_clean[5]

'this isnt the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robins new love of the thriller but this isnt a thriller per se this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until williams character gets close to achieving his goal i must say that i was highly entertained though this movie fails to teach guide inspect or amuse it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective in other words it felt real and i was able to subscribe to the premise of the story all in all its worth a watch though its definitely not friday saturday night fare it rates a   from the fiend '

The training set is balanced and ordered: the first 12.5k movie reviews express positive sentiment, and the remaining 12.5k express negative sentiment.

In [6]:
# Labels follow file order: 12500 positives (1) first, then 12500 negatives (0).
target = [1] * 12500 + [0] * 12500

In [7]:
len(reviews_train_clean)

25000

## 2. Sentiment classification using VADER tool

VADER takes raw sentences directly as input.

No complex text pre-processing is needed.

In [6]:
# NOTE(review): this import and instantiation repeat what the first cells of the
# notebook already do; they only matter if this section is run on its own.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# Score a single sentence; the 'compound' entry of the result is used in the next cell.
scores = analyser.polarity_scores("Today was a good day!!")

In [8]:
compound_threshold(scores['compound'])

2

In [9]:
def sentiment_analyzer_scores(sentence):
    """Print ``sentence`` next to the polarity scores VADER assigns to it.

    The sentence is left-aligned and padded with dashes to 40 characters,
    followed by the string form of the score dictionary.
    """
    polarity = analyser.polarity_scores(sentence)
    row_template = "{:-<40} {}"
    print(row_template.format(sentence, str(polarity)))

In [10]:
from sklearn.metrics import accuracy_score

In [11]:
def vader_predict_sentiments(X, threshold=0.05):
    """Predict binary sentiment labels with the module-level ``analyser``.

    Args:
        X: Iterable of sentences (strings).
        threshold: Minimum compound score for a sentence to count as
            positive. Defaults to 0.05, the value previously hard-coded.

    Returns:
        A list with 1 for every sentence whose compound score is
        ``>= threshold`` and 0 otherwise, in input order.
    """
    return [
        1 if analyser.polarity_scores(sentence)['compound'] >= threshold else 0
        for sentence in X
    ]

In [12]:
# Predict on the regex-cleaned training reviews and compare with the
# position-derived 0/1 labels in `target`.
vader_scores = vader_predict_sentiments(reviews_train_clean)
accuracy_clean = accuracy_score(target, vader_scores)

Accuracy with data clean up

In [13]:
print(accuracy_clean)

0.70028


In [14]:
# Same evaluation on the raw (uncleaned) training reviews, for comparison.
vader_scores_no_cleanup = vader_predict_sentiments(reviews_train)
accuracy_noclean = accuracy_score(target, vader_scores_no_cleanup)

Accuracy without data cleaning

In [15]:
print(accuracy_noclean)

0.69592


 ## SEMEVAL twitter sentiment analysis tasks

In [11]:
def load_semeval_tweets(file):
    """Read a tab-separated SemEval file into parallel tweet and label lists.

    Every non-blank line must hold the sentiment label in its first field and
    the tweet text in its second; any further fields are ignored.

    Args:
        file: Path of the TSV file to read.

    Returns:
        A ``(tweets, labels)`` tuple of equally long lists. Labels are encoded
        as 0 (negative), 1 (neutral) and 2 (positive).

    Raises:
        KeyError: If a label is not 'negative', 'neutral' or 'positive'.
        IndexError: If a non-blank line contains no tab separator.
    """
    tweets = []
    labels = []
    classes = {
        'negative':0,
        'neutral':1,
        'positive':2
    }

    # Explicit utf8 matches the other file reads in this notebook and avoids
    # depending on the platform's default encoding.
    with open(file, 'r', encoding='utf8') as input_file:
        for line in input_file:
            # Drop the line terminator so it does not end up inside the tweet text.
            line = line.rstrip('\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing one)
            sequences = line.split('\t')
            # Resolve both values before appending so the lists stay aligned on error.
            label = classes[sequences[0]]
            tweet = sequences[1]
            tweets.append(tweet)
            labels.append(label)
    return tweets, labels

def write_output(file, scores, labels):
    """Write one ``score<TAB>label`` line per prediction to ``file``.

    Args:
        file: Path of the output file; it is overwritten if it exists.
        scores: Iterable of predicted values.
        labels: Sequence indexed in step with ``scores``.
    """
    with open(file, 'w') as output_file:
        for position, predicted in enumerate(scores):
            # print() applies str() to both values and ends the line with '\n'.
            print(predicted, labels[position], sep='\t', file=output_file)
    

In [16]:
# Directory holding the input files test<year>.tsv, and the directory that
# receives the prediction files vader_results<year>.tsv.
semeval_test_path = '/home/rafael/Datasets/semeval/data/clean'
semeval_output_path = '/home/rafael/Datasets/semeval/results/fulldata'

semeval_years = ['2013', '2014','2015','2016','2017',]


# For every test set: load tweets and gold labels, predict a label per tweet,
# and write one "<prediction>\t<gold label>" line per tweet.
for year in semeval_years:
    
    test_file = semeval_test_path + '/test' + year + '.tsv'
    output_file = semeval_output_path + '/vader_results' + year + '.tsv'
    
    tweets, labels = load_semeval_tweets(test_file)
    
    # Sanity check: both counts should be identical.
    print('Tweets ',len(tweets))
    print('Labels ', len(labels))
    
    # 0.05 is the same value as score_compound's default threshold.
    scores = score_compound(tweets, 0.05)
    write_output(output_file, scores, labels)
    print('Output file generated for Semeval-{} testset with predictions for {} tweets.'.format(year, len(tweets)))


Tweets  3547
Labels  3547
Output file generated for Semeval-2013 testset with predictions for 3547 tweets.
Tweets  1853
Labels  1853
Output file generated for Semeval-2014 testset with predictions for 1853 tweets.
Tweets  2390
Labels  2390
Output file generated for Semeval-2015 testset with predictions for 2390 tweets.
Tweets  20632
Labels  20632
Output file generated for Semeval-2016 testset with predictions for 20632 tweets.
Tweets  12284
Labels  12284
Output file generated for Semeval-2017 testset with predictions for 12284 tweets.


#### The assessment of VADER's performance on SemEval2017 was conducted in another notebook.