# VADER for Sentiment Analysis

In [1]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import fastText
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Read test data
df = pd.read_csv('data/sst/sst_test.txt', sep='\t', header=None,
                   names=['truth', 'sentence'],
                  )
df['truth'] = df['truth'].str.replace('__label__', '')
df['truth'] = df['truth'].astype(int).astype('category')
print(df.dtypes)
df.head()

truth       category
sentence      object
dtype: object


Unnamed: 0,truth,sentence
0,3,Effective but too-tepid biopic
1,4,If you sometimes like to go to the movies to h...
2,5,"Emerges as something rare , an issue movie tha..."
3,3,The film provides some great insight into the ...
4,5,Offers that rare combination of entertainment ...


In [3]:
def print_accuracy(df, pred_column):
    "Print f1 score and accuracy after making predictions"
    f1_macro = f1_score(df['truth'], df[pred_column], average='macro')
    acc = accuracy_score(df['truth'], df[pred_column])*100
    return f1_macro, acc

## 1 - VADER

In [4]:
vader = SentimentIntensityAnalyzer()
def score_vader(sentence, vader):
    return vader.polarity_scores(sentence)['compound']

In [5]:
# Calculate Vader sentiment score
df['vader_score'] = df['sentence'].apply(lambda x: score_vader(x, vader))
# Convert float score to category based on binning
df['vader_pred'] = pd.cut(df['vader_score'], bins=5, labels=[1, 2, 3, 4, 5])
df = df.drop('vader_score', axis=1)
df.head()

Unnamed: 0,truth,sentence,vader_pred
0,3,Effective but too-tepid biopic,4
1,4,If you sometimes like to go to the movies to h...,5
2,5,"Emerges as something rare , an issue movie tha...",5
3,3,The film provides some great insight into the ...,5
4,5,Offers that rare combination of entertainment ...,4


In [6]:
# Get model accuracy and f1 score
acc = print_accuracy(df, 'vader_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.31297326018199634
Accuracy: 31.538461538461537


## 2 - TextBlob

In [7]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity

In [8]:
# Convert textblob sentiment score
df['textblob_score'] = df['sentence'].apply(textblob_score)
# Convert float score to category based on binning
df['textblob_pred'] = pd.cut(df['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
df = df.drop('textblob_score', axis=1)
df.head()

Unnamed: 0,truth,sentence,vader_pred,textblob_pred
0,3,Effective but too-tepid biopic,4,4
1,4,If you sometimes like to go to the movies to h...,5,4
2,5,"Emerges as something rare , an issue movie tha...",5,4
3,3,The film provides some great insight into the ...,5,4
4,5,Offers that rare combination of entertainment ...,4,3


In [9]:
# Get model accuracy and f1 score
acc = print_accuracy(df, 'textblob_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.2468141571266554
Accuracy: 28.3710407239819


## 3 - FastText

### 3.1 - Pretrained Yelp model

In [10]:
# Load fastText yelp review (5-class) trained model
ft_model = fastText.load_model('models/fasttext_yelp_review_full.ftz')

In [11]:
def fasttext_score(sentence):
    labels, probabilities = ft_model.predict(sentence, 1) # Predict just the top label, hence 1
    pred = int(labels[0][-1])
    return pred

In [12]:
df['fasttext_pred'] = df['sentence'].str.lower().apply(fasttext_score)
df.head()

Unnamed: 0,truth,sentence,vader_pred,textblob_pred,fasttext_pred
0,3,Effective but too-tepid biopic,4,4,3
1,4,If you sometimes like to go to the movies to h...,5,4,4
2,5,"Emerges as something rare , an issue movie tha...",5,4,5
3,3,The film provides some great insight into the ...,5,4,5
4,5,Offers that rare combination of entertainment ...,4,3,5


In [13]:
# Get model accuracy and f1 score
acc = print_accuracy(df, 'fasttext_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.3327246571108574
Accuracy: 33.167420814479634


### 3.2 - Train FastText on SST data

In [21]:
# We train using fastText's command line utility using the below command
!~/fastText-0.1.0/fasttext supervised -input data/sst/sst_train.txt -output models/sst -wordNgrams 3 -lr 0.5 -epoch 100

Read 0M words
Number of words:  18279
Number of labels: 5
Progress: 100.0%  words/sec/thread: 881292  lr: 0.000000  loss: 0.045543  eta: 0h0m -14m 1.411420  eta: 0h0m   loss: 1.218768  eta: 0h0m 29.6%  words/sec/thread: 879817  lr: 0.351924  loss: 0.149617  eta: 0h0m m %  words/sec/thread: 879638  lr: 0.175508  loss: 0.066052  eta: 0h0m 66.7%  words/sec/thread: 879662  lr: 0.166283  loss: 0.065111  eta: 0h0m   loss: 0.063804  eta: 0h0m 95.6%  words/sec/thread: 880907  lr: 0.021831  loss: 0.047272  eta: 0h0m 


In [22]:
ft_model = fastText.load_model('models/sst.bin')
df['fasttext_pred'] = df['sentence'].str.lower().apply(fasttext_score)
# Get model accuracy and f1 score
acc = print_accuracy(df, 'fasttext_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.3774494026440759
Accuracy: 40.13574660633484
