# Sentiment Analysis

In [1]:
%load_ext autoreload
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import fastText
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Read test data
df = pd.read_csv('data/sst/sst_test.txt', sep='\t', header=None,
                   names=['truth', 'sentence'],
                  )
df['truth'] = df['truth'].str.replace('__label__', '')
df['truth'] = df['truth'].astype(int).astype('category')
print(df.dtypes)
df.head()

truth       category
sentence      object
dtype: object


Unnamed: 0,truth,sentence
0,3,Effective but too-tepid biopic
1,4,If you sometimes like to go to the movies to h...
2,5,"Emerges as something rare , an issue movie tha..."
3,3,The film provides some great insight into the ...
4,5,Offers that rare combination of entertainment ...


In [3]:
def print_accuracy(df, pred_column):
    "Print f1 score and accuracy after making predictions"
    f1_macro = f1_score(df['truth'], df[pred_column], average='macro')
    acc = accuracy_score(df['truth'], df[pred_column])*100
    return f1_macro, acc

## 1 - TextBlob

In [4]:
def textblob_score(sentence):
    return TextBlob(sentence).sentiment.polarity

In [5]:
# Convert textblob sentiment score
df['textblob_score'] = df['sentence'].apply(textblob_score)
# Convert float score to category based on binning
df['textblob_pred'] = pd.cut(df['textblob_score'], bins=5, labels=[1, 2, 3, 4, 5])
df = df.drop('textblob_score', axis=1)
df.head()

Unnamed: 0,truth,sentence,textblob_pred
0,3,Effective but too-tepid biopic,4
1,4,If you sometimes like to go to the movies to h...,4
2,5,"Emerges as something rare , an issue movie tha...",4
3,3,The film provides some great insight into the ...,4
4,5,Offers that rare combination of entertainment ...,3


In [6]:
# Get model accuracy and f1 score
acc = print_accuracy(df, 'textblob_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.2468141571266554
Accuracy: 28.3710407239819


## 2 - VADER

In [7]:
vader = SentimentIntensityAnalyzer()
def score_vader(sentence, vader):
    return vader.polarity_scores(sentence)['compound']

In [8]:
# Calculate Vader sentiment score
df['vader_score'] = df['sentence'].apply(lambda x: score_vader(x, vader))
# Convert float score to category based on binning
df['vader_pred'] = pd.cut(df['vader_score'], bins=5, labels=[1, 2, 3, 4, 5])
df = df.drop('vader_score', axis=1)
df.head()

Unnamed: 0,truth,sentence,textblob_pred,vader_pred
0,3,Effective but too-tepid biopic,4,4
1,4,If you sometimes like to go to the movies to h...,4,5
2,5,"Emerges as something rare , an issue movie tha...",4,5
3,3,The film provides some great insight into the ...,4,5
4,5,Offers that rare combination of entertainment ...,3,4


In [9]:
# Get model accuracy and f1 score
acc = print_accuracy(df, 'vader_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.31297326018199634
Accuracy: 31.538461538461537


## 3 - FastText

### 3.1 - Pretrained Yelp model

In [10]:
# Load fastText yelp review (5-class) trained model
ft_model = fastText.load_model('models/fasttext/fasttext_yelp_review_full.ftz')

In [11]:
def fasttext_score(sentence):
    labels, probabilities = ft_model.predict(sentence, 1) # Predict just the top label, hence 1
    pred = int(labels[0][-1])
    return pred

In [12]:
df['fasttext_pred'] = df['sentence'].str.lower().apply(fasttext_score)
df.head()

Unnamed: 0,truth,sentence,textblob_pred,vader_pred,fasttext_pred
0,3,Effective but too-tepid biopic,4,4,3
1,4,If you sometimes like to go to the movies to h...,4,5,4
2,5,"Emerges as something rare , an issue movie tha...",4,5,5
3,3,The film provides some great insight into the ...,4,5,5
4,5,Offers that rare combination of entertainment ...,3,4,5


In [13]:
# Get model accuracy and f1 score
acc = print_accuracy(df, 'fasttext_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.3327246571108574
Accuracy: 33.167420814479634


### 3.2 - Train FastText on SST data

In [14]:
# We train using fastText's command line utility using the below command
!~/fastText-0.1.0/fasttext supervised -input data/sst/sst_train.txt -output models/sst -wordNgrams 3 -lr 0.5 -epoch 100

Read 0M words
Number of words:  18279
Number of labels: 5
Progress: 100.0%  words/sec/thread: 873255  lr: 0.000000  loss: 0.060957  eta: 0h0m .2%  words/sec/thread: 875780  lr: 0.464096  loss: 0.789396  eta: 0h0m m 14.0%  words/sec/thread: 877877  lr: 0.429843  loss: 0.416243  eta: 0h0m 0h0m m 19.2%  words/sec/thread: 877510  lr: 0.404039  loss: 0.311001  eta: 0h0m 20.9%  words/sec/thread: 877575  lr: 0.395529  loss: 0.293991  eta: 0h0m 26.0%  words/sec/thread: 874859  lr: 0.369924  loss: 0.229703  eta: 0h0m 0m 0.192080  eta: 0h0m %  words/sec/thread: 873911  lr: 0.345752  loss: 0.178180  eta: 0h0m 33.8%  words/sec/thread: 870264  lr: 0.330814  loss: 0.158771  eta: 0h0m 872976  lr: 0.322727  loss: 0.156234  eta: 0h0m 38.6%  words/sec/thread: 869093  lr: 0.307001  loss: 0.146085  eta: 0h0m 39.8%  words/sec/thread: 869592  lr: 0.300972  loss: 0.141149  eta: 0h0m 0m   words/sec/thread: 867860  lr: 0.282846  loss: 0.127444  eta: 0h0m 47.8%  words/sec/thread: 867311  lr: 0.261128  loss: 0.1

In [15]:
ft_model = fastText.load_model('models/fasttext/sst.bin')
df['fasttext_pred'] = df['sentence'].str.lower().apply(fasttext_score)
# Get model accuracy and f1 score
acc = print_accuracy(df, 'fasttext_pred')
print("Macro F1-score: {}\nAccuracy: {}".format(acc[0], acc[1]))

Macro F1-score: 0.37359276004432385
Accuracy: 39.95475113122172
