# Compute NLTK and Transformers sentiment scores on labelled dataset

### References
- Transformers: https://huggingface.co/transformers/quicktour.html

### Datasets
- UCI: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences



In [1]:
import pandas as pd

### Read data

In [2]:
# Read datasets

def readFile(filepath, source):
    df = pd.read_csv('../data/raw/uci-sentiment/amazon_cells_labelled.txt', sep="NO_SEPARATORS", names=['Data'])
    df['GT'] = df.Data.apply(lambda x: x[-1:])
    df['Text'] = df.Data.apply(lambda x: x[0: len(x)-1])
    df['Source'] = source
    return df[['Source', 'Text', 'GT']]

amazon = readFile('../data/raw/uci-sentiment/amazon_cells_labelled.txt','amazon')
imdb = readFile('../data/raw/uci-sentiment/imdb_labelled.txt', 'imdb')
yelp = readFile('../data/raw/uci-sentiment/yelp_labelled.txt', 'yelp')
df = pd.concat([amazon, imdb, yelp])

display(df.shape)
df.head(3)

  return func(*args, **kwargs)


(3000, 3)

Unnamed: 0,Source,Text,GT
0,amazon,So there is no way for me to plug it in here i...,0
1,amazon,"Good case, Excellent value.\t",1
2,amazon,Great for the jawbone.\t,1


### NLTK sentiment analysis

In [3]:
%%time
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

CPU times: user 729 ms, sys: 46.5 ms, total: 776 ms
Wall time: 781 ms


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/srimal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
%time

sia = SentimentIntensityAnalyzer()

display(sia.polarity_scores("Wow, NLTK is really powerful!"))
display(sia.polarity_scores("absolutely really bad"))

df['NLTK'] = df['Text'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
df.head(3)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.2 µs


{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

{'neg': 0.671, 'neu': 0.329, 'pos': 0.0, 'compound': -0.6214}

Unnamed: 0,Source,Text,GT,NLTK
0,amazon,So there is no way for me to plug it in here i...,0,-0.3535
1,amazon,"Good case, Excellent value.\t",1,0.8402
2,amazon,Great for the jawbone.\t,1,0.6249


### Transformers

In [5]:
from transformers import pipeline
import numpy as np

In [6]:
%%time

def tfScore(text, classifier):
    try:
        r = classifier(text)
        if r[0]['label'] == 'NEGATIVE':
            return -1.0 * r[0]['score']
        else:
            return r[0]['score']        
    except Exception as e:
        display(f'ERROR:')
        display(f'- Text:  "{text}"')
        display(f'- Exception:  "{e}"')
        return np.nan
        

def getTFScore(text, classifier):
    score = tfScore(text, classifier)
    return score

    
tfSentiment = pipeline('sentiment-analysis')

display(getTFScore('I feel horrible', tfSentiment))
display(getTFScore('I feel awesome', tfSentiment))
display(getTFScore('1', tfSentiment))


-0.9996577501296997

0.9998730421066284

0.9854032397270203

CPU times: user 1.15 s, sys: 308 ms, total: 1.46 s
Wall time: 12.2 s


In [7]:
%%time
df['Transformers'] = df['Text'].apply(lambda x: getTFScore(str(x), tfSentiment))


CPU times: user 3min 22s, sys: 1.71 s, total: 3min 24s
Wall time: 1min 42s


### Inspect text length

In [8]:
df['TextLength'] = df.Text.apply(lambda x: len(x))

## Save output

In [12]:
df.to_csv('../output/1-nltk-transformers.csv', index=False)