# Get NLTK and Transformers sentiment scores on labelled dataset

*References:*
- https://huggingface.co/transformers/quicktour.html

In [1]:
import pandas as pd

### Read data

In [2]:
# Read datasets
amazon = pd.read_csv('../data/raw/uci-sentiment/amazon_cells_labelled.txt', sep='\t', names=['Text', 'GT'])
imdb = pd.read_csv('../data/raw/uci-sentiment/imdb_labelled.txt', sep='\t', names=['Text', 'GT'])
yelp = pd.read_csv('../data/raw/uci-sentiment/yelp_labelled.txt', sep='\t', names=['Text', 'GT'])
df = pd.concat([amazon, imdb, yelp])
display(df.shape)
df.head(3)

(2748, 2)

Unnamed: 0,Text,GT
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1


### NLTK sentiment analysis

In [3]:
%time
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

Wall time: 0 ns


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\srimal.jayawardena\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
%time

sia = SentimentIntensityAnalyzer()

display(sia.polarity_scores("Wow, NLTK is really powerful!"))
display(sia.polarity_scores("absolutely really bad"))

df['NLTK'] = df['Text'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
df.head(3)

Wall time: 0 ns


{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

{'neg': 0.671, 'neu': 0.329, 'pos': 0.0, 'compound': -0.6214}

Unnamed: 0,Text,GT,NLTK
0,So there is no way for me to plug it in here i...,0,-0.3535
1,"Good case, Excellent value.",1,0.8402
2,Great for the jawbone.,1,0.6249


### Transformers

In [5]:
from transformers import pipeline

In [6]:
%time

def tfScore(text, classifier):
    r = classifier(text)
    if r[0]['label'] == 'NEGATIVE':
        return -1.0 * r[0]['score']
    else:
        return r[0]['score']

    
tfSentiment = pipeline('sentiment-analysis')

display(tfScore('I feel horrible', tfSentiment))
display(tfScore('I feel awesome', tfSentiment))

df['Transformers'] = df['Text'].apply(lambda x: tfScore(str(x), tfSentiment))
df.head(3)

Wall time: 0 ns


HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /distilbert-base-uncased-finetuned-sst-2-english/resolve/main/config.json (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])")))


OSError: Can't load config for 'distilbert-base-uncased-finetuned-sst-2-english'. Make sure that:

- 'distilbert-base-uncased-finetuned-sst-2-english' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'distilbert-base-uncased-finetuned-sst-2-english' is the correct path to a directory containing a config.json file



## Save output

In [None]:
df.to_csv('../output/1-nltk-transformers.csv', index=False)