In [1]:
%load_ext autoreload
%autoreload 2

from pysentimiento import create_analyzer
from textblob import TextBlob
from datasets import load_dataset
import tweetnlp

model = tweetnlp.load_model('sentiment')  # Or `model = tweetnlp.Sentiment()`



benchmark_datasets = {
    "sentiment": ["stanfordnlp/sst2", "takala/financial_phrasebank"]
}

analyzer = create_analyzer("sentiment", lang="en")


ds = load_dataset(benchmark_datasets["sentiment"][0])

ds

2024-06-15 03:09:52,733	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-06-15 03:09:52,881	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [2]:
ds["test"].features

{'idx': Value(dtype='int32', id=None),
 'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None)}

In [3]:
from tqdm.auto import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import stanza
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=True)

vader = SentimentIntensityAnalyzer()



pysentimiento_outs = analyzer.predict(ds["validation"]["sentence"])
textblob_outs = [TextBlob(x).sentiment.polarity for x in tqdm(ds["validation"]["sentence"])]
vader_outs = [vader.polarity_scores(x) for x in tqdm(ds["validation"]["sentence"])]
stanza_outs = nlp(ds["validation"]["sentence"])

tweetnlp_outs = model.predict(ds["validation"]["sentence"])


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /users/jmperez/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: sentiment
INFO:stanza:Done loading processors!


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 872
  Batch size = 32


  0%|          | 0/872 [00:00<?, ?it/s]

  0%|          | 0/872 [00:00<?, ?it/s]

In [13]:
tweetnlp_outs[0]

{'label': 'positive'}

In [14]:
import random
preds = {}

# Get NEG or POS --ignore NEU
preds["pysentimiento"] = ["negative" if x.probas["NEG"] > x.probas["POS"] else "positive" for x in pysentimiento_outs]

preds["textblob"] = ["negative" if x < 0 else "positive" for x in textblob_outs]

preds["vader"] = ["negative" if x["neg"] > x["pos"] else "positive" for x in vader_outs]

def get_stanza_sentiment(x):
    if x.sentiment == 0:
        return "negative"
    elif x.sentiment == 2:
        return "positive"
    else:
        # Flip a coin
        if random.random() > 0.5:
            return "positive"
        else:
            return "negative"

preds["stanza"] = [get_stanza_sentiment(x) for x in stanza_outs.sentences]

def get_tweetnlp_sentiment(x):

    if x["label"] in {"positive", "negative"}:
        return x["label"]
    else:
        # Flip a coin
        if random.random() > 0.5:
            return "positive"
        else:
            return "negative"

preds["tweetnlp"] = [get_tweetnlp_sentiment(x) for x in tweetnlp_outs]

In [15]:
from sklearn.metrics import classification_report

id2label = ["negative", "positive"]
label2id = {label: i for i, label in enumerate(id2label)}


results = {}
for name, pred in preds.items():
    print(name)
    true_labels = ds["validation"]["label"]
    pred_labels = [label2id[x] for x in pred]

    ret = classification_report(true_labels, pred_labels, target_names=id2label, output_dict=True)

    res = {
        "Negative F1": ret["negative"]["f1-score"],
        "Positive F1": ret["positive"]["f1-score"],
        "Macro F1": ret["macro avg"]["f1-score"],
        "Macro Precision": ret["macro avg"]["precision"],
        "Macro Recall": ret["macro avg"]["recall"],
    }
    results[name] = res

pysentimiento
textblob
vader
stanza
tweetnlp


In [16]:
import pandas as pd

df = pd.DataFrame(results).T


df * 100

Unnamed: 0,Negative F1,Positive F1,Macro F1,Macro Precision,Macro Recall
pysentimiento,87.573964,88.320356,87.94716,87.990882,87.931506
textblob,50.909091,70.110701,60.509896,65.894397,62.418961
vader,51.88537,70.490287,61.187828,66.501553,62.998863
stanza,84.976526,85.650224,85.313375,85.322608,85.307738
tweetnlp,80.590238,80.1854,80.387819,80.472006,80.435927



## Financial Phrasebank

In [17]:


ds = load_dataset("takala/financial_phrasebank", "sentences_66agree")

ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4217
    })
})

In [18]:
id2label = ds["train"].features["label"].names

label2id = {label: i for i, label in enumerate(id2label)}

id2label, label2id

(['negative', 'neutral', 'positive'],
 {'negative': 0, 'neutral': 1, 'positive': 2})

In [19]:
pysentimiento_outs = analyzer.predict(ds["train"]["sentence"])
textblob_outs = [TextBlob(x).sentiment.polarity for x in tqdm(ds["train"]["sentence"])]
vader_outs = [vader.polarity_scores(x) for x in tqdm(ds["train"]["sentence"])]
stanza_outs = nlp(ds["train"]["sentence"])
tweetnlp_outs = model.predict(ds["train"]["sentence"])

Map:   0%|          | 0/4217 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4217
  Batch size = 32


  0%|          | 0/4217 [00:00<?, ?it/s]

  0%|          | 0/4217 [00:00<?, ?it/s]

In [20]:
import random
preds = {}

# Get NEG or POS --ignore NEU
pysent_mask = {"NEG": "negative", "POS": "positive", "NEU": "neutral"}

preds["pysentimiento"] = [pysent_mask[x.output] for x in pysentimiento_outs]

def get_textblob_sentiment(x, neutral_threshold=0.1):
    if x < -neutral_threshold:
        return "negative"
    elif x > neutral_threshold:
        return "positive"
    else:
        return "neutral"

preds["textblob"] = [get_textblob_sentiment(x) for x in textblob_outs]

def get_vader_sentiment(x):
    sents = ["neg", "neu", "pos"]

    # get argmax
    max_sent = max(range(len(sents)), key=lambda i: x[sents[i]])

    return id2label[max_sent]

preds["vader"] = [get_vader_sentiment(x) for x in vader_outs]


preds["stanza"] = [id2label[x.sentiment] for x in stanza_outs.sentences]

preds["tweetnlp"] = [x["label"] for x in tweetnlp_outs]

In [21]:
from sklearn.metrics import classification_report


results = {}
for name, pred in preds.items():
    print(name)
    true_labels = ds["train"]["label"]
    pred_labels = [label2id[x] for x in pred]

    ret = classification_report(true_labels, pred_labels, target_names=id2label, output_dict=True)

    res = {
        "Negative F1": ret["negative"]["f1-score"],
        "Positive F1": ret["positive"]["f1-score"],
        "Macro F1": ret["macro avg"]["f1-score"],
        "Macro Precision": ret["macro avg"]["precision"],
        "Macro Recall": ret["macro avg"]["recall"],
    }
    results[name] = res

pysentimiento
textblob
vader
stanza
tweetnlp


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
import pandas as pd

df = pd.DataFrame(results).T

df

Unnamed: 0,Negative F1,Positive F1,Macro F1,Macro Precision,Macro Recall
pysentimiento,0.637441,0.609615,0.682352,0.750389,0.645194
textblob,0.289238,0.377609,0.448512,0.466919,0.439714
vader,0.0,0.00341,0.251322,0.333713,0.33351
stanza,0.313076,0.324654,0.441454,0.463273,0.444095
tweetnlp,0.513834,0.466513,0.588266,0.73207,0.548265
