In [1]:
%load_ext autoreload
%autoreload 2

import random
import tweetnlp
import stanza
from tqdm.auto import tqdm
from pysentimiento import create_analyzer
from textblob import TextBlob
from datasets import load_dataset, ClassLabel
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sent_eval = load_dataset("SetFit/SentEval-CR")["test"]
sent_eval = sent_eval.rename_column("text", "sentence")

sent_eval = sent_eval.cast_column("label", ClassLabel(2, names=["negative", "positive"]))

benchmark_datasets = [
    ("sst2", load_dataset("stanfordnlp/sst2")["validation"]),
    ("financial_phrasebank", load_dataset("takala/financial_phrasebank", "sentences_66agree")["train"]),
    ("SentEval-CR", sent_eval),
]

#pysentimient + tweetnlp + stanza

model = tweetnlp.load_model('sentiment')  # Or `model = tweetnlp.Sentiment()`
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment', tokenize_no_ssplit=True)
vader = SentimentIntensityAnalyzer()

def pysentimiento_analyzer(dataset):
    analyzer = create_analyzer("sentiment", lang="en")
    id2label = dataset.features["label"].names

    outs = analyzer.predict(dataset["sentence"])

    if len(id2label) == 2:
        # Only positive/negative
        return ["negative" if x.probas["NEG"] > x.probas["POS"] else "positive" for x in outs]
    else:
        translation = {"NEU": "neutral", "POS": "positive", "NEG": "negative"}
        return [translation[x.output] for x in outs]

def stanza_analyzer(dataset):
    id2label = dataset.features["label"].names
    outs = nlp(dataset["sentence"])

    def _get_sentiment(x):
        if x.sentiment == 0:
            return "negative"
        elif x.sentiment == 2:
            return "positive"
        elif len(id2label) == 2:
            # Flip a coin
            if random.random() > 0.5:
                return "positive"
            else:
                return "negative"
        else:
            return "neutral"

    return [_get_sentiment(x) for x in outs.sentences]

def tweetnlp_analyzer(dataset):
    id2label = dataset.features["label"].names
    outs = model.predict(dataset["sentence"])
    def get_tweetnlp_sentiment(x):
        if x["label"] in {"positive", "negative"}:
            return x["label"]
        elif len(id2label) == 2:
            # Flip a coin
            if random.random() > 0.5:
                return "positive"
            else:
                return "negative"
        else:
            return "neutral"

    return [get_tweetnlp_sentiment(x) for x in outs]

def textblob_analyzer(dataset, threshold=0.1):
    id2label = dataset.features["label"].names
    outs = [TextBlob(x).sentiment.polarity for x in dataset["sentence"]]

    def get_textblob_sentiment(x):
        if len(id2label) == 2:
            if x > 0:
                return "positive"
            else:
                return "negative"
        else:
            if x > threshold:
                return "positive"
            elif x < -threshold:
                return "negative"
            else:
                return "neutral"

    return [get_textblob_sentiment(x) for x in outs]

def vader_analyzer(dataset):
    id2label = dataset.features["label"].names
    outs = [vader.polarity_scores(x) for x in dataset["sentence"]]

    def get_vader_sentiment(x):
        if len(id2label) == 2:
            if x["pos"] > x["neg"]:
                return "positive"
            else:
                return "negative"
        else:
            labels = ["neg", "neu", "pos"]

            # get argmax
            max_sent = max(range(len(labels)), key=lambda i: x[labels[i]])

            return id2label[max_sent]
    return [get_vader_sentiment(x) for x in outs]

analyzers = {
    "pysentimiento": pysentimiento_analyzer,
    "tweetnlp": tweetnlp_analyzer,
    "stanza": stanza_analyzer,
    "textblob": textblob_analyzer,
    "vader": vader_analyzer
}


2024-06-15 18:40:30,800	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-06-15 18:40:30,953	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Repo card metadata block was not found. Setting CardData to empty.
2024-06-15 18:40:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-06-15 18:40:51 INFO: Downloaded file to /users/jmperez/stanza_resources/resources.json
INFO:stanza:Downloaded file to /users/jmperez/stanza_resources/resources.json
2024-06-15 18:40:52 INFO: Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

INFO:stanza:Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | combined       |
| mwt       | combined       |
| sentiment | sstplus_charlm |

2024-06-15 18:40:52 INFO: Using device: cuda
INFO:stanza:Using device: cuda
2024-06-15 18:40:52 INFO: Loading: tokenize
INFO:stanza:Loading: tokenize
2024-06-15 18:40:52 INFO: Loading: mwt
INFO:stanza:Loading: mwt
2024-06-15 18:40:52 INFO: Loading: sentiment
INFO:stanza:Loading: sentiment
2024-06-15 18:40:53 INFO: Done loading processors!
INFO:stanza:Done loading processors!


In [2]:
from sklearn.metrics import classification_report

results = []

for ds_name, dataset in tqdm(benchmark_datasets):
    preds = {
        k: analyzer(dataset) for k, analyzer in analyzers.items()
    }
    id2label = dataset.features["label"].names
    label2id = {v: k for k, v in enumerate(id2label)}


    for name, pred in tqdm(list(preds.items())):
        print(name)
        true_labels = dataset["label"]
        pred_labels = [label2id[x] for x in pred]

        ret = classification_report(true_labels, pred_labels, target_names=id2label, output_dict=True)

        res = {
            "Model": name,
            "Dataset": ds_name,
            "Macro F1": ret["macro avg"]["f1-score"],
            "Macro Precision": ret["macro avg"]["precision"],
            "Macro Recall": ret["macro avg"]["recall"],
        }

        results.append(res)


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 872
  Batch size = 32


  0%|          | 0/5 [00:00<?, ?it/s]

pysentimiento
tweetnlp
stanza
textblob
vader


loading configuration file https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis/resolve/main/config.json from cache at /users/jmperez/.cache/huggingface/transformers/cb09766f7ba60b5f7a1bb640617b24f1499c4a6f3ab160c4a0ac171e3a377c68.7ee19fe8f9636b36c7013d97d34dd5c9302e13db4f1f04a011fe48ac1474b216
Model config RobertaConfig {
  "_name_or_path": "finiteautomata/bertweet-base-sentiment-analysis",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads":

Map:   0%|          | 0/4217 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4217
  Batch size = 32


  0%|          | 0/5 [00:00<?, ?it/s]

pysentimiento
tweetnlp
stanza
textblob
vader


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis/resolve/main/config.json from cache at /users/jmperez/.cache/huggingface/transformers/cb09766f7ba60b5f7a1bb640617b24f1499c4a6f3ab160c4a0ac171e3a377c68.7ee19fe8f9636b36c7013d97d34dd5c9302e13db4f1f04a011fe48ac1474b216
Model config RobertaConfig {
  "_name_or_path": "finiteautomata/bertweet-base-sentiment-analysis",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2

Map:   0%|          | 0/753 [00:00<?, ? examples/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 753
  Batch size = 32


  0%|          | 0/5 [00:00<?, ?it/s]

pysentimiento
tweetnlp
stanza
textblob
vader


In [3]:
benchmark_datasets[-1][-1].features["label"]

ClassLabel(names=['negative', 'positive'], id=None)

In [4]:
import pandas as pd

df = pd.DataFrame(results)

(df.set_index(["Dataset", "Model"]) * 100).round(2)[["Macro Precision", "Macro Recall", "Macro F1"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Macro Precision,Macro Recall,Macro F1
Dataset,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sst2,pysentimiento,87.99,87.93,87.95
sst2,tweetnlp,81.36,81.26,81.18
sst2,stanza,83.86,83.8,83.81
sst2,textblob,66.5,66.02,65.86
sst2,vader,67.17,66.85,66.77
financial_phrasebank,pysentimiento,75.04,64.52,68.24
financial_phrasebank,tweetnlp,73.21,54.83,58.83
financial_phrasebank,stanza,46.33,44.41,44.15
financial_phrasebank,textblob,46.69,43.97,44.85
financial_phrasebank,vader,33.37,33.35,25.13
