In [1]:
import numpy as np
from datasets import load_dataset
from transformers import pipeline
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Hub identifier of the sentiment checkpoint; reused for the model and its tokenizer
model_path = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Build the inference pipeline from that checkpoint.
pipe = pipeline(
    tokenizer=model_path,
    model=model_path,
    return_all_scores=True,  # return a score for every label, not only the best one
    # device='cuda:0'
)

# Load the Rotten Tomatoes dataset that is scored and evaluated below
data = load_dataset('rotten_tomatoes')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [4]:
# Run the sentiment pipeline over every test review and reduce each result to a
# binary prediction: 0 when the negative score wins, 1 when the positive score wins.
yPred = []
keyDataset = KeyDataset(data['test'], 'text')  # feeds only the 'text' column to the pipeline
piped = pipe(keyDataset)
for output in tqdm(piped, total=len(data['test'])):
    # `output` is a list of {'label': ..., 'score': ...} dicts, one per class.
    # Index the scores by label name rather than by list position, so the result
    # stays correct even if the pipeline returns the entries in a different order
    # (e.g. sorted by score). A missing label raises KeyError instead of silently
    # producing wrong predictions.
    scores = {entry['label'].lower(): entry['score'] for entry in output}
    negative_score = scores['negative']
    positive_score = scores['positive']
    # Any other label the model reports (e.g. a neutral class) is ignored;
    # ties resolve to 0 (negative) because argmax returns the first maximum.
    assignment = np.argmax([negative_score, positive_score])
    yPred.append(assignment)

100%|██████████| 1066/1066 [00:38<00:00, 27.67it/s]


In [6]:
from sklearn.metrics import classification_report

# build the classification report and print it
def evaluate_performance(yTrue, yPred):
    """Print a classification report for predictions against true labels.

    Args:
        yTrue: ground-truth labels.
        yPred: predicted labels, in the same order as ``yTrue``.

    The two classes are shown as 'Negative Review' and 'Positive Review'.
    The report is written to stdout; nothing is returned.
    """
    class_names = ['Negative Review', 'Positive Review']
    print(classification_report(yTrue, yPred, target_names=class_names))

evaluate_performance(yTrue=data['test']['label'], yPred=yPred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



In [7]:
# Display the test split's summary (its feature names and number of rows)
data['test']

Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})