In [1]:
import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pprint import pprint
from torch.nn.functional import softmax
import numpy as np
from tqdm.notebook import tqdm
import torch

In [3]:
# Read the hate-speech entries CSV (path is relative to the notebook's
# working directory) and wrap it in a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset columns.
df = pd.read_csv(
    "./2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv"
)
ds = Dataset.from_pandas(
    df,
    preserve_index=False,
)
ds

Dataset({
    features: ['Unnamed: 0', 'id', 'text', 'label', 'type', 'model_wrong', 'db.model_preds', 'status', 'round', 'split', 'annotator'],
    num_rows: 40623
})

In [6]:
def introduce_toxic_labels(ex_batch):
    """Convert a batch of raw rows into text plus a binary ``toxic`` flag.

    Args:
        ex_batch: Mapping of column name to a list of values for one batch.
            Only the ``"text"`` and ``"label"`` columns are read.

    Returns:
        dict with two equally long lists:
            ``"text"``  - a copy of the input texts, and
            ``"toxic"`` - 1 where the label equals ``"hate"`` (compared
            case-insensitively), otherwise 0.
    """
    texts = list(ex_batch["text"])
    # int(True) == 1 and int(False) == 0, so this yields plain 0/1 integers.
    toxic_flags = [int(label.lower() == "hate") for label in ex_batch["label"]]
    return {"text": texts, "toxic": toxic_flags}

In [7]:
# Replace every raw column with just "text" and the binary "toxic" label.
# The guard makes this cell safe to re-run: it rebinds `ds`, and after the
# first run the "label" column that introduce_toxic_labels reads is gone, so
# mapping a second time would fail.
if "toxic" not in ds.column_names:
    ds=ds.map(
        introduce_toxic_labels,
        batched=True,
        remove_columns=ds.column_names
    )

Map:   0%|          | 0/40623 [00:00<?, ? examples/s]

In [8]:
# Checkpoint identifier handed to both from_pretrained calls below.
checkpoint = "IMSyPP/hate_speech_en"

# Tokenizer and classifier come from the same checkpoint so they stay in sync.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    device_map="auto",
)

In [10]:
def test_model(ds, tokenizer, model):
    true_positive=0
    true_negative=0
    false_positive=0
    false_negative=0
    total_count=0
    
    model.eval()

    with torch.inference_mode():
        for ex in tqdm(ds):
            tokens=tokenizer(ex["text"], return_tensors="pt").to(model.device)
            op=model(**tokens)
            logits=op.logits
            prob=softmax(logits, dim=-1)
            pred=np.argmax(prob.cpu().detach().numpy(), axis=-1)[0]

            if(pred==0 and ex["toxic"]==0):
                true_negative+=1
            elif (pred==0 and ex["toxic"]==1):
                false_negative+=1
            elif (pred!=0 and ex["toxic"]==0):
                false_positive+=1
            elif (pred!=0 and ex["toxic"]==1):
                true_positive+=1
            total_count+=1

    precision=true_positive/(true_positive+false_positive)
    recall=true_positive/(true_positive+false_negative)
    f1=(2*precision*recall)/(precision+recall)
    accuracy=(true_positive+true_negative)/(true_positive+false_positive+true_negative+false_negative)
    
    return {
        "no of examples tested": total_count,
        "accuracy": f"{accuracy*100:4f}%",
        "precision": f"{precision*100:4f}%",
        "recall": f"{recall*100:4f}%",
        "f1 score": f"{f1*100:4f}%"
    }

In [11]:
# Run the evaluation over the relabelled dataset and keep the metrics dict.
results = test_model(
    ds=ds,
    tokenizer=tokenizer,
    model=model,
)

  0%|          | 0/40623 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (528) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# Display the metrics dictionary returned by test_model.
results