In [129]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install simpletransformers transformers[torch] sentencepiece rouge evaluate

Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
"""
Script to evaluate T5 models

Baseline -
1. Majority Voting
2. Hugchat
3. Hugchat (toxic vs non-toxic)

Multitask models
1. T5 Multitask

Classification Models
1. T5 torch + transformers : i/p = sentence o/p = hate speech class
2. T5 simpletransformers : i/p = sentence o/p = hate speech class
3. T5 torch + transformers : i/p = sentence + explanations from T5 model, o/p = hate speech class
4. T5 simpletransformers : i/p = sentence + explanations from T5 model, o/p = hate speech class
5. T5 torch + transformers : i/p = sentence + explanations + keywords from T5 model, o/p = hate speech class
6. T5 simpletransformers : i/p = sentence + explanations + keywords from T5 model, o/p = hate speech class

Text Generation Models
1. T5 for explanation
2. T5 for keywords

Metrics for Classification
1. Standard Metrics - Accuracy, Macro F1, Macro Precision, Macro Recall, AUROC
2. Bias based Metrics - Generalized Mean of Bias (GMB) Subgroup AUC, GMB BPSN AUC, GMB BNSP AUC

Metrics for evaluating Explanations
1. ROUGE, BLEU

Specific metrics for evaluating keywords
1. Explanation based metrics: Plausability (IOU F1, Token F1, AUPRC), Comprehensiveness and Faithfulness
"""



In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, roc_auc_score
import simpletransformers
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
from simpletransformers.t5 import T5Model
import datetime
import time
from datasets import load_dataset
import math
import evaluate
from rouge import Rouge

In [3]:
### Load all datasets

## Load original test data from huggingface
hatexp_dataset = load_dataset('hatexplain')
hatexp_test = pd.DataFrame(hatexp_dataset['test'])

## data with communities targeted
all_feat_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/bert_modeling/"
train_feat_path = all_feat_path + "df_train.csv"
val_feat_path = all_feat_path + "df_val.csv"
test_feat_path = all_feat_path + "df_test.csv"

## base data
base_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/"
train_base_path = base_path + "df_train.csv"
val_base_path = base_path + "df_val.csv"
test_base_path = base_path + "df_test.csv"


## explanations
exp_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5-for-explanation/"
train_explanations_path =  exp_path + "df_train_pred_exp.csv"
val_explanations_path =  exp_path + "df_val_pred_exp.csv"
test_explanations_path =  exp_path + "df_test_pred_exp.csv"

## keywords
kws_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_for_keywords/"
train_kw_path = kws_path + "df_train_pred_kw.csv"
val_kw_path = kws_path + "df_val_pred_kw.csv"
test_kw_path = kws_path + "df_test_pred_kw.csv"


## Read base data
df_train_base = pd.read_csv(train_base_path).astype(str)
df_val_base = pd.read_csv(val_base_path).astype(str)
df_test_base = pd.read_csv(test_base_path).astype(str)


## Read explanations data
df_train_exp = pd.read_csv(train_explanations_path)
df_val_exp = pd.read_csv(val_explanations_path)
df_test_exp = pd.read_csv(test_explanations_path)


## Read keywords data
df_train_kw = pd.read_csv(train_kw_path)
df_val_kw = pd.read_csv(val_kw_path)
df_test_kw = pd.read_csv(test_kw_path)


## Check shapes
print("Base data")
print("Train: ", df_train_base.shape)
print("Val: ", df_val_base.shape)
print("Test: ", df_test_base.shape)


print("Explanations: ")
print("Train: ", df_train_exp.shape)
print("Val: ", df_val_exp.shape)
print("Test: ", df_test_exp.shape)

print("Keywords: ")
print("Train: ", df_train_kw.shape)
print("Val: ", df_val_kw.shape)
print("Test: ", df_test_kw.shape)

Downloading builder script:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading and preparing dataset hatexplain/plain_text to /root/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/15383 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1924 [00:00<?, ? examples/s]

Dataset hatexplain downloaded and prepared to /root/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Base data
Train:  (56273, 5)
Val:  (7147, 5)
Test:  (7042, 5)
Explanations: 
Train:  (14057, 5)
Val:  (1786, 5)
Test:  (1759, 5)
Keywords: 
Train:  (14072, 5)
Val:  (1787, 5)
Test:  (1761, 5)


In [4]:
### Model args for Simpletransformer T5
model_args_cls = {
    "overwrite_output_dir": True,
    "max_seq_length": 250,
    "eval_batch_size": 32,
    "use_multiprocessing": False,
    "num_beams": 3,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

model_args_multitask = {
    "overwrite_output_dir": True,
    "max_seq_length": 196,
    "eval_batch_size": 32,
    "use_multiprocessing": False,
    "num_beams": 1,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

model_args_gen = {
    "overwrite_output_dir": True,
    "max_seq_length": 250,
    "eval_batch_size": 32,
    "use_multiprocessing": False,
    "num_beams": 3,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}


In [20]:
## Load all models

## Classification models

# 1. T5 torch + transformers : i/p = sentence o/p = hate speech class
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_v2/"
t5_tt_1_model = T5ForConditionalGeneration.from_pretrained('t5-base').cuda()  # to GPU
t5_tt_1 = t5_tt_1_model.load_state_dict(torch.load(model_path + 't5-classification.pt'))

# 2. T5 simpletransformers : i/p = sentence o/p = hate speech class
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent/"
t5_st_1 = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)

# 3. T5 torch + transformers : i/p = sentence + explanations from T5 model, o/p = hate speech class
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_v2/"
t5_tt_2_model = T5ForConditionalGeneration.from_pretrained('t5-base').cuda()  # to GPU
t5_tt_2 = t5_tt_2_model.load_state_dict(torch.load(model_path + 't5-classification.pt'))

# 4. T5 simpletransformers : i/p = sentence + explanations from T5 model, o/p = hate speech class
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp/"
t5_st_2 = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)

# 5. T5 torch + transformers : i/p = sentence + explanations + keywords from T5 model, o/p = hate speech class
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_kw_v2/"
t5_tt_3_model = T5ForConditionalGeneration.from_pretrained('t5-base').cuda()  # to GPU
t5_tt_3 = t5_tt_3_model.load_state_dict(torch.load(model_path + 't5-classification.pt'))

# 6. T5 simpletransformers : i/p = sentence + explanations + keywords from T5 model, o/p = hate speech class
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_kw/"
t5_st_3 = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)


## Multitask model
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/"
t5_multitask = T5Model("t5", model_path + "/outputs/best_model", args=model_args_multitask)


## Text Generation Models

# 1. T5 for explanations
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5-for-explanation/"
t5_explanations = T5Model("t5", model_path + "/outputs/best_model", args=model_args_gen)

# 2. T5 for keywords
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_for_keywords/"
t5_keywords = T5Model("t5", model_path + "/outputs/best_model", args=model_args_gen)

## Classification Models

In [5]:
def load_model(model_path):
  t5_tt_1_model = T5ForConditionalGeneration.from_pretrained('t5-base').cuda()  # to GPU
  t5_tt_1_model.load_state_dict(torch.load(model_path + 't5-classification.pt'))
  return t5_tt_1_model

In [6]:
# tokenize the main text
def tokenize_corpus(df, tokenizer, max_len=512):
    # token ID storage
    input_ids = []
    # attension mask storage
    attention_masks = []
    # max len -- 512 is max
    max_len = max_len
    # for every document:
    for doc in df:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            doc,  # document to encode.
                            add_special_tokens=True,  # add tokens relative to model
                            max_length=max_len,  # set max length
                            truncation=True,  # truncate longer messages
                            pad_to_max_length=True,  # add padding
                            return_attention_mask=True,  # create attn. masks
                            return_tensors='pt'  # return pytorch tensors
                       )

        # add the tokenized sentence to the list
        input_ids.append(encoded_dict['input_ids'])

        # and its attention mask (differentiates padding from non-padding)
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [7]:
def get_tensor_dataset(body_tokens, body_masks, target_token, target_masks):
  tensor_df = TensorDataset(body_tokens, body_masks, target_token, target_masks)
  return tensor_df

In [8]:
# time function
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [9]:
## Testing
def testing_tt(model, dataloader, tokenizer, test_stats):

    print("")
    print("Running Testing...")

    # measure training time
    t0 = time.time()

    # put the model in evaluation mode
    model.eval()

    # track variables
    total_test_loss = 0
    total_test_acc = 0
    total_test_f1 = 0
    predictions = []
    actuals = []
    all_prediction_scores = []

    # evaluate data for one epoch
    for step, batch in enumerate(dataloader):
        # progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))

        # Unpack this training batch from our dataloader:
        # `batch` contains three pytorch tensors:
        #   [0]: input tokens
        #   [1]: attention masks
        #   [2]: target tokens
        #   [3]: target attenion masks
        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_target_ids = batch[2].cuda()
        b_target_mask = batch[3].cuda()

        # tell pytorch not to bother calculating gradients
        # as its only necessary for training
        with torch.no_grad():

            # forward propagation (evaluate model on training batch)
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_target_ids,
                            decoder_attention_mask=b_target_mask)

            loss, prediction_scores = outputs[:2]

            total_test_loss += loss.item()

            generated_ids = model.generate(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    max_length=3
                    )

            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in b_target_ids]

            total_test_acc += accuracy_score(target, preds)
            total_test_f1 += f1_score(preds, target,
                                       average='weighted',
                                       labels=np.unique(preds))
            predictions.extend(preds)
            actuals.extend(target)
            all_prediction_scores.extend(prediction_scores)

    # calculate the average loss over all of the batches.
    avg_test_loss = total_test_loss / len(dataloader)

    avg_test_acc = total_test_acc / len(dataloader)

    avg_test_f1 = total_test_f1 / len(dataloader)

    # Record all statistics from this epoch.
    test_stats.append(
        {
            'Test Loss': avg_test_loss,
            'Test PPL.': np.exp(avg_test_loss),
            'Test Acc.': avg_test_acc,
            'Test F1': avg_test_f1
        }
    )

    # gloabl df2
    # df2 = pd.DataFrame({'predicted': predictions, 'actual': actuals})

    return test_stats, all_prediction_scores, pd.DataFrame({'predicted': predictions, 'actual': actuals})


In [10]:
# Softmax function
def softmax(x):
    max = np.max(x,axis=1,keepdims=True) #returns max of each row and keeps same dims
    e_x = np.exp(x - max) #subtracts each row with its max value
    sum = np.sum(e_x,axis=1,keepdims=True) #returns sum of each row and keeps same dims
    f_x = e_x / sum
    return f_x

In [11]:
## Offensive token = 12130
## Normal token = 1389
## Hate token = 5591

# Function to get probabilities
# all prediction scores = list of logits of len(test_df)
# ith item in all_prediction_scores contain 2 tensors - logits for first word, and logits for second word over 32K tokens.
# This function extracts the logits for normal, offensive and hate, and converts them to probabilities by using a softmax

def convert_to_prob(all_prediction_scores):
  probs = np.zeros((len(all_prediction_scores), 3))
  for i in range(len(all_prediction_scores)):
    ## extract logits for normal, hate and offensive
    offensive_logit = all_prediction_scores[i][0][12130]
    normal_logit = all_prediction_scores[i][0][1389]
    hate_logit = all_prediction_scores[i][0][5591]
    probs[i][0] = normal_logit
    probs[i][1] = offensive_logit
    probs[i][2] = hate_logit
  probs_softmax = softmax(probs)
  df_probs = pd.DataFrame(probs_softmax, columns=['prob_normal','prob_offensive','prob_hate'])
  return df_probs

In [12]:
## Function to calculate standard metrics for classification

def calculate_standard_metrics(df_test):
  # Convert to labels
  df_test['actual_label'] = df_test['actual'].map({'hate':0, 'normal':1, 'offensive':2})
  df_test['predicted_label'] = df_test['predicted'].map({'hate':0, 'normal':1, 'offensive':2})
  # Acc, P, R, F1
  y_true = df_test['actual_label'].values
  y_pred = df_test['predicted_label'].values
  target_names = ['hate','normal','offensive']
  print("Standard Metrics:")
  print(classification_report(y_true, y_pred, target_names=target_names, digits=4))
  # AUC - avg AUC over hate, normal and offensive
  # Create OHE labels
  normal_label = ((df_test['actual'] == 'normal')*1).values
  hate_label = ((df_test['actual'] == 'hate')*1).values
  offensive_label = ((df_test['actual'] == 'offensive')*1).values
  auc_normal = roc_auc_score(normal_label, df_test['prob_normal'].values)
  auc_offensive = roc_auc_score(offensive_label, df_test['prob_offensive'].values)
  auc_hate = roc_auc_score(hate_label, df_test['prob_hate'].values)
  auc_macro = (auc_normal + auc_offensive + auc_hate)/3
  print("AUROC: ", auc_macro)

In [13]:
"""
Steps for calculating bias based metrics:
1. Combine hate speech and offensive into one class: toxic, and normal = non_toxic
2. Get target communitites: if 2 out of 3 annotators select the same community, we select that community as target.
3. Select top10 communities: ['African', 'Islam', 'Jewish', 'Homosexual', 'Women', 'Refugee', 'Arab', 'Caucasian','Asian', 'Hispanic']
4. Remove other communities from calculation: Make them 'None'
5.
"""

"\nSteps for calculating bias based metrics: \n1. Combine hate speech and offensive into one class: toxic, and normal = non_toxic \n2. Get target communitites: if 2 out of 3 annotators select the same community, we select that community as target. \n3. Select top10 communities: ['African', 'Islam', 'Jewish', 'Homosexual', 'Women', 'Refugee', 'Arab', 'Caucasian','Asian', 'Hispanic']\n4. Remove other communities from calculation: Make them 'None'\n5. \n"

In [14]:
## Function to get toxic label
def get_toxic_label_probs(df):
  df['toxic_label'] = df['actual'].apply(lambda x: 'non-toxic' if x=='normal' else 'toxic')
  df['toxic_prob'] = df['prob_offensive'] + df['prob_hate']
  df['non_toxic_prob'] = df['prob_normal'].copy()

## Helper function to get target communities. If more than 2 annotators describe them as target.
def get_target_comm_helper(annotators, communities):
  targets = annotators['target']
  # Initialize comunity freq dict
  community_freq_dict = dict()
  for comm in communities:
    community_freq_dict[comm] = 0
  # Get frequencies for target comm.
  for ann in annotators['target']:
    for item in ann:
      if item in community_freq_dict:
        community_freq_dict[item] += 1
  # target comms list
  target_comms = []
  for comm, freq in community_freq_dict.items():
    if freq>=2:
      target_comms.append(comm)
  if len(target_comms)==0:
    return None
  return target_comms

# Get target communities
# Input df from hatexplain test dataset
def get_target_comms(df, communities):
  df['target_comms'] = df['annotators'].apply(lambda x: get_target_comm_helper(x, communities))


In [15]:
## Function to calculate Subgroup AUC
## Input df contains probabilities for each record and targeted communities
def subgroup_auc(df, communities):
  # Remove entries with None in target_comms
  df_comms = df[~df['target_comms'].isnull()].copy()
  # Dictionary with AUCs of all communities
  subgroup_auc = dict()
  for community in communities:
    df_subgroup = df_comms[df_comms['target_comms'].apply(lambda x: community in x)].copy()
    toxic_label = df_subgroup['toxic_label'].apply(lambda x: 1 if x=='toxic' else 0).values
    auc_score = roc_auc_score(toxic_label, df_subgroup['toxic_prob'].values)
    subgroup_auc[community] = auc_score
  return subgroup_auc


In [16]:
# Function to calculate BPSN AUC
# 1. Take normal posts that mention the community and toxic posts that don't mention the community.
# 2. Get AUC
def bpsn_auc(df, communities):
  # Remove entries with None in target_comms
  df_comms = df[~df['target_comms'].isnull()].copy()
  # Dictionary with AUCs of all communities
  bpsn_auc = dict()
  for community in communities:
    # Background Positive
    df_normal = df_comms[df_comms['toxic_label']=='non-toxic'].copy()
    df_normal_comm = df_normal[df_normal['target_comms'].apply(lambda x: community in x)].copy()
    # Subgroup Negative
    df_toxic = df_comms[df_comms['toxic_label']=='toxic'].copy()
    df_toxic_nocomm = df_toxic[df_toxic['target_comms'].apply(lambda x: community not in x)].copy()
    # Concat
    df_bpsn = pd.concat([df_normal_comm, df_toxic_nocomm], ignore_index=True)
    toxic_label = df_bpsn['toxic_label'].apply(lambda x: 1 if x=='toxic' else 0).values
    # Calculate AUC
    auc_score = roc_auc_score(toxic_label, df_bpsn['toxic_prob'].values)
    bpsn_auc[community] = auc_score
  return bpsn_auc


In [17]:
# Function to calculate BNSP AUC
# 1. Take toxic posts that mention the community and normal posts that don't mention the community.
# 2. Get AUC
def bnsp_auc(df, communities):
  # Remove entries with None in target_comms
  df_comms = df[~df['target_comms'].isnull()].copy()
  # Dictionary with AUCs of all communities
  bnsp_auc = dict()
  for community in communities:
    # Background Negative
    df_toxic = df_comms[df_comms['toxic_label']=='toxic'].copy()
    df_toxic_comm = df_toxic[df_toxic['target_comms'].apply(lambda x: community in x)].copy()
    # Subgroup Positive
    df_normal = df_comms[df_comms['toxic_label']=='non-toxic'].copy()
    df_normal_nocomm = df_normal[df_normal['target_comms'].apply(lambda x: community not in x)].copy()
    # Concat
    df_bnsp = pd.concat([df_toxic_comm, df_normal_nocomm], ignore_index=True)
    toxic_label = df_bnsp['toxic_label'].apply(lambda x: 1 if x=='toxic' else 0).values
    # Calculate AUC
    auc_score = roc_auc_score(toxic_label, df_bnsp['toxic_prob'].values)
    bnsp_auc[community] = auc_score
  return bnsp_auc

In [18]:
# Function to calculate Generalized Mean Bias
# ((1/N) * (sum_over_N(m^p)))^(1/p)
def gmb(bias_metric_dict):
  p = -5 # suggested in paper
  N = len(bias_metric_dict)
  sum = 0
  for comm, bias_val in bias_metric_dict.items():
    sum += math.pow(bias_val, p)
  avg = sum/N
  final_val = math.pow(avg, (1/p))
  return final_val

In [19]:
## Function to calculate bias based metrics for classification

def calculate_bias_metrics(df_test, hatexp_test):
  communities = ['African', 'Islam', 'Jewish', 'Homosexual', 'Women', 'Refugee', 'Arab', 'Caucasian','Asian', 'Hispanic']
  get_toxic_label_probs(df_test)
  get_target_comms(hatexp_test, communities)
  df_test_merged = df_test.merge(hatexp_test, on=['id'], how='inner')
  assert(df_test_merged.shape[0] == df_test.shape[0])
  subgroup_auc_score = subgroup_auc(df_test_merged, communities)
  bpsn_auc_score = bpsn_auc(df_test_merged, communities)
  bnsp_auc_score = bnsp_auc(df_test_merged, communities)
  gmb_subgroup = gmb(subgroup_auc_score)
  gmb_bpsn = gmb(bpsn_auc_score)
  gmb_bnsp = gmb(bnsp_auc_score)
  print("Subgroup AUC: \n", subgroup_auc_score)
  print("BPSN AUC: \n", bpsn_auc_score)
  print("BNSP AUC: \n", bnsp_auc_score)
  print("GMB Subgroup: ", gmb_subgroup)
  print("GMB BPSN: ", gmb_bpsn)
  print("GMB BNSP: ", gmb_bnsp)
  return subgroup_auc_score, bpsn_auc_score, bnsp_auc_score

### 1. T5 torch + transformers : i/p = sentence o/p = hate speech class

In [20]:
def data_for_testing_tt_1(df):
  ## Select data-points with prefix = 'label'
  df_test = df[df['prefix']=='label'].copy()
  df_test = df_test.astype(str)
  df_test.reset_index(drop=True, inplace=True)
  ## Prepare input_text
  df_test.rename(columns={'input_text':'sentence'}, inplace=True)
  df_test['input_text'] = df_test.apply(lambda row: row['prefix'] + " : " + row['sentence'], axis=1)
  return df_test

In [21]:
def get_predictions_tt1(df_test_base, model_path):
  # Get test data
  df_test = data_for_testing_tt_1(df_test_base)
  print("DF Test Shape: ", df_test.shape)
  # Instantiate tokenizer
  tokenizer = T5Tokenizer.from_pretrained('t5-base')
  # create tokenized data - input_text
  test_body_input_ids, test_body_attention_masks = tokenize_corpus(df_test['input_text'].values, tokenizer)
  # create tokenized data - target_text - max_len=2
  test_target_input_ids, test_target_attention_masks = tokenize_corpus(df_test['target_text'].values, tokenizer, max_len=2)
  # create tensor dataset
  test_dataset = get_tensor_dataset(test_body_input_ids, test_body_attention_masks, test_target_input_ids, test_target_attention_masks)
  # create dataloader
  test_dataloader = DataLoader(test_dataset, batch_size=24, shuffle=False)
  # get predictions
  # df2 = pd.DataFrame({'predicted': [], 'actual': []})
  test_stats = []
  # Load model
  model = load_model(model_path)
  test_stats, all_prediction_scores, df2 = testing_tt(model, test_dataloader, tokenizer, test_stats)
  print("Test stats: \n", test_stats)
  print("Predictions shape: ", df2.shape)
  # get probabilties
  # Convert logits to probability
  df_probs = convert_to_prob(all_prediction_scores)
  # Append columns to test
  df_test['predicted'] = df2['predicted'].copy()
  df_test['actual'] = df2['actual'].copy()
  df_test['prob_normal'] = df_probs['prob_normal'].copy()
  df_test['prob_offensive'] = df_probs['prob_offensive'].copy()
  df_test['prob_hate'] = df_probs['prob_hate'].copy()
  return df_test

In [22]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [23]:
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_v2/"
df_test = get_predictions_tt1(df_test_base, model_path)
calculate_standard_metrics(df_test)
calculate_bias_metrics(df_test, hatexp_test)

DF Test Shape:  (1761, 6)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Running Testing...
  Batch    40  of     74.    Elapsed: 0:00:20.
Test stats: 
 [{'Test Loss': 0.3919868878013379, 'Test PPL.': 1.479918306292231, 'Test Acc.': 0.6672297297297295, 'Test F1': 0.681737862677551}]
Predictions shape:  (1761, 2)
Standard Metrics:
              precision    recall  f1-score   support

        hate     0.6652    0.7982    0.7256       550
      normal     0.7001    0.7650    0.7311       702
   offensive     0.5958    0.3910    0.4721       509

    accuracy                         0.6672      1761
   macro avg     0.6537    0.6514    0.6430      1761
weighted avg     0.6591    0.6672    0.6545      1761

AUROC:  0.8248907188327741
Subgroup AUC: 
 {'African': 0.7715442452284558, 'Islam': 0.8812360387192852, 'Jewish': 0.8653846153846154, 'Homosexual': 0.8541666666666666, 'Women': 0.7783898305084745, 'Refugee': 0.797037037037037, 'Arab': 0.7298701298701299, 'Caucasian': 0.7488038277511961, 'Asian': 0.8842592592592592, 'Hispanic': 0.875}
BPSN AUC: 
 {'African':

({'African': 0.7715442452284558,
  'Islam': 0.8812360387192852,
  'Jewish': 0.8653846153846154,
  'Homosexual': 0.8541666666666666,
  'Women': 0.7783898305084745,
  'Refugee': 0.797037037037037,
  'Arab': 0.7298701298701299,
  'Caucasian': 0.7488038277511961,
  'Asian': 0.8842592592592592,
  'Hispanic': 0.875},
 {'African': 0.6070293182620983,
  'Islam': 0.8082237114254984,
  'Jewish': 0.8164261690354778,
  'Homosexual': 0.8777429467084639,
  'Women': 0.8751664447403462,
  'Refugee': 0.9162935829602495,
  'Arab': 0.7165672165672167,
  'Caucasian': 0.9363308171972432,
  'Asian': 0.9271080760095012,
  'Hispanic': 0.9056152927120669},
 {'African': 0.9326133636478464,
  'Islam': 0.8957305299291998,
  'Jewish': 0.8878073770491803,
  'Homosexual': 0.8195058997050146,
  'Women': 0.7649226234340457,
  'Refugee': 0.6788888888888889,
  'Arab': 0.8857954545454545,
  'Caucasian': 0.6129032258064515,
  'Asian': 0.7605715952172645,
  'Hispanic': 0.8544776119402985})

### 2. T5 torch + transformers : i/p = sentence + explanations from T5 model, o/p = hate speech class

In [24]:
def data_for_testing_tt_2(df_test_base, df_test_exp):
  df_test = df_test_base[['id','prefix','input_text','target_text']].copy()

  ## Select data-points with prefix = 'label'
  df_test = df_test[df_test['prefix']=='label'].copy()
  df_test = df_test.astype(str)
  df_test.reset_index(drop=True, inplace=True)

  ## Rename column input_text to sentence and predicted to predicted_exp
  df_test.rename(columns = {'input_text':'sentence'}, inplace=True)
  df_test_exp.rename(columns = {'input_text':'sentence', 'predicted':'predicted_exp'}, inplace=True)

  ## Merge with df_test_exp
  df_test = df_test.merge(df_test_exp[['sentence','predicted_exp']], on = ['sentence'], how = 'inner')
  df_test = df_test.astype(str)

  ## Get new input_text
  df_test['input_text'] = df_test.apply(lambda row: row['prefix'] + " : " + row['sentence'] + '. ' + row['predicted_exp'], axis=1)
  df_test.drop_duplicates(subset=['input_text'], keep='first', inplace=True)

  return df_test


In [25]:
def get_predictions_tt2(df_test_base, df_test_exp, model_path):
  df_test_base_copy = df_test_base.copy()
  # Get test data
  df_test = data_for_testing_tt_2(df_test_base_copy, df_test_exp)
  print("DF Test Shape: ", df_test.shape)
  # Instantiate tokenizer
  tokenizer = T5Tokenizer.from_pretrained('t5-base')
  # create tokenized data - input_text
  test_body_input_ids, test_body_attention_masks = tokenize_corpus(df_test['input_text'].values, tokenizer)
  # create tokenized data - target_text - max_len=2
  test_target_input_ids, test_target_attention_masks = tokenize_corpus(df_test['target_text'].values, tokenizer, max_len=2)
  # create tensor dataset
  test_dataset = get_tensor_dataset(test_body_input_ids, test_body_attention_masks, test_target_input_ids, test_target_attention_masks)
  # create dataloader
  test_dataloader = DataLoader(test_dataset, batch_size=24, shuffle=False)
  # get predictions
  # df2 = pd.DataFrame({'predicted': [], 'actual': []})
  test_stats = []
  # Load model
  model = load_model(model_path)
  test_stats, all_prediction_scores, df2 = testing_tt(model, test_dataloader, tokenizer, test_stats)
  print("Test stats: \n", test_stats)
  print("Predictions shape: ", df2.shape)
  # get probabilties
  # Convert logits to probability
  df_probs = convert_to_prob(all_prediction_scores)
  # Append columns to test
  df_test['predicted'] = df2['predicted'].copy()
  df_test['actual'] = df2['actual'].copy()
  df_test['prob_normal'] = df_probs['prob_normal'].copy()
  df_test['prob_offensive'] = df_probs['prob_offensive'].copy()
  df_test['prob_hate'] = df_probs['prob_hate'].copy()
  return df_test


In [26]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [27]:
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_v2/"
t5_tt_2_model = T5ForConditionalGeneration.from_pretrained('t5-base').cuda()  # to GPU
t5_tt_2_model.load_state_dict(torch.load(model_path + 't5-classification.pt'))

df_test = get_predictions_tt2(df_test_base, df_test_exp, model_path)
calculate_standard_metrics(df_test)
subgroup_auc_score, bpsn_auc_score, bnsp_auc_score = calculate_bias_metrics(df_test, hatexp_test)

DF Test Shape:  (1759, 6)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



Running Testing...
  Batch    40  of     74.    Elapsed: 0:00:18.
Test stats: 
 [{'Test Loss': 0.37189942980940277, 'Test PPL.': 1.450487098193612, 'Test Acc.': 0.6842020592020591, 'Test F1': 0.6876648992548108}]
Predictions shape:  (1759, 2)
Standard Metrics:
              precision    recall  f1-score   support

        hate     0.7283    0.7945    0.7600       550
      normal     0.7386    0.7133    0.7257       701
   offensive     0.5519    0.5236    0.5374       508

    accuracy                         0.6839      1759
   macro avg     0.6729    0.6771    0.6744      1759
weighted avg     0.6814    0.6839    0.6820      1759

AUROC:  0.8446940508994675
Subgroup AUC: 
 {'African': 0.767881241565452, 'Islam': 0.8935219657483247, 'Jewish': 0.8333333333333333, 'Homosexual': 0.8931977113795295, 'Women': 0.7902542372881356, 'Refugee': 0.7385185185185185, 'Arab': 0.8051948051948051, 'Caucasian': 0.7535885167464115, 'Asian': 0.8703703703703703, 'Hispanic': 0.96875}
BPSN AUC: 
 {'Afric

### 3. T5 torch + transformers : i/p = sentence + explanations + keywords from T5 model, o/p = hate speech class


In [28]:
def data_for_testing_tt_3(df_test_base, df_test_exp, df_test_kw):
  df_test = df_test_base[['id','prefix','input_text','target_text']].copy()

  ## Select data-points with prefix = 'label'
  df_test = df_test[df_test['prefix']=='label'].copy()
  df_test = df_test.astype(str)
  df_test.reset_index(drop=True, inplace=True)

  ## Rename column input_text to sentence and predicted to predicted_exp
  df_test.rename(columns = {'input_text':'sentence'}, inplace=True)
  df_test_exp.rename(columns = {'input_text':'sentence', 'predicted':'predicted_exp'}, inplace=True)
  df_test_kw.rename(columns = {'input_text':'sentence', 'predicted':'predicted_kw'}, inplace=True)

  ## Merge with df_test_exp
  df_test = df_test.merge(df_test_exp[['sentence','predicted_exp']], on = ['sentence'], how = 'inner')
  df_test = df_test.astype(str)

  ## Merge with df_test_kw
  df_test = df_test.merge(df_test_kw[['sentence','predicted_kw']], on = ['sentence'], how = 'inner')
  df_test = df_test.astype(str)

  kw_prefix = " The keywords in the sentence are: "

  ## Get new input_text
  df_test['input_text'] = df_test.apply(lambda row: row['prefix'] + " : " + row['sentence'] + '. ' + row['predicted_exp'] + kw_prefix + row['predicted_kw'], axis=1)

  df_test.drop_duplicates(subset=['input_text'], keep='first', inplace=True)

  return df_test

In [29]:
def get_predictions_tt3(df_test_base, df_test_exp, df_test_kw, model_path):
  df_test_base_copy = df_test_base.copy()
  # Get test data
  df_test = data_for_testing_tt_3(df_test_base_copy, df_test_exp, df_test_kw)
  print("DF Test Shape: ", df_test.shape)
  # Instantiate tokenizer
  tokenizer = T5Tokenizer.from_pretrained('t5-base')
  # create tokenized data - input_text
  test_body_input_ids, test_body_attention_masks = tokenize_corpus(df_test['input_text'].values, tokenizer)
  # create tokenized data - target_text - max_len=2
  test_target_input_ids, test_target_attention_masks = tokenize_corpus(df_test['target_text'].values, tokenizer, max_len=2)
  # create tensor dataset
  test_dataset = get_tensor_dataset(test_body_input_ids, test_body_attention_masks, test_target_input_ids, test_target_attention_masks)
  # create dataloader
  test_dataloader = DataLoader(test_dataset, batch_size=24, shuffle=False)
  # get predictions
  # df2 = pd.DataFrame({'predicted': [], 'actual': []})
  test_stats = []
  # Load model
  model = load_model(model_path)
  test_stats, all_prediction_scores, df2 = testing_tt(model, test_dataloader, tokenizer, test_stats)
  print("Test stats: \n", test_stats)
  print("Predictions shape: ", df2.shape)
  # get probabilties
  # Convert logits to probability
  df_probs = convert_to_prob(all_prediction_scores)
  # Append columns to test
  df_test['predicted'] = df2['predicted'].copy()
  df_test['actual'] = df2['actual'].copy()
  df_test['prob_normal'] = df_probs['prob_normal'].copy()
  df_test['prob_offensive'] = df_probs['prob_offensive'].copy()
  df_test['prob_hate'] = df_probs['prob_hate'].copy()

  return df_test

In [30]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [31]:
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_kw_v2/"
t5_tt_3_model = T5ForConditionalGeneration.from_pretrained('t5-base').cuda()  # to GPU
t5_tt_3_model.load_state_dict(torch.load(model_path + 't5-classification.pt'))

df_test = get_predictions_tt3(df_test_base, df_test_exp, df_test_kw, model_path)
calculate_standard_metrics(df_test)
sub, bpsn, bnsp = calculate_bias_metrics(df_test, hatexp_test)

DF Test Shape:  (1759, 7)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



Running Testing...
  Batch    40  of     74.    Elapsed: 0:00:18.
Test stats: 
 [{'Test Loss': 0.38186034862254115, 'Test PPL.': 1.4650074805339912, 'Test Acc.': 0.66988416988417, 'Test F1': 0.6701422786063638}]
Predictions shape:  (1759, 2)
Standard Metrics:
              precision    recall  f1-score   support

        hate     0.7072    0.8345    0.7656       550
      normal     0.7675    0.6262    0.6897       701
   offensive     0.5242    0.5551    0.5392       508

    accuracy                         0.6708      1759
   macro avg     0.6663    0.6720    0.6648      1759
weighted avg     0.6784    0.6708    0.6700      1759

AUROC:  0.8405514586154164
Subgroup AUC: 
 {'African': 0.7659533449007134, 'Islam': 0.8957557706626954, 'Jewish': 0.7875, 'Homosexual': 0.856325492689129, 'Women': 0.8258474576271186, 'Refugee': 0.78, 'Arab': 0.7974025974025974, 'Caucasian': 0.8133971291866029, 'Asian': 0.6712962962962963, 'Hispanic': 0.96875}
BPSN AUC: 
 {'African': 0.7045555064130916, 'I

### 4. T5 simpletransformers : i/p = sentence o/p = hate speech class


In [32]:
def data_for_testing_st_1(df_test_base):
  ## Select data-points with prefix = 'label'
  df_test = df_test_base[df_test_base['prefix']=='label'].copy()
  df_test = df_test.astype(str)
  df_test.reset_index(drop=True, inplace=True)
  to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df_test["prefix"].tolist(), df_test["input_text"].tolist())
    ]
  truth = df_test["target_text"].tolist()
  tasks = df_test["prefix"].tolist()
  return to_predict, truth, tasks

In [33]:
def get_predictions_st1(df_test_base, model_path):
  to_predict, truth, tasks = data_for_testing_st_1(df_test_base)
  model = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)

  df_test = df_test_base[df_test_base['prefix']=='label'].copy()

  # Get the model predictions
  preds = model.predict(to_predict)

  ## Taking only the top (first) predictions
  top_preds = [pred[0] for pred in preds]
  df_test["predicted"] = top_preds
  df_test['all_preds'] = preds

  # Classification Metrics
  df_test['actual_label'] = df_test['target_text'].map({'hate':0, 'normal':1, 'offensive':2})
  df_test['predicted_label'] = df_test['predicted'].map({'hate':0, 'normal':1, 'offensive':2})

  # Acc, P, R, F1
  y_true = df_test['actual_label'].values
  y_pred = df_test['predicted_label'].values
  target_names = ['hate','normal','offensive']
  print("Standard Metrics:")
  print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

  return df_test

In [34]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [35]:
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent/"
df_test = get_predictions_st1(df_test_base, model_path)

Generating outputs:   0%|          | 0/56 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/5283 [00:00<?, ?it/s]

Standard Metrics:
              precision    recall  f1-score   support

        hate     0.7032    0.8273    0.7602       550
      normal     0.7968    0.6368    0.7078       702
   offensive     0.5407    0.5874    0.5631       509

    accuracy                         0.6820      1761
   macro avg     0.6802    0.6838    0.6771      1761
weighted avg     0.6936    0.6820    0.6824      1761



### 5. T5 simpletransformers : i/p = sentence + explanations from T5 model, o/p = hate speech class


In [36]:
def data_for_testing_st_2(df_test_base, df_test_exp):
  ## Select data-points with prefix = 'label'
  df_test = df_test_base[df_test_base['prefix']=='label'].copy()
  df_test = df_test.astype(str)

  ## Rename column input_text to sentence and predicted to predicted_exp
  df_test.rename(columns = {'input_text':'sentence'}, inplace=True)
  df_test_exp.rename(columns = {'input_text':'sentence', 'predicted':'predicted_exp'}, inplace=True)
  df_test.reset_index(drop=True, inplace=True)

  ## Merge
  df_test = df_test.merge(df_test_exp[['sentence','predicted_exp']], on = ['sentence'], how = 'inner')
  df_test = df_test.astype(str)

  ## Get new input_text
  df_test['input_text'] = df_test.apply(lambda row: row['sentence'] + ' ' + row['predicted_exp'], axis=1)
  df_test.drop_duplicates(subset=['input_text'], keep='first', inplace=True)

  to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df_test["prefix"].tolist(), df_test["input_text"].tolist())
    ]

  truth = df_test["target_text"].tolist()
  tasks = df_test["prefix"].tolist()
  return to_predict, truth, tasks, df_test

In [37]:
def get_predictions_st2(df_test_base, df_test_exp, model_path):
  to_predict, truth, tasks, df_test = data_for_testing_st_2(df_test_base, df_test_exp)
  model = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)

  # Get the model predictions
  preds = model.predict(to_predict)

  ## Taking only the top (first) predictions
  top_preds = [pred[0] for pred in preds]
  df_test["predicted"] = top_preds
  df_test['all_preds'] = preds

  # Classification Metrics
  df_test['actual_label'] = df_test['target_text'].map({'hate':0, 'normal':1, 'offensive':2})
  df_test['predicted_label'] = df_test['predicted'].map({'hate':0, 'normal':1, 'offensive':2})

  # Acc, P, R, F1
  y_true = df_test['actual_label'].values
  y_pred = df_test['predicted_label'].values
  target_names = ['hate','normal','offensive']
  print("Standard Metrics:")
  print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

  return df_test

In [38]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [39]:
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp/"
df_test = get_predictions_st2(df_test_base, df_test_exp, model_path)

Generating outputs:   0%|          | 0/55 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/5277 [00:00<?, ?it/s]

Standard Metrics:
              precision    recall  f1-score   support

        hate     0.7711    0.7655    0.7682       550
      normal     0.7120    0.7689    0.7394       701
   offensive     0.5833    0.5236    0.5519       508

    accuracy                         0.6970      1759
   macro avg     0.6888    0.6860    0.6865      1759
weighted avg     0.6933    0.6970    0.6942      1759



### 6. T5 simpletransformers : i/p = sentence + explanations + keywords from T5 model, o/p = hate speech class


In [40]:
def data_for_testing_st_3(df_test_base, df_test_exp, df_test_kw):
  ## Select data-points with prefix = 'label'
  df_test = df_test_base[df_test_base['prefix']=='label'].copy()
  df_test = df_test.astype(str)

  ## Rename column input_text to sentence and predicted to predicted_exp
  df_test.rename(columns = {'input_text':'sentence'}, inplace=True)
  df_test_exp.rename(columns = {'input_text':'sentence', 'predicted':'predicted_exp'}, inplace=True)
  df_test_kw.rename(columns = {'input_text':'sentence', 'predicted':'predicted_kw'}, inplace=True)

  ## Merge with explanations
  df_test = df_test.merge(df_test_exp[['sentence','predicted_exp']], on = ['sentence'], how = 'inner')
  df_test = df_test.astype(str)

  ## Merge with keywords
  df_test = df_test.merge(df_test_kw[['sentence','predicted_kw']], on = ['sentence'], how = 'inner')
  df_test = df_test.astype(str)

  ## Get new input_text
  kw_prefix = " The keywords in the sentence are: "
  df_test['input_text'] = df_test.apply(lambda row: row['sentence'] + ' ' + row['predicted_exp'] + kw_prefix + row['predicted_kw'], axis=1)
  df_test.drop_duplicates(subset=['input_text'], keep='first', inplace=True)

  to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df_test["prefix"].tolist(), df_test["input_text"].tolist())
    ]

  truth = df_test["target_text"].tolist()
  tasks = df_test["prefix"].tolist()
  return to_predict, truth, tasks, df_test

In [41]:
def get_predictions_st3(df_test_base, df_test_exp, df_test_kw, model_path):
  to_predict, truth, tasks, df_test = data_for_testing_st_3(df_test_base, df_test_exp, df_test_kw)
  model = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)

  # Get the model predictions
  preds = model.predict(to_predict)

  ## Taking only the top (first) predictions
  top_preds = [pred[0] for pred in preds]
  df_test["predicted"] = top_preds
  df_test['all_preds'] = preds

  # Classification Metrics
  df_test['actual_label'] = df_test['target_text'].map({'hate':0, 'normal':1, 'offensive':2})
  df_test['predicted_label'] = df_test['predicted'].map({'hate':0, 'normal':1, 'offensive':2})

  # Acc, P, R, F1
  y_true = df_test['actual_label'].values
  y_pred = df_test['predicted_label'].values
  target_names = ['hate','normal','offensive']
  print("Standard Metrics:")
  print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

  return df_test

In [42]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [43]:
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_exp_kw/"
df_test = get_predictions_st3(df_test_base, df_test_exp, df_test_kw, model_path)

Generating outputs:   0%|          | 0/55 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/5277 [00:00<?, ?it/s]

Standard Metrics:
              precision    recall  f1-score   support

        hate     0.7633    0.7564    0.7598       550
      normal     0.7551    0.6862    0.7190       701
   offensive     0.5165    0.5866    0.5493       508

    accuracy                         0.6794      1759
   macro avg     0.6783    0.6764    0.6760      1759
weighted avg     0.6887    0.6794    0.6827      1759



## Text Generation Models
We already have predicted explanations (df_test_exp) and keywords (df_test_kw) from T5 models. Here, we check the ROUGE and BLEU metrics in the predicted columns for these datasets. For details, please check the notebook t5-for-explanation.ipynb and t5-for-keywords.ipynb

## T5 for Explanation
Calculating ROUGE and BLEU metrics



In [44]:
def calculate_rouge_scores(df):
  rouge = Rouge()
  rouge_scores = rouge.get_scores(df['predicted_exp'], df['target_text'])
  avg_rouge_scores = {'rouge-1': {'r':0, 'p':0, 'f':0}, 'rouge-2': {'r':0, 'p':0, 'f':0}, 'rouge-l': {'r':0, 'p':0, 'f':0}}
  for scores in rouge_scores:
    rog1 = scores['rouge-1']
    avg_rouge_scores['rouge-1']['r'] += rog1['r']
    avg_rouge_scores['rouge-1']['p'] += rog1['p']
    avg_rouge_scores['rouge-1']['f'] += rog1['f']
    rog2 = scores['rouge-2']
    avg_rouge_scores['rouge-2']['r'] += rog2['r']
    avg_rouge_scores['rouge-2']['p'] += rog2['p']
    avg_rouge_scores['rouge-2']['f'] += rog2['f']
    rogl = scores['rouge-l']
    avg_rouge_scores['rouge-l']['r'] += rogl['r']
    avg_rouge_scores['rouge-l']['p'] += rogl['p']
    avg_rouge_scores['rouge-l']['f'] += rogl['f']

  avg_rouge_scores['rouge-1']['r'] = avg_rouge_scores['rouge-1']['r']/df.shape[0]
  avg_rouge_scores['rouge-1']['p'] = avg_rouge_scores['rouge-1']['p']/df.shape[0]
  avg_rouge_scores['rouge-1']['f'] = avg_rouge_scores['rouge-1']['f']/df.shape[0]

  avg_rouge_scores['rouge-2']['r'] = avg_rouge_scores['rouge-2']['r']/df.shape[0]
  avg_rouge_scores['rouge-2']['p'] = avg_rouge_scores['rouge-2']['p']/df.shape[0]
  avg_rouge_scores['rouge-2']['f'] = avg_rouge_scores['rouge-2']['f']/df.shape[0]

  avg_rouge_scores['rouge-l']['r'] = avg_rouge_scores['rouge-l']['r']/df.shape[0]
  avg_rouge_scores['rouge-l']['p'] = avg_rouge_scores['rouge-l']['p']/df.shape[0]
  avg_rouge_scores['rouge-l']['f'] = avg_rouge_scores['rouge-l']['f']/df.shape[0]

  return avg_rouge_scores



In [45]:
def calculate_bleu(df):
  bleu = evaluate.load('bleu')
  predictions = df['predicted_exp'].tolist()
  references = df['target_text'].tolist()
  results = bleu.compute(predictions=predictions, references=references, max_order = 2)
  return results

In [46]:
avg_rouge_scores = calculate_rouge_scores(df_test_exp)
bleu_scores = calculate_bleu(df_test_exp)
print("Rouge Scores: \n", avg_rouge_scores)
print("Bleu Scores: \n", bleu_scores)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Rouge Scores: 
 {'rouge-1': {'r': 0.2414293347082444, 'p': 0.4371107842583683, 'f': 0.3014995599670864}, 'rouge-2': {'r': 0.07828095940379103, 'p': 0.14939234110020794, 'f': 0.09869231171676551}, 'rouge-l': {'r': 0.22078571143542775, 'p': 0.4000124100619905, 'f': 0.27572835389879646}}
Bleu Scores: 
 {'bleu': 0.12364308765841156, 'precisions': [0.4999925114948554, 0.1890324565451469], 'brevity_penalty': 0.4021794750048545, 'length_ratio': 0.5233254432304485, 'translation_length': 66769, 'reference_length': 127586}


In [47]:
avg_rouge_scores

{'rouge-1': {'r': 0.2414293347082444,
  'p': 0.4371107842583683,
  'f': 0.3014995599670864},
 'rouge-2': {'r': 0.07828095940379103,
  'p': 0.14939234110020794,
  'f': 0.09869231171676551},
 'rouge-l': {'r': 0.22078571143542775,
  'p': 0.4000124100619905,
  'f': 0.27572835389879646}}

In [48]:
bleu_scores

{'bleu': 0.12364308765841156,
 'precisions': [0.4999925114948554, 0.1890324565451469],
 'brevity_penalty': 0.4021794750048545,
 'length_ratio': 0.5233254432304485,
 'translation_length': 66769,
 'reference_length': 127586}

### T5 for Keywords

Here we calculate the quality of keywords generated by T5 model. Metrics used: IOU, Comprehensiveness and Faithfulness. We use the

In [49]:
## Functions to get IoU

def get_iou_helper(predicted_kw, target_kw):
  predicted_kw_set = set(predicted_kw.split())
  target_kw_set = set(target_kw.split())
  inter = len(predicted_kw_set.intersection(target_kw_set))
  uni = len(predicted_kw_set.union(target_kw_set))
  iou = inter/uni
  return iou

def get_iou(df):
  df['predicted_kw'] = df['predicted_kw'].apply(str)
  df['target_text'] = df['target_text'].apply(str)
  iou = df.apply(lambda row: get_iou_helper(row['predicted_kw'], row['target_text']), axis=1)
  return iou.mean()

In [50]:
## Function to get contrast example by removing predicted keywords
def create_contrast_example_helper(predicted_kw, sentence):
  predicted_kw_list = predicted_kw.split()
  sentence_list = sentence.split()
  contrast_example_list = [x for x in sentence_list if x not in predicted_kw_list]
  contrast_example = ' '.join([x for x in contrast_example_list])
  return contrast_example

def create_contrast_example(df):
  df_copy = df.copy()
  df_copy['predicted_kw'] = df_copy['predicted_kw'].apply(str)
  df_copy['sentence'] = df_copy['sentence'].apply(str)
  df_copy['contrast_example'] = df_copy.apply(lambda row: create_contrast_example_helper(row['predicted_kw'], row['sentence']), axis=1)
  return df_copy


In [51]:
def data_for_testing_tt_4(df, df_contrast):
  ## Select data-points with prefix = 'label'
  df_test = df[df['prefix']=='label'].copy()
  df_test = df_test.astype(str)
  df_test.reset_index(drop=True, inplace=True)

  # Rename input_text column
  df_test.rename(columns={'input_text':'sentence'}, inplace=True)

  ## Merge with contrast data
  df_test = df_test.merge(df_contrast[['sentence','contrast_example']].copy(), on = ['sentence'], how = 'inner')

  ## Prepare input_text
  df_test['input_text'] = df_test.apply(lambda row: row['prefix'] + " : " + row['contrast_example'], axis=1)
  df_test.drop_duplicates(subset=['sentence'], keep='first', inplace=True)
  return df_test

In [52]:
def get_predictions_tt4(df_test_base, df_contrast, model_path):
  # Get test data
  df_test = data_for_testing_tt_4(df_test_base, df_contrast)
  print("DF Test Shape: ", df_test.shape)
  # Instantiate tokenizer
  tokenizer = T5Tokenizer.from_pretrained('t5-base')
  # create tokenized data - input_text
  test_body_input_ids, test_body_attention_masks = tokenize_corpus(df_test['input_text'].values, tokenizer)
  # create tokenized data - target_text - max_len=2
  test_target_input_ids, test_target_attention_masks = tokenize_corpus(df_test['target_text'].values, tokenizer, max_len=2)
  # create tensor dataset
  test_dataset = get_tensor_dataset(test_body_input_ids, test_body_attention_masks, test_target_input_ids, test_target_attention_masks)
  # create dataloader
  test_dataloader = DataLoader(test_dataset, batch_size=24, shuffle=False)
  # get predictions
  # df2 = pd.DataFrame({'predicted': [], 'actual': []})
  test_stats = []
  # Load model
  model = load_model(model_path)
  test_stats, all_prediction_scores, df2 = testing_tt(model, test_dataloader, tokenizer, test_stats)
  print("Test stats: \n", test_stats)
  print("Predictions shape: ", df2.shape)
  # get probabilties
  # Convert logits to probability
  df_probs = convert_to_prob(all_prediction_scores)
  # Append columns to test
  df_test['predicted'] = df2['predicted'].copy()
  df_test['actual'] = df2['actual'].copy()
  df_test['prob_normal'] = df_probs['prob_normal'].copy()
  df_test['prob_offensive'] = df_probs['prob_offensive'].copy()
  df_test['prob_hate'] = df_probs['prob_hate'].copy()
  return df_test

In [53]:
def data_for_testing_st_4(df_test_base, df_contrast):
  ## Select data-points with prefix = 'label'
  df_test = df_test_base[df_test_base['prefix']=='label'].copy()
  df_test = df_test.astype(str)
  df_test.reset_index(drop=True, inplace=True)

  # Rename input_text column
  df_test.rename(columns={'input_text':'sentence'}, inplace=True)

  ## Merge with contrast data
  df_test = df_test.merge(df_contrast[['sentence','contrast_example']].copy(), on = ['sentence'], how = 'inner')
  df_test.drop_duplicates(subset=['sentence'], keep='first', inplace=True)

  to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df_test["prefix"].tolist(), df_test["contrast_example"].tolist())
    ]
  truth = df_test["target_text"].tolist()
  tasks = df_test["prefix"].tolist()
  return to_predict, truth, tasks

In [54]:
def get_predictions_st4(df_test_base, df_contrast, model_path):
  to_predict, truth, tasks = data_for_testing_st_4(df_test_base, df_contrast)
  model = T5Model("t5", model_path + "/outputs/best_model", args=model_args_cls)

  df_test = df_test_base[df_test_base['prefix']=='label'].copy()

  # Get the model predictions
  preds = model.predict(to_predict)

  ## Taking only the top (first) predictions
  top_preds = [pred[0] for pred in preds]
  df_test["predicted"] = top_preds
  df_test['all_preds'] = preds

  # Classification Metrics
  df_test['actual_label'] = df_test['target_text'].map({'hate':0, 'normal':1, 'offensive':2})
  df_test['predicted_label'] = df_test['predicted'].map({'hate':0, 'normal':1, 'offensive':2})

  # Acc, P, R, F1
  y_true = df_test['actual_label'].values
  y_pred = df_test['predicted_label'].values
  target_names = ['hate','normal','offensive']
  print("Standard Metrics:")
  print(classification_report(y_true, y_pred, target_names=target_names, digits=4))

  return df_test

In [55]:
with torch.no_grad():
  torch.cuda.empty_cache()

In [56]:
iou = get_iou(df_test_kw)
print("IOU: ", np.round(iou,4))
df_test_kw_contrast = create_contrast_example(df_test_kw)

## Torch + Transformers
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent_v2/"
df_test = get_predictions_tt4(df_test_base, df_test_kw_contrast, model_path)
calculate_standard_metrics(df_test)

## SimpleTransformers
model_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/t5_modeling/t5_ip_sent/"
df_test = get_predictions_st4(df_test_base, df_test_kw_contrast, model_path)

IOU:  0.4776
DF Test Shape:  (1761, 7)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



Running Testing...
  Batch    40  of     74.    Elapsed: 0:00:18.
Test stats: 
 [{'Test Loss': 0.8320149106753839, 'Test PPL.': 2.297944231086512, 'Test Acc.': 0.42154654654654655, 'Test F1': 0.5373458513464926}]
Predictions shape:  (1761, 2)
Standard Metrics:
              precision    recall  f1-score   support

        hate     0.5938    0.0691    0.1238       550
      normal     0.4246    0.9544    0.5877       702
   offensive     0.2857    0.0668    0.1083       509

    accuracy                         0.4214      1761
   macro avg     0.4347    0.3634    0.2733      1761
weighted avg     0.4373    0.4214    0.3042      1761

AUROC:  0.6134921180789935


Generating outputs:   0%|          | 0/56 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/5283 [00:00<?, ?it/s]

Standard Metrics:
              precision    recall  f1-score   support

        hate     0.6977    0.0545    0.1012       550
      normal     0.4301    0.9430    0.5908       702
   offensive     0.3520    0.1238    0.1831       509

    accuracy                         0.4287      1761
   macro avg     0.4933    0.3738    0.2917      1761
weighted avg     0.4911    0.4287    0.3201      1761



### Baseline Accuracy using Majority Voting

In [57]:
df_test_label = df_test_base[df_test_base['prefix']=='label'].copy()
df_test_label.reset_index(drop=True,inplace=True)

count_vals = df_test_label['target_text'].value_counts()
print("Freq: \n", count_vals)
baseline_accuracy = count_vals.max()/df_test_label.shape[0]
print("Baseline Accuracy: ", baseline_accuracy)

Freq: 
 normal       702
hate         550
offensive    509
Name: target_text, dtype: int64
Baseline Accuracy:  0.3986371379897785


### Hugchat Baseline: Classification Metrics from Hugchat Responses

In [58]:
hugchat_responses_path = "/content/drive/MyDrive/CS4NLP-HateXplain/data/rationale_extraction/df_test_rationale_post_processed.csv"
df_test_hugchat = pd.read_csv(hugchat_responses_path)[['id','hugchat_label_processed']]
df_test_hugchat.dropna(inplace=True)
df_test_hugchat = df_test_hugchat.merge(df_test_label, on = ['id'], how = 'inner')
df_test_hugchat['hugchat_predicted'] = df_test_hugchat['hugchat_label_processed'].map({'hate_speech':0, 'normal_speech':1, 'offensive_speech':2})
df_test_hugchat['actual'] = df_test_hugchat['target_text'].map({'hate':0, 'normal':1, 'offensive':2})

target_names = ['hate','normal','offensive']
print("Standard Metrics:")
print(classification_report(df_test_hugchat['actual'].values, df_test_hugchat['hugchat_predicted'].values , target_names=target_names, digits=4))


Standard Metrics:
              precision    recall  f1-score   support

        hate     0.3922    0.9327    0.5522       550
      normal     0.6545    0.3595    0.4641       701
   offensive     0.5000    0.0650    0.1150       508

    accuracy                         0.4537      1759
   macro avg     0.5156    0.4524    0.3771      1759
weighted avg     0.5279    0.4537    0.3908      1759



In [60]:
## Checking hugchat classification metrics on toxic/non-toxic
df_test_hugchat['hugchat_predicted_toxic'] = df_test_hugchat['hugchat_label_processed'].map({'hate_speech':1, 'normal_speech':0, 'offensive_speech':1})
df_test_hugchat['actual_toxic'] = df_test_hugchat['target_text'].map({'hate':1, 'normal':0, 'offensive':1})

target_names = ['non-toxic','toxic']
print("Standard Metrics:")
print(classification_report(df_test_hugchat['actual_toxic'].values, df_test_hugchat['hugchat_predicted_toxic'].values , target_names=target_names, digits=4))


Standard Metrics:
              precision    recall  f1-score   support

   non-toxic     0.6545    0.3595    0.4641       701
       toxic     0.6732    0.8743    0.7607      1058

    accuracy                         0.6691      1759
   macro avg     0.6639    0.6169    0.6124      1759
weighted avg     0.6658    0.6691    0.6425      1759

