# Bias mitigated Toxicity predictions with RoBERTa
# Necessary files: roberta2_finetuned (folder), dataset.py, all_data.csv
#### Running times are a main concern for later expanding how much data we use, but for now will use this as basis for building some bias detection 

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import re
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import torch.nn as nn
import torch.nn.functional as f
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

import dataset

In [4]:

df = pd.read_csv('all_data.csv')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)  # Regression task

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [6]:
df['comment_text'] = df['comment_text'].fillna('') # Remove NaN values
df['cleaned_comment'] = df['comment_text'].apply(clean_text) 

In [7]:
df['target'] = df['toxicity'] # Mark target col
data = dataset.split_dataframe(df)

# Assign dfs
traindf = data[0]
testdf = data[1]

29992 159782


In [8]:
# Using smaller subset 
trainsubset = traindf[:25000]
testsubset = testdf[:2500]

In [9]:
# Treat NaN values in identity cols
identities = ['male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability']

trainsubset[identities] = trainsubset[identities].fillna(0.0)

testsubset[identities] = testsubset[identities].fillna(0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trainsubset[identities] = trainsubset[identities].fillna(0.0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testsubset[identities] = testsubset[identities].fillna(0.0)


In [None]:

def adversarialloss(preds, identities_col_agg):
    return f.mse_loss(preds.flatten(), identities_col_agg.flatten())

def biasloss(modelchoice, input_ids, attention_mask, labels, identities_col, bias_penalty_factor=0.5):
    '''Calculates loss based on model's toxicity prediction with the bias penalty 
     bias_penalty_factor: factor for how much to penalize bias '''
    logits = modelchoice(input_ids, attention_mask=attention_mask).logits
    loss = f.mse_loss(logits.flatten(), labels)
    # Adversarial loss
    adversary_logits = modelchoice(input_ids, attention_mask=attention_mask).logits  
    adversary_loss = adversarialloss(adversary_logits, identities_col)
    
    # Calculate with adversarial penalty
    total_loss = loss - bias_penalty_factor * adversary_loss
    
    return total_loss
def train_with_penalty(model, train_dataloader, optimizer, device, bias_penalty_factor=0.5):
    ''' Training roberta model with adversarial loss to address bias'''
    
    model.train()
    
    for epoch in range(5):
        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()  # Clear gradients

            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            identities_col = batch[3].to(device)  
                        
            # Compute the loss with bias regularization
            loss = biasloss(model, input_ids, attention_mask, labels, identities_col, bias_penalty_factor)
            loss.backward() 
            optimizer.step()

    print(f"Epoch {epoch + 1}: Loss {loss.item()}")

Xtrain_r2 = list(trainsubset['cleaned_comment'])
ytrain_r2 = list(trainsubset['toxicity'])
Xtest_r2 = list(testsubset['cleaned_comment'])
ytest_r2 = list(testsubset['toxicity'])

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta2 = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)  # Regression task

Xtrain_encodings = tokenizer(Xtrain_r2, truncation=True, padding=True, max_length=200, return_tensors='pt') # choosing max length of comment
Xtest_encodings = tokenizer(Xtest_r2, truncation=True, padding=True, max_length=200, return_tensors='pt')

# Convert to tensors to prepare for dataloader 
ytrain_tensor = torch.tensor(ytrain_r2, dtype=torch.float)
ytest_tensor = torch.tensor(ytest_r2, dtype=torch.float)

identities_col_train = torch.tensor(trainsubset[identities].values, dtype=torch.float)
identities_col_test = torch.tensor(testsubset[identities].values, dtype=torch.float)

# Penalizing the loss on any identity label presence or absence of minority and adjust the loss accordingly
identities_col_train_agg = (identities_col_train.any(dim=1).float()).unsqueeze(1)
identities_col_test_agg  = (identities_col_test.any(dim=1).float()).unsqueeze(1)


Xtraintorch = TensorDataset(Xtrain_encodings['input_ids'], Xtrain_encodings['attention_mask'], ytrain_tensor, identities_col_train_agg)
Xtesttorch = TensorDataset(Xtest_encodings['input_ids'], Xtest_encodings['attention_mask'], ytest_tensor, identities_col_test_agg)
# Try different batch size to reduce running time



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Apply bias mitigation with adversarial loss to data

In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    roberta2 = nn.DataParallel(roberta2)

roberta2 = roberta2.to(device)
optimizer = torch.optim.AdamW(roberta2.parameters(), lr=1e-5)

In [10]:
with torch.no_grad():
    torch.cuda.empty_cache()


In [15]:
train_dataloader = DataLoader(Xtraintorch, batch_size=164, shuffle=True)
test_dataloader = DataLoader(Xtesttorch, batch_size=164, shuffle=False)
# train_with_penalty(roberta2, train_dataloader, optimizer, device, bias_penalty_factor=0.5)


In [None]:
# Save model
#  roberta2.module.save_pretrained('roberta2_finetuned')

In [None]:
## For inference
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
inference_model = RobertaForSequenceClassification.from_pretrained('roberta2_finetuned').to(device)

def evaluate_model(model, test_dataloader):
    ''' Evaluate model on test data using same framework as the training loop
    Params: model: torch.nn.Module, test_dataloader: torch.dataloader 
    Output: tuple: pred (list) of predicted toxicity scores for the test data, actual (list) of true scores
    '''
    model.eval()
    pred = []
    actual = []
    
    with torch.no_grad(): # no gradient calculation for faster running 
        for batch in tqdm(test_dataloader):
            # Get the input data and labels from the batch
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            # Forward pass: Compute predictions
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            pred.extend(logits.cpu().numpy())
            actual.extend(labels.cpu().numpy()) 

    return pred, actual

pred2, actual2 = evaluate_model(inference_model, test_dataloader)

100%|██████████| 16/16 [00:05<00:00,  2.93it/s]


In [None]:
mse2 = mean_squared_error(actual2, pred2)
print(f'Mean Squared Error for Roberta bias mitigated model using 25000/2500 data split: {mse2}')


Mean Squared Error for Roberta bias mitigated model using 25000/2500 data split: 0.37158588833321393


In [30]:
binary_preds2 = [1 if pred>=0.4 in pred2 else 0 for pred in pred2]
binary_labels2 = [1 if label>=0.4 in actual2 else 0 for label in actual2]

print(f'Accuracy for Bias-mitigated Binary Classification with threshold of '
        f'0.4 : {100 * accuracy_score(binary_labels2, binary_preds2):0.2f}%')


Accuracy for Bias-mitigated Binary Classification with threshold of 0.4 : 100.00%
