In [None]:
# fine-tunes BERT using 'unbiased' data (accuracy 0.749)

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

df = pd.read_csv('thesis_dataset.csv')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# function to preprocess data (with padding)
def preprocess_data(df, tokenizer, max_length=128):
    tokenized_texts = []
    labels = []
    genders = []
    for index, row in df.iterrows():
        tokens = tokenizer.encode(row['Name'] + ' ' + row['Education']+ ' ' + row['Work Experience'] + ' ' + row['Skills'] + ' ' + row['Job'])
        if len(tokens) < max_length:
            tokens = tokens + [tokenizer.pad_token_id] * (max_length - len(tokens))
        else:
            tokens = tokens[:max_length]
        tokenized_texts.append(tokens)
        
        # label -> numerical
        if row['Fit'] == 'very bad':
            label = 0
        elif row['Fit'] == 'bad':
            label = 1
        elif row['Fit'] == 'average':
            label = 2
        elif row['Fit'] == 'good':
            label = 3
        else:
            label = 4
        labels.append(label)
        
        genders.append(row['Gender'])

    return tokenized_texts, labels, genders

# split dataset
train_df, val_df = train_test_split(df, train_size=0.8, random_state=42)

# preprocess data
train_tokenized_texts, train_true_labels, train_sensitive_attribute = preprocess_data(train_df, tokenizer)
val_tokenized_texts, val_true_labels, val_sensitive_attribute = preprocess_data(val_df, tokenizer)

# convert to tensors
train_input_ids = torch.tensor(train_tokenized_texts)
train_true_labels = torch.tensor(train_true_labels)
val_input_ids = torch.tensor(val_tokenized_texts)
val_true_labels = torch.tensor(val_true_labels)

# dataloaders for training + validation sets
batch_size = 8
train_dataset = TensorDataset(train_input_ids, train_true_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_input_ids, val_true_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# results for bias metrics
val_preds = []
val_labels = []
val_genders = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# fine-tuning
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    # validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs, labels = batch
            outputs = model(inputs, labels=labels)
            val_loss = outputs.loss
            val_losses.append(val_loss.item())
            logits = outputs.logits
            val_preds.extend(logits.argmax(dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

            genders = val_sensitive_attribute[:len(logits)]  # ?
            val_genders.extend(genders)

    avg_val_loss = sum(val_losses) / len(val_losses)
    val_accuracy = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch + 1}/{epochs}:")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")

model.save_pretrained('fine_tuned_bert_model')

val_preds = np.array(val_preds)
val_labels = np.array(val_labels)
val_genders = np.array(val_genders)

last_val_preds = val_preds[-300:]
last_val_labels = val_labels[-300:]
last_val_genders = val_genders[-300:]



In [None]:
# fine-tunes BERT using biased data (accuracy 0.692)

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

df = pd.read_csv('biased.csv')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# function to preprocess data (with padding)
def preprocess_data(df, tokenizer, max_length=128):
    tokenized_texts = []
    labels = []
    genders = []
    for index, row in df.iterrows():
        tokens = tokenizer.encode(row['Name'] + ' ' + row['Education']+ ' ' + row['Work Experience'] + ' ' + row['Skills'] + ' ' + row['Job'])
        if len(tokens) < max_length:
            tokens = tokens + [tokenizer.pad_token_id] * (max_length - len(tokens))
        else:
            tokens = tokens[:max_length]
        tokenized_texts.append(tokens)
        
        # label -> numerical
        if row['Fit'] == 'very bad':
            label = 0
        elif row['Fit'] == 'bad':
            label = 1
        elif row['Fit'] == 'average':
            label = 2
        elif row['Fit'] == 'good':
            label = 3
        else:
            label = 4
        labels.append(label)
        
        genders.append(row['Gender'])

    return tokenized_texts, labels, genders

# split data
train_df, val_df = train_test_split(df, train_size=0.8, random_state=42)

# preprocess data
train_tokenized_texts, train_true_labels, train_sensitive_attribute = preprocess_data(train_df, tokenizer)
val_tokenized_texts, val_true_labels, val_sensitive_attribute = preprocess_data(val_df, tokenizer)

# convert to tensors
train_input_ids = torch.tensor(train_tokenized_texts)
train_true_labels = torch.tensor(train_true_labels)
val_input_ids = torch.tensor(val_tokenized_texts)
val_true_labels = torch.tensor(val_true_labels)

# dataloaders for training + validation sets
batch_size = 8
train_dataset = TensorDataset(train_input_ids, train_true_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_input_ids, val_true_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# results for bias metrics
val_preds_biased = []
val_labels_biased = []
val_genders_biased = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# fine-tuning
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_dataloader)

    # validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs, labels = batch
            outputs = model(inputs, labels=labels)
            val_loss = outputs.loss
            val_losses.append(val_loss.item())
            logits = outputs.logits
            val_preds_biased.extend(logits.argmax(dim=1).cpu().numpy())
            val_labels_biased.extend(labels.cpu().numpy())

            genders = val_sensitive_attribute[:len(logits)] # ?
            val_genders_biased.extend(genders)

    avg_val_loss = sum(val_losses) / len(val_losses)
    val_accuracy = accuracy_score(val_labels_biased, val_preds_biased)

    print(f"Epoch {epoch + 1}/{epochs}:")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")

model.save_pretrained('fine_tuned_bert_model')

val_preds_biased = np.array(val_preds_biased)
val_labels_biased = np.array(val_labels_biased)
val_genders_biased = np.array(val_genders_biased)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5:
  Training Loss: 0.0000
  Validation Loss: 1.2776
  Validation Accuracy: 0.4967
Epoch 2/5:
  Training Loss: 0.0000
  Validation Loss: 0.8436
  Validation Accuracy: 0.5767
Epoch 3/5:
  Training Loss: 0.0000
  Validation Loss: 0.7012
  Validation Accuracy: 0.6300
Epoch 4/5:
  Training Loss: 0.0000
  Validation Loss: 0.6639
  Validation Accuracy: 0.6650
Epoch 5/5:
  Training Loss: 0.0000
  Validation Loss: 0.5271
  Validation Accuracy: 0.6920


In [9]:
last_val_preds_biased = val_preds_biased[-300:]
last_val_labels_biased = val_labels_biased[-300:]
last_val_genders_biased = val_genders_biased[-300:]


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [3]:
import numpy as np

def demographic_parity_difference(predictions, sensitive_attribute):

    # convert preds and sensitive attribute to arrays
    predictions = np.array(predictions)
    sensitive_attribute = np.array(sensitive_attribute)
    
    # get labels + sensitive attribute vals
    unique_labels = np.unique(predictions)
    unique_sensitive_attribute = np.unique(sensitive_attribute)

    print(unique_labels)
    # print(sensitive_attribute)

    # store dpd
    dp_diff = np.zeros(len(unique_labels))
    privileged_gender = np.empty(len(unique_labels), dtype=object)

    
    # compute dpd for each label
    for i, label in enumerate(unique_labels):
        # find indices where pred is equal to current label
        label_indices = np.where(predictions == label)[0]
        
        # compute proportions of positive outcomes for each group
        group_proportions = []
        for a in unique_sensitive_attribute:
            # find indices where sensitive attribute is equal to current value
            group_indices = np.where(sensitive_attribute == a)[0]
            
            # compute proportion of positive outcomes for this group
            positive_proportion = np.mean(predictions[group_indices] == label)
            group_proportions.append(positive_proportion)
        
        # compute dpd for this label
        dp_diff[i] = max(group_proportions) - min(group_proportions)

    
        # ID privileged gender for this label
        privileged_index = np.argmax(group_proportions)
        privileged_gender[i] = unique_sensitive_attribute[privileged_index]
    
    return dp_diff, privileged_gender



In [4]:
def equal_opportunity_difference(predictions, true_labels, sensitive_attribute):

    # convert preds and sensitive attribute to arrays
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    sensitive_attribute = np.array(sensitive_attribute)
    
    # get labels + sensitive attribute vals
    unique_labels = np.unique(predictions)
    unique_sensitive_attribute = np.unique(sensitive_attribute)
    
    # store eod
    eo_diff = np.zeros(len(unique_labels))
    privileged_gender = np.empty(len(unique_labels), dtype=object)
    
    # compute eod for each label
    for i, label in enumerate(unique_labels):
        # find indices where pred equal to current label
        label_indices = np.where(predictions == label)[0]
        
        # compute TPR for each group
        tpr_group = []
        for a in unique_sensitive_attribute:
            # find indices where sensitive attribute equal to current value
            group_indices = np.where(sensitive_attribute == a)[0]
            
            # compute TPR for this group
            true_positives = np.sum((predictions[group_indices] == label) & (true_labels[group_indices] == label))
            actual_positives = np.sum(true_labels[group_indices] == label)
            
            if actual_positives > 0:
                true_positive_rate = true_positives / actual_positives
                tpr_group.append(true_positive_rate)
            else:
                tpr_group.append(0.0)  # division by zero case
        
        # compute eod for this label
        eo_diff[i] = max(tpr_group) - min(tpr_group)
        
        # ID privileged gender for this label
        privileged_index = np.argmax(tpr_group)
        privileged_gender[i] = unique_sensitive_attribute[privileged_index]
    
    return eo_diff, privileged_gender

In [5]:
def average_odds_difference(predictions, true_labels, sensitive_attribute):

    # convert preds and sensitive attribute to arrays
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)
    sensitive_attribute = np.array(sensitive_attribute)
    
    # get labels + sensitive attribute vals
    unique_labels = np.unique(predictions)
    unique_sensitive_attribute = np.unique(sensitive_attribute)
    
    # store aod
    aod_diff = np.zeros(len(unique_labels))
    privileged_gender = np.empty(len(unique_labels), dtype=object)
    
    # compute aod for each label
    for i, label in enumerate(unique_labels):
        # Find indices where prediction is equal to the current label
        label_indices = np.where(predictions == label)[0]
        
        # compute FPR and TPR for each group
        fpr_group = []
        tpr_group = []
        for a in unique_sensitive_attribute:
            # find indices where sensitive attribute equal to current val
            group_indices = np.where(sensitive_attribute == a)[0]
            
            # compute FPR for this group
            false_positives = np.sum((predictions[group_indices] == label) & (true_labels[group_indices] != label))
            actual_negatives = np.sum(true_labels[group_indices] != label)
            if actual_negatives > 0:
                false_positive_rate = false_positives / actual_negatives
                fpr_group.append(false_positive_rate)
            else:
                fpr_group.append(0.0)  # division by zero case
            
            # compute TPR for this group
            true_positives = np.sum((predictions[group_indices] == label) & (true_labels[group_indices] == label))
            actual_positives = np.sum(true_labels[group_indices] == label)
            if actual_positives > 0:
                true_positive_rate = true_positives / actual_positives
                tpr_group.append(true_positive_rate)
            else:
                tpr_group.append(0.0)  # division by zero case
        
        # compute AOD for this label
        aod_diff[i] = (max(fpr_group) - min(fpr_group) + max(tpr_group) - min(tpr_group)) / 2
        
        # ID privileged gender for this label
        privileged_index = np.argmax(tpr_group)
        privileged_gender[i] = unique_sensitive_attribute[privileged_index]
    
    return aod_diff, privileged_gender

In [6]:
def disparate_impact(predictions, sensitive_attribute):

    # convert preds and sensitive attribute to arrays
    predictions = np.array(predictions)
    sensitive_attribute = np.array(sensitive_attribute)
    
    # get labels + sensitive attribute vals
    unique_labels = np.unique(predictions)
    unique_sensitive_attribute = np.unique(sensitive_attribute)
    
    # store di ratio
    di_ratio = np.zeros(len(unique_labels))
    privileged_gender = np.empty(len(unique_labels), dtype=object)
    
    # compute di ratio for each label
    for i, label in enumerate(unique_labels):
        # Find indices where prediction is equal to the current label
        label_indices = np.where(predictions == label)[0]
        
        # compute proportions of positive outcomes for each group
        favorable_proportions = []
        for a in unique_sensitive_attribute:
            # find indices where sensitive attribute equal to current val
            group_indices = np.where(sensitive_attribute == a)[0]
            
            # compute proportion of positive outcomes for this group
            favorable_proportion = np.mean(predictions[group_indices] == label)
            favorable_proportions.append(favorable_proportion)
        
        # compute di ratio for this label
        di_ratio[i] = favorable_proportions[1] / favorable_proportions[0] if favorable_proportions[0] != 0 else 0
        
        # ID privileged gender for this label
        privileged_index = np.argmax(favorable_proportions)
        privileged_gender[i] = unique_sensitive_attribute[privileged_index]
    
    return di_ratio, privileged_gender

In [7]:
# bias metrics for BERT fine-tuned with unbiased data

dp_diff = demographic_parity_difference(last_val_preds, last_val_genders)
print("Demographic parity difference for each label:", dp_diff)

eo_diff = equal_opportunity_difference(last_val_preds, last_val_labels, last_val_genders)
print("Equal opportunity difference for each label:", eo_diff)

ao_diff = average_odds_difference(last_val_preds, last_val_labels, last_val_genders)
print("Average odds difference for each label:", ao_diff)

di_ratio = disparate_impact(last_val_preds, last_val_genders)
print("Disparate impact for each label:", di_ratio)

## acc after 5 epochs = 0.7490

[0 1 2 3 4]
Demographic parity difference for each label: (array([0.05777778, 0.04      , 0.06222222, 0.02222222, 0.05777778]), array(['female', 'male', 'female', 'male', 'male'], dtype=object))
Equal opportunity difference for each label: (array([0.06111111, 0.10541311, 0.25641026, 0.11627907, 0.11713287]), array(['female', 'female', 'female', 'female', 'male'], dtype=object))
Average odds difference for each label: (array([0.04015152, 0.05926192, 0.14164599, 0.07515478, 0.06961616]), array(['female', 'female', 'female', 'female', 'male'], dtype=object))
Disparate impact for each label: (array([0.79365079, 1.25      , 0.70833333, 1.10416667, 1.43333333]), array(['female', 'male', 'female', 'male', 'male'], dtype=object))


In [11]:
# bias metrics for BERT fine-tuned with biased data

dp_diff = demographic_parity_difference(last_val_preds_biased, last_val_genders_biased)
print("Demographic parity difference for each label:", dp_diff)

eo_diff = equal_opportunity_difference(last_val_preds_biased, last_val_labels_biased, last_val_genders_biased)
print("Equal opportunity difference for each label:", eo_diff)

ao_diff = average_odds_difference(last_val_preds_biased, last_val_labels_biased, last_val_genders_biased)
print("Average odds difference for each label:", ao_diff)

di_ratio = disparate_impact(last_val_preds_biased, last_val_genders_biased)
print("Disparate impact for each label:", di_ratio)

## acc after 5 epochs = 0.6920

[0 1 2 3 4]
Demographic parity difference for each label: (array([0.03111111, 0.05333333, 0.02222222, 0.02222222, 0.02222222]), array(['female', 'male', 'male', 'female', 'female'], dtype=object))
Equal opportunity difference for each label: (array([0.05892256, 0.08974359, 0.06730769, 0.03935599, 0.13766234]), array(['female', 'female', 'male', 'female', 'female'], dtype=object))
Average odds difference for each label: (array([0.04634311, 0.05364372, 0.05474916, 0.03766807, 0.07376538]), array(['female', 'female', 'male', 'female', 'female'], dtype=object))
Disparate impact for each label: (array([0.87719298, 1.33333333, 1.15151515, 0.9122807 , 0.88095238]), array(['female', 'male', 'male', 'female', 'female'], dtype=object))
