# Bias Uncovering Test Case (BTC) for Country-of-origin Bias

In [1]:
import os
import pandas as pd
import numpy as np
import math
import time
import pickle

from metric import evaluate_btc, evaluate_btc_mtnlp
from utils import accuracy
from utils import load_mutant_and_prediction

## Measuring the number of BTCs found by BiasFinder

In [2]:
mutation_tool = "biasfinder"
bias_type = "occupation"
task = "imdb"  # dataset used for fine-tuning
model = "bert-base-uncased"
mutant = "imdb"  # dataset used for generating mutants


df = load_mutant_and_prediction(mutation_tool, model, bias_type, mutant)

df.head()


Unnamed: 0,label,mutant,template,original,gender,template_id,prediction,original_prediction
0,0,THE PLOT: A trucker (Kristofferson) battles a...,THE PLOT: A trucker (Kristofferson) battles <...,THE PLOT: A trucker (Kristofferson) battles a ...,attendant,9884,0,0
1,0,THE PLOT: A trucker (Kristofferson) battles a...,THE PLOT: A trucker (Kristofferson) battles <...,THE PLOT: A trucker (Kristofferson) battles a ...,cashier,9884,0,0
2,0,THE PLOT: A trucker (Kristofferson) battles a...,THE PLOT: A trucker (Kristofferson) battles <...,THE PLOT: A trucker (Kristofferson) battles a ...,teacher,9884,0,0
3,0,THE PLOT: A trucker (Kristofferson) battles a...,THE PLOT: A trucker (Kristofferson) battles <...,THE PLOT: A trucker (Kristofferson) battles a ...,nurse,9884,0,0
4,0,THE PLOT: A trucker (Kristofferson) battles a...,THE PLOT: A trucker (Kristofferson) battles <...,THE PLOT: A trucker (Kristofferson) battles a ...,assistant,9884,0,0


In [3]:
print("Accuracy on mutants: {:.2f}%".format(accuracy(df["label"],df["prediction"])))


Accuracy on mutants: 91.07%


In [4]:
def print_evaluation(evaluation):
    print("# Mutants \t:", evaluation["mutant"])
    print("# Templates \t:", evaluation["template"])
    print("# BTCs \t\t:", evaluation["btc"])

# evaluation = evaluate_btc(df["label"], df["prediction"], df["mutant"], df["template"], "gender", df["gender"])
# print_evaluation(evaluation)

## IMDB

Measure the number of bias uncovering test cases found by BiasFinder on models fine-tuned using IMDB dataset.

In [5]:
models = ["bert-base-uncased", "bert-base-cased", "roberta-base", "xlnet-base-cased",
          "albert-base-v2", "microsoft/mpnet-base", "facebook/muppet-roberta-base", "google/electra-base-generator"]


mutation_tools = ["biasfinder"]
task = "imdb"  # dataset used for fine-tuning
mutant = "imdb"  # dataset used for generating mutants

d = pd.DataFrame(columns=["model", "tool",
                          "template", "mutant", "btc"])

for model in models:
    for mutation_tool in mutation_tools:
        print(model)
        df = load_mutant_and_prediction(
            mutation_tool, model, bias_type, mutant)
        if mutation_tool == "mtnlp" :
            evaluation = evaluate_btc_mtnlp(df)
        else : 
            evaluation = evaluate_btc(df["label"], df["prediction"],
                                  df["mutant"], df["template"], "gender", df["gender"])

        d = d.append(
            {
                "model": model,
                "tool": mutation_tool,
                "template": evaluation['template'],
                "mutant": evaluation['mutant'],
                "btc": evaluation['btc']
            },
            ignore_index=True)

d


bert-base-uncased
bert-base-cased
roberta-base
xlnet-base-cased
albert-base-v2
microsoft/mpnet-base
facebook/muppet-roberta-base
google/electra-base-generator


Unnamed: 0,model,tool,template,mutant,btc
0,bert-base-uncased,biasfinder,14319,1131201,174666
1,bert-base-cased,biasfinder,14319,1131201,200400
2,roberta-base,biasfinder,14319,1131201,134926
3,xlnet-base-cased,biasfinder,14319,1131201,106574
4,albert-base-v2,biasfinder,14319,1131201,184496
5,microsoft/mpnet-base,biasfinder,14319,1131201,79420
6,facebook/muppet-roberta-base,biasfinder,14319,1131201,85256
7,google/electra-base-generator,biasfinder,14319,1131201,118338


## The total number of BTCs found

In [6]:
d.drop(columns=["model"]).groupby("tool").sum()

biasfinder


## Twitter Sentiment 140

Measure the number of bias uncovering test cases found by BiasFinder models fine-tuned using Twitter Sentiment 140 dataset.

In [7]:
models = ["bert-base-uncased", "bert-base-cased", "roberta-base", "xlnet-base-cased",
          "albert-base-v2", "microsoft/mpnet-base", "facebook/muppet-roberta-base", "google/electra-base-generator"]

mutation_tools = ["biasfinder"]

task = "twitter_s140"  # dataset used for fine-tuning
mutant = "twitter_s140"  # dataset used for generating mutants

d = pd.DataFrame(columns=["model", "tool",
                          "template", "mutant", "btc"])

for model in models:
    for mutation_tool in mutation_tools:
        df = load_mutant_and_prediction(
            mutation_tool, model, bias_type, mutant)
        if mutation_tool == "mtnlp":
            evaluation = evaluate_btc_mtnlp(df)
        else:
            evaluation = evaluate_btc(df["label"], df["prediction"],
                                      df["mutant"], df["template"], "gender", df["gender"])

        d = d.append(
            {
                "model": model,
                "tool": mutation_tool,
                "template": evaluation['template'],
                "mutant": evaluation['mutant'],
                "btc": evaluation['btc']
            },
            ignore_index=True)

d


Unnamed: 0,model,tool,template,mutant,btc
0,bert-base-uncased,biasfinder,202,15958,25336
1,bert-base-cased,biasfinder,202,15958,16938
2,roberta-base,biasfinder,202,15958,25656
3,xlnet-base-cased,biasfinder,202,15958,16648
4,albert-base-v2,biasfinder,202,15958,16956
5,microsoft/mpnet-base,biasfinder,202,15958,19026
6,facebook/muppet-roberta-base,biasfinder,202,15958,20098
7,google/electra-base-generator,biasfinder,202,15958,15458


## The total number of BTCs found

In [8]:
d.drop(columns=["model"]).groupby("tool").sum()


biasfinder
