# Bias Uncovering Test Case (BTC) for Gender Bias

In [1]:
import os
import pandas as pd
import numpy as np
import math
import time
import pickle

from metric import evaluate_btc

## Measuring the Performance of the Fine-tuned Sentiment Analysis (SA) Models on the Test Set

In [2]:
def load_pickle(fpath):
    """Open ``fpath`` in binary mode and return the object unpickled from it.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files produced by a trusted pipeline.
    """
    with open(fpath, 'rb') as handle:
        return pickle.load(handle)

def accuracy(label, prediction):
    """Return the percentage of predictions equal to their label.

    Args:
        label: expected values (list, NumPy array, pandas Series, ...).
        prediction: predicted values, same shape as ``label``.

    Returns:
        float: ``100 * n_correct / n_total`` rounded to two decimals.

    Raises:
        ValueError: if the two inputs differ in shape or are empty.
    """
    # Coerce to arrays first: ``list == list`` yields a single bool instead of
    # an element-wise mask, which made the original fail on plain lists.
    labels = np.asarray(label)
    predictions = np.asarray(prediction)

    if labels.shape != predictions.shape:
        raise ValueError(
            "label and prediction must have the same shape, got "
            f"{labels.shape} and {predictions.shape}"
        )
    if labels.size == 0:
        raise ValueError("cannot compute accuracy on empty input")

    n_correct = int(np.count_nonzero(labels == predictions))
    return round(100 * n_correct / labels.size, 2)

def calculate_test_accuracy(task, model):
    """Score a model's saved test-set predictions against the task's labels.

    Labels are read from column 0 of the header-less, tab-separated file
    ``../../asset/{task}/test.csv``; predictions are unpickled from
    ``../../asset/{task}/predictions/{model}.pkl``. Both paths are relative,
    and ``model`` is interpolated verbatim, so a name containing ``/`` maps to
    a nested directory.

    Returns:
        The value produced by ``accuracy(labels, predictions)``.
    """
    label_path = f"../../asset/{task}/test.csv"
    pred_path = f"../../asset/{task}/predictions/{model}.pkl"

    gold_labels = pd.read_csv(label_path, header=None, sep="\t")[0].values
    predicted_labels = load_pickle(pred_path)

    return accuracy(gold_labels, predicted_labels)


# Fine-tuning dataset and checkpoint whose test-set accuracy is reported below.
model = "bert-base-uncased"
task = "twitter_s140"

calculate_test_accuracy(task=task, model=model)


82.76

## Measuring the BTCs that can be found by BiasFinder

In [3]:
# Fine-tuned classifier under test.
model = "bert-base-uncased"
task = "twitter_semeval"  # dataset used for fine-tuning

# Mutant set the classifier is evaluated on.
mutation_tool = "biasfinder"
bias_type = "gender"
mutant = "twitter_semeval"  # dataset used for generating mutants

def load_mutant_and_prediction(mutation_tool, bias_type, mutant, model=None):
    """Load a mutant test set together with one model's predictions on it.

    Reads the header-less, tab-separated
    ``../../data/{mutation_tool}/{bias_type}/{mutant}/test.csv`` using the
    column layout of the given tool, adds a ``template_id`` column (category
    codes of ``template``) and a ``prediction`` column unpickled from
    ``predictions/{model}.pkl`` in the same directory.

    Args:
        mutation_tool: ``"biasfinder"`` or ``"eec"``; selects the column names.
        bias_type: path component under the tool directory (e.g. ``"gender"``).
        mutant: name of the dataset the mutants were generated from.
        model: name of the model whose predictions are loaded. When omitted,
            the module-level ``model`` variable is used, which keeps existing
            three-argument calls working.

    Returns:
        pandas.DataFrame: the mutants with ``template_id`` and ``prediction``.

    Raises:
        ValueError: if ``mutation_tool`` is unsupported, or if no model was
            passed and no module-level ``model`` is defined.
    """
    columns_by_tool = {
        "biasfinder": ["label", "mutant", "template", "original", "gender"],
        "eec": ["label", "mutant", "template", "original", "person", "gender", "emotion"],
    }
    # Fail early with a clear message; previously an unknown tool fell through
    # both branches and crashed later on the unassigned ``df``.
    if mutation_tool not in columns_by_tool:
        raise ValueError(
            f"unsupported mutation_tool {mutation_tool!r}; "
            f"expected one of {sorted(columns_by_tool)}"
        )

    if model is None:
        # Backward compatibility: older callers rely on a global ``model``.
        model = globals().get("model")
        if model is None:
            raise ValueError("no model given and no module-level `model` is defined")

    base_dir = f"../../data/{mutation_tool}/{bias_type}/{mutant}/"
    df = pd.read_csv(base_dir + "test.csv", header=None, sep="\t",
                     names=columns_by_tool[mutation_tool])

    # Integer id per distinct template text.
    df["template"] = df["template"].astype("category")
    df["template_id"] = df["template"].cat.codes

    prediction_fpath = os.path.join(base_dir, f"predictions/{model}.pkl")
    df["prediction"] = load_pickle(prediction_fpath)

    return df


# Load the mutants for the configuration above along with the model's predictions.
df = load_mutant_and_prediction(
    mutation_tool=mutation_tool,
    bias_type=bias_type,
    mutant=mutant,
)

df.head()


FileNotFoundError: [Errno 2] No such file or directory: '../../data/biasfinder/gender/twitter_semeval/predictions/bert-base-uncased.pkl'

In [None]:
# Percentage of mutants whose prediction equals the label.
print(
    "Accuracy on mutants: {:.2f}%".format(
        accuracy(df["label"], df["prediction"])
    )
)


Accuracy on mutants: 84.46%


In [4]:
def print_evaluation(evaluation):
    """Print the ``mutant``, ``template`` and ``btc`` entries of ``evaluation``.

    Each entry is written on its own line after a fixed, tab-aligned heading.
    """
    headed_keys = (
        ("# Mutants \t:", "mutant"),
        ("# Templates \t:", "template"),
        ("# BTCs \t\t:", "btc"),
    )
    for heading, key in headed_keys:
        print(heading, evaluation[key])

# Evaluate the loaded mutants; the result's "mutant", "template" and "btc"
# entries are printed below.
evaluation = evaluate_btc(
    df["label"],
    df["prediction"],
    df["mutant"],
    df["template"],
    "gender",
    df["gender"],
)
print_evaluation(evaluation)

NameError: name 'df' is not defined

## Comparing the BTCs found by BiasFinder and EEC

In [None]:
# Build the BiasFinder-vs-EEC comparison table: one row per (tool, model).
# Rows are collected in a list and turned into a DataFrame once at the end;
# ``DataFrame.append`` was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
models = ["bert-base-uncased", "bert-base-cased", "roberta-base", "xlnet-base-cased",
          "albert-base-v2", "microsoft/mpnet-base", "microsoft/deberta-base",
          "facebook/muppet-roberta-base", "google/electra-base-generator"]

mutation_tools = ["biasfinder", "eec"]
bias_type = "gender"
task = "twitter_s140"  # dataset used for fine-tuning
mutant = "twitter_s140"  # dataset used for generating mutants

# Test-set accuracy depends only on (task, model), so compute it once per
# model and reuse it for every mutation tool.
test_accuracy_by_model = {}
records = []

for mutation_tool in mutation_tools:
    # NOTE: load_mutant_and_prediction is called without a model and resolves
    # the prediction file from the module-level ``model`` variable, so this
    # loop must stay at module level with the loop variable named ``model``.
    for model in models:
        if model not in test_accuracy_by_model:
            test_accuracy_by_model[model] = calculate_test_accuracy(task, model)
        test_accuracy = test_accuracy_by_model[model]

        df = load_mutant_and_prediction(mutation_tool, bias_type, mutant)
        evaluation = evaluate_btc(df["label"], df["prediction"],
                                  df["mutant"], df["template"], "gender", df["gender"])

        records.append(
            {
                "tool": mutation_tool,
                "model": model,
                "accuracy": test_accuracy,
                "template": evaluation['template'],
                "mutant": evaluation['mutant'],
                "btc": evaluation['btc'],
            }
        )

d = pd.DataFrame(records, columns=["tool", "model", "accuracy",
                                   "template", "mutant", "btc"])


In [None]:
# Display the comparison table collected above (one row per mutation tool and model).
d

Unnamed: 0,tool,model,template,mutant,btc
0,biasfinder,bert-base-uncased,1672,58388,14824
1,biasfinder,bert-base-cased,1672,58388,14983
2,biasfinder,roberta-base,1672,58388,13439
3,biasfinder,xlnet-base-cased,1672,58388,244792
4,biasfinder,albert-base-v2,1672,58388,9498
5,biasfinder,microsoft/mpnet-base,1672,58388,21406
6,biasfinder,microsoft/deberta-base,1672,58388,15194
7,biasfinder,facebook/muppet-roberta-base,1672,58388,17124
8,biasfinder,google/electra-base-generator,1672,58388,11367
9,eec,bert-base-uncased,140,8400,2132
