# Prepare CSV files for the user study

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Seed handed to every `DataFrame.sample(..., random_state=random_seed)` call in
# this notebook, so the shuffled row order (and the sample cut from it) is
# the same on every run.
random_seed = 123

In [3]:
def load_mutant(mutation_tool, bias_type, mutant):
    """Load the headerless, tab-separated ``test.csv`` for one tool / bias / dataset.

    The file is read from ``../../data/{mutation_tool}/{bias_type}/{mutant}/test.csv``
    (relative to the current working directory) and its columns are named
    according to the mutation tool that produced it.

    Parameters
    ----------
    mutation_tool : str
        One of ``"biasfinder"``, ``"eec"`` or ``"mtnlp"``; selects the column names.
    bias_type : str
        Directory name below the tool directory (e.g. ``"gender"``).
    mutant : str
        Directory name below the bias-type directory; callers in this notebook
        pass a dataset name such as ``"imdb"``.

    Returns
    -------
    pandas.DataFrame
        The file contents with tool-specific column names.

    Raises
    ------
    ValueError
        If ``mutation_tool`` is not one of the supported tools.
    """
    csv_path = f"../../data/{mutation_tool}/{bias_type}/{mutant}/test.csv"

    # Column names per tool, in file order. Note that "mtnlp" stores
    # "original" before "template", unlike the other two tools.
    column_layouts = {
        "biasfinder": ["label", "mutant", "template", "original", "gender"],
        "eec": ["label", "mutant", "template",
                "original", "person", "gender", "emotion"],
        "mtnlp": ["label", "mutant", "original",
                  "template", "gender", "mutation_type"],
    }

    for tool_name, column_names in column_layouts.items():
        if mutation_tool == tool_name:
            break
    else:
        # No layout matched: refuse to guess the schema.
        raise ValueError("Unknown mutation tool")

    return pd.read_csv(csv_path, sep="\t", header=None, names=column_names)


In [4]:
mutation_tool = "biasfinder"
bias_type = "gender"
data = ["imdb", "twitter_s140"]

# Load every dataset for this tool / bias type; each frame carries the
# column names assigned by `load_mutant`.
frames = [load_mutant(mutation_tool, bias_type, dataset_name) for dataset_name in data]
n = sum(len(frame) for frame in frames)

# Stack the datasets and keep the original text, its template and the mutated
# text, renamed to the column names used in the exported user-study file.
# (The mutated-text column was previously misspelled "muated_text".)
df = (
    pd.concat(frames, ignore_index=True)[["original", "template", "mutant"]]
    .rename(columns={"original": "original_text", "mutant": "mutated_text"})
)

# Sanity check: both numbers should be the total number of loaded rows.
print(len(df))
print(n)

# Shuffle all rows reproducibly; later cells take the first rows as the sample.
df = df.sample(n=n, random_state=random_seed)
df.head()

KeyError: 1

In [62]:
sample_size = 400

# Take the first `sample_size` rows of the shuffled frame by position.
# `.assign` returns a new frame, so the blank annotation column is added
# without writing into a slice of `df` (which triggers pandas'
# SettingWithCopyWarning, currently hidden by the global warnings filter).
sample = (
    df.iloc[:sample_size]
    .assign(is_make_sense="")   # left empty for annotators to fill in
    .reset_index(drop=True)     # renumber rows 0..len(sample)-1
)
sample

Unnamed: 0,text,is_make_sense
0,"Listenin to Dolla ... Jorge , u will b missed ...",
1,"This movie is from the 80s, but it looks like ...",
2,Dirty Dancing follows the story of Frances ' B...,
3,I have been a huge Arnold fan ever since his b...,
4,I have seen cheesy kung fu fight films. Living...,
...,...,...
395,yay my good buddie Duane has given me a free t...,
396,how can we say to a 18 years old sister that s...,
397,I wish they would just make a special section ...,
398,"Jared so stupid, he got stabbed in a shoot out.",


In [63]:
# Destination of the unlabelled sample for the current tool / bias type.
fpath = f"../../user_study/TSE_revised/{mutation_tool}/{bias_type}-unlabelled.csv"

# `to_csv` does not create missing parent directories, so make sure the
# per-tool output folder exists before writing.
os.makedirs(os.path.dirname(fpath), exist_ok=True)

# The row index is written as the first column (pandas default).
sample.to_csv(fpath)

In [66]:
## Automate all data
# For every (mutation tool, bias type) pair: pool the mutated texts of all
# datasets, shuffle them reproducibly, keep the first `sample_size` rows, add an
# empty `is_make_sense` column and write the result as an unlabelled CSV.
sample_size = 400

tools = ["biasfinder", "mtnlp"]
tool2types = {"biasfinder": ["gender", "occupation", "country"], "mtnlp": ["gender"]}

data = ["imdb", "twitter_s140"]


for mutation_tool in tools:
    for bias_type in tool2types[mutation_tool]:

        # The files are headerless and tab-separated; column 1 is the one
        # `load_mutant` names "mutant" for every supported tool.
        mutants = []
        for dataset_name in data:
            label_path = f"../../data/{mutation_tool}/{bias_type}/{dataset_name}/test.csv"
            d = pd.read_csv(label_path, header=None, sep="\t")
            mutants.extend(d[1].values)

        df = pd.DataFrame(data={"text": mutants})
        n = len(df)

        # Shuffle every row with the fixed seed, then cut the sample from the top.
        df = df.sample(n=n, random_state=random_seed)

        # `.assign` builds a new frame instead of writing into a slice of `df`,
        # which would raise pandas' SettingWithCopyWarning.
        sample = (
            df.iloc[:sample_size]
            .assign(is_make_sense="")
            .reset_index(drop=True)
        )

        fpath = f"../../user_study/TSE_revised/{mutation_tool}/{bias_type}-unlabelled.csv"
        # `to_csv` does not create missing directories; without this a missing
        # per-tool folder would abort the loop part-way through.
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        sample.to_csv(fpath)


217030
217030
1153321
1153321
94700
94700
310619
310619
