In [None]:
# Instructions on how to generate the BIOS dataset can be found in this repository:
# https://github.com/Microsoft/biosbias

In [None]:
import pickle
import shutil
from pathlib import Path
import numpy as np
from collections import Counter

In [None]:
### Configure manually ###

# Path to BIOS.pkl. For instructions on how to generate it, see the biosbias
# repository linked at the top of this notebook.
bios_pkl_path = "/share/cp/datasets/nlp/text_classification_bias/bios/BIOS.pkl"

# Fractions of each title's samples that go to the validation and test splits;
# whatever remains becomes the training split.
val_size = 0.2
test_size = 0.25

# Seed for the NumPy random generator used for shuffling and oversampling.
seed = 0

##########################

In [None]:
# One seeded generator shared by all random steps in this notebook (the
# per-title shuffles for the split and the oversampling draws). Its state
# advances with every call, so results are only reproducible when the cells
# are executed in order from a fresh generator.
rng = np.random.default_rng(seed)

In [None]:
# Local output directory: receives a copy of the source pickle and, later on,
# every file this notebook generates.
data_folder = Path("data")
data_folder.mkdir(exist_ok=True)
shutil.copyfile(bios_pkl_path, data_folder.joinpath("BIOS.pkl"))

In [None]:
# Load the biographies from the local copy of the pickle.
# NOTE: unpickling can execute arbitrary code -- only load a BIOS.pkl that
# comes from a trusted source.
with (data_folder / "BIOS.pkl").open("rb") as f:
    bios_dicts = pickle.load(f)

In [None]:
# (title, gender) pair for every biography, in dataset order
combs = [(bio["title"], bio["gender"]) for bio in bios_dicts]
# Distinct titles and distinct genders that occur in the dataset
titles, genders = map(set, zip(*combs))

In [None]:
# Group the dataset by title:
#   title_dict[title] -> genders of all biographies with that title
#   title_idx[title]  -> positions of those biographies in bios_dicts
title_dict = {}
title_idx = {}
for i, (title, gender) in enumerate(combs):
    title_dict.setdefault(title, []).append(gender)
    title_idx.setdefault(title, []).append(i)

In [None]:
# Show the gender distribution within each title of the full dataset
for t in title_dict:
    g = title_dict[t]
    print(f"{t}:\n{Counter(g)}")

In [None]:
# Split per title: shuffle each title's sample indices independently and cut
# them into a validation part, a test part and a training part (the remainder),
# so every title contributes its val_size / test_size share to the splits.
idx_list = []
for title, indices in title_idx.items():
    n_val = int(len(indices) * val_size)
    n_test = int(len(indices) * test_size)
    shuffled = rng.permutation(np.array(indices))
    # Order inside each entry: (val, test, train)
    idx_list.append(
        [
            shuffled[:n_val],
            shuffled[n_val:n_val + n_test],
            shuffled[n_val + n_test:],
        ]
    )

# Regroup the per-title pieces by split and look up the actual biographies
val_parts, test_parts, train_parts = zip(*idx_list)
val = [bios_dicts[i] for i in np.concatenate(val_parts)]
test = [bios_dicts[i] for i in np.concatenate(test_parts)]
train = [bios_dicts[i] for i in np.concatenate(train_parts)]

In [None]:
# For each split, in the order (val, test, train), build a mapping
# title -> {gender: number of samples}.
split_counts = []
for split in (val, test, train):
    genders_by_title = {}
    for bio in split:
        genders_by_title.setdefault(bio["title"], []).append(bio["gender"])
    split_counts.append(
        {t: dict(Counter(gs)) for t, gs in genders_by_title.items()}
    )

In [None]:
# Print, for every title, its gender counts in the val, test and train splits
# (in that order).
# `titles` is a set of strings, so its iteration order changes between
# interpreter sessions; sorting makes the report stable and comparable.
for title in sorted(titles):
    print(title)
    for counts in split_counts:
        # A rare title may have no samples in a split; show an empty count
        # instead of failing with a KeyError.
        print(counts.get(title, {}))

In [None]:
# Balance the training split by oversampling: within every title, samples of
# each under-represented gender are duplicated until that gender has as many
# samples as the most frequent gender of the title.
train_counts = split_counts[2]

# Bucket the training samples by (title, gender) in a single pass instead of
# re-scanning `train` once per title.
train_by_group = {}
for sample in train:
    train_by_group.setdefault((sample["title"], sample["gender"]), []).append(sample)

add_samples = []
# Iterate in sorted order: `titles` is a set of strings whose iteration order
# changes between interpreter sessions (string hash randomization). Since every
# (title, gender) group consumes random numbers from `rng`, an unstable order
# would make the balanced set irreproducible despite the fixed seed.
for title in sorted(titles):
    title_counts = train_counts.get(title)
    if not title_counts:
        # No training samples for this title -> nothing to balance
        continue
    max_val = max(title_counts.values())
    for g, count in title_counts.items():
        all_samples = train_by_group[(title, g)]
        add = max_val - count
        # Add whole copies of the group while more than one full copy is missing ...
        while add > count:
            add_samples.extend(all_samples)
            add -= count
        # ... then fill the remainder with a random subset without replacement
        picked = rng.permutation(np.arange(len(all_samples)))[:add]
        add_samples.extend(all_samples[i] for i in picked)
train_balanced = train + add_samples

In [None]:
# Write every split to <data_folder>/<name>.pkl.
# The name -> data mapping is spelled out explicitly instead of resolving
# variable names through eval().
datasets = {
    "train": train,
    "train_balanced": train_balanced,
    "val": val,
    "test": test,
}
for ds, samples in datasets.items():
    with open(data_folder / f'{ds}.pkl', 'wb') as f:
        pickle.dump(samples, f)

In [None]:
# Write the task labels (titles), one per line.
# Sorted so that the line order -- and with it any label -> index mapping
# derived from this file -- is identical on every run; iterating the set
# directly yields a different order per interpreter session.
with open(data_folder / "labels_task.txt", "w") as f:
    for title in sorted(titles):
        f.write(title + "\n")

In [None]:
# Write the protected-attribute labels (genders), one per line.
# Sorted so that the line order -- and with it any label -> index mapping
# derived from this file -- is identical on every run; iterating the set
# directly yields a different order per interpreter session.
with open(data_folder / "labels_protected_gender.txt", "w") as f:
    for gender in sorted(genders):
        f.write(gender + "\n")