In [5]:
import ast

import copy

import os

from typing import List, Tuple

import numpy as np
import pandas as pd 

from scipy.stats import kruskal

from sklearn.metrics import f1_score, confusion_matrix

from tqdm import tqdm

In [7]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so it presumably also hides the
# undefined-metric / divide-by-zero warnings the bootstrap cells can trigger
# on empty classes — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Root directory that holds one sub-folder of result CSVs per model.
data_dir = "../output/used_results/"

# Number of bootstrap resamples drawn for every score distribution.
resamples = 1000

# The bootstrap cells draw from NumPy's global RNG (np.random.choice), so it is
# seeded once here to make "Restart & Run All" reproduce the same statistics.
random_seed = 0
np.random.seed(random_seed)

In [9]:
def open_and_clean_file(file: str) -> Tuple[np.ndarray, np.ndarray, List[list], List[str], dict]:
    """Load one results CSV and turn it into integer-coded predictions and labels.

    The base name of ``file`` must look like ``<model>_<dataset>_<demonstration>.csv``;
    the ``<dataset>`` part selects the label vocabulary and the demographic groups
    that are kept.

    Args:
        file: Path to a CSV with ``response``, ``label`` and ``demographic`` columns.
            A ``prompt`` column is also required unless the path contains
            ``"flan"``, ``"ul2"`` or ``"gpt"``; for all other paths the response is
            expected to start with the prompt, which is stripped off.

    Returns:
        A 5-tuple ``(dummy_preds, dummy_labels, demographics_filtered,
        dataset_overall_demographics, labels_dict)``:

        * ``dummy_preds``: array of predicted label codes, ``-1`` when no known
          label string occurs in the response.
        * ``dummy_labels``: array of gold label codes.
        * ``demographics_filtered``: parsed ``demographic`` entries of the rows kept.
        * ``dataset_overall_demographics``: the demographic groups of the dataset.
        * ``labels_dict``: mapping from label string to integer code.

        Only rows whose demographic entry mentions at least one of the dataset's
        groups are kept in the first three elements.

    Raises:
        ValueError: If the base name does not split into exactly three ``_`` parts.
        KeyError: If the dataset name or a gold label is unknown.
    """

    overall_demographics =  {"aae": ['aa', 'wh'], "hatexplain-race": ["African", "Arab", "Asian", "Hispanic", "Caucasian"], "bias": ["m", "f"]}
    labels_map = {"aae" : {"happy": 1, "sad": 0, "fail________" : -1}, "hatexplain-race": {"yes" : 1, "no" : 0, "fail________" : -1}, "bias" : {"attorney": 0, "photographer": 1, "dentist" : 2, "psychologist" : 3, "physician": 4, "journalist" : 5, "teacher": 6, "professor": 7, "fail________" : -1}}

    results = pd.read_csv(file)

    # Missing responses are normalised to "" *before* any string handling, so an
    # empty model output becomes a failed prediction instead of a TypeError.
    responses = results['response'].fillna("").astype(str).tolist()

    if "flan" not in file and "ul2" not in file and "gpt" not in file:
        # These models echo the prompt at the start of the response; drop it so
        # label words that appear in the prompt are not mistaken for the answer.
        prompts = results['prompt'].fillna("").astype(str).tolist()
        responses = [response[len(prompt):] for response, prompt in zip(responses, prompts)]

    labels = results['label'].tolist()

    # The demographic column stores Python literals (e.g. "['aa']") as text.
    demographics = [ast.literal_eval(x) for x in results['demographic'].tolist()]

    # Only the dataset part of the file name is needed here.
    _, dataset, _ = os.path.basename(file).split("_")

    dataset_overall_demographics = overall_demographics[dataset]

    responses = [x.lower() for x in responses]

    if dataset == "bias":
        # "lawyer" is accepted as a synonym of the "attorney" label.
        responses = [x.replace("lawyer", "attorney") for x in responses]

    # map labels to numbers to make it easier for sklearn calculations
    labels_dict = labels_map[dataset]

    dummy_labels = np.array([labels_dict[x] for x in labels])

    # A prediction is the first label (in labels_dict order) that occurs as a
    # substring of the response, or -1 when none does.
    # NOTE(review): plain substring matching means e.g. "no" also matches "not".
    dummy_preds = np.array([
        next((code for label, code in labels_dict.items() if label in pred), -1)
        for pred in responses
    ])

    # Keep only rows annotated with at least one of the dataset's groups.
    wanted_groups = set(dataset_overall_demographics)
    demographic_index = [
        i
        for i, item in enumerate(demographics)
        if wanted_groups.intersection(item)
    ]

    dummy_preds = dummy_preds[demographic_index]
    dummy_labels = dummy_labels[demographic_index]

    demographics_filtered = copy.deepcopy([demographics[i] for i in demographic_index])

    return dummy_preds, dummy_labels, demographics_filtered, dataset_overall_demographics, labels_dict
    

# Kruskal-Wallis H Test (KWH) Over F1 Demographics

In [10]:
# For every results file: bootstrap the macro-F1 of each demographic group and
# run a Kruskal-Wallis H test across the groups' bootstrap distributions.

# Only real sub-directories are visited (stray files in data_dir would make
# os.listdir fail); sorting keeps the traversal order deterministic.
folders = sorted(
    path
    for path in (os.path.join(data_dir, x) for x in os.listdir(data_dir))
    if os.path.isdir(path)
)

for folder in folders:
    # Match the extension as a suffix: the code below strips the last 4 chars.
    files = sorted(os.path.join(folder, x) for x in os.listdir(folder) if x.endswith(".csv"))

    for file in files:

        dummy_preds, dummy_labels, demographics_filtered, dataset_overall_demographics, labels_dict = open_and_clean_file(file)

        model_name, dataset, demonstration = os.path.basename(file).split("_")
        demonstration = demonstration[:-4]  # drop ".csv"

        shots = 10 if demonstration != "zeroshot" else 0

        label_values = list(labels_dict.values())

        demographic_samples = []

        for dem in dataset_overall_demographics:
            # rows annotated with this demographic group
            index = [i for i, item in enumerate(demographics_filtered) if dem in item]

            preds_to_bootstrap = dummy_preds[index]
            labels_to_bootstrap = dummy_labels[index]

            n_rows = len(preds_to_bootstrap)

            scores = []

            for _ in range(resamples):
                # resample the group's rows with replacement
                sample_index = np.random.choice(n_rows, size=n_rows, replace=True)

                # sklearn's signature is (y_true, y_pred)
                score = f1_score(
                    labels_to_bootstrap[sample_index],
                    preds_to_bootstrap[sample_index],
                    average="macro",
                    labels=label_values,
                )

                scores.append(score*100)
            demographic_samples.append(scores)

        print(model_name, dataset, shots, demonstration, kruskal(*demographic_samples))

offline-ul2 hatexplain-race 10 diversity KruskalResult(statistic=4095.353936641809, pvalue=0.0)
offline-ul2 aae 10 similarity KruskalResult(statistic=1499.250377061469, pvalue=0.0)
offline-ul2 bias 10 random KruskalResult(statistic=6.7752116581705195, pvalue=0.009243252894193957)
offline-ul2 hatexplain-race 0 zeroshot KruskalResult(statistic=1554.3151540817005, pvalue=0.0)
offline-ul2 bias 10 similarity KruskalResult(statistic=1448.125594008996, pvalue=0.0)
offline-ul2 hatexplain-race 10 stratified KruskalResult(statistic=3896.086086337278, pvalue=0.0)
offline-ul2 bias 0 zeroshot KruskalResult(statistic=1499.2503748125928, pvalue=0.0)
offline-ul2 hatexplain-race 10 within KruskalResult(statistic=4079.9133408474127, pvalue=0.0)
offline-ul2 hatexplain-race 10 random KruskalResult(statistic=4053.628325780129, pvalue=0.0)
offline-ul2 aae 0 zeroshot KruskalResult(statistic=1491.5421671304339, pvalue=0.0)
offline-ul2 aae 10 random KruskalResult(statistic=1491.1295403747422, pvalue=0.0)
offli

KeyboardInterrupt: 

# Kruskal-Wallis H Test (KWH) Over Demonstrations

In [None]:
from collections import defaultdict

# For every (model, dataset) pair, collect one bootstrap macro-F1 distribution
# per demonstration strategy; the distributions are compared in the next cell.

# Only real sub-directories are visited; sorting keeps the order deterministic.
folders = sorted(
    path
    for path in (os.path.join(data_dir, x) for x in os.listdir(data_dir))
    if os.path.isdir(path)
)

# (model_name, dataset) -> list of bootstrap score lists, one per strategy
model_dataset_pairs = defaultdict(list)

for folder in folders:
    # Match the extension as a suffix: the code below strips the last 4 chars.
    files = sorted(os.path.join(folder, x) for x in os.listdir(folder) if x.endswith(".csv"))

    for file in tqdm(files):

        model_name, dataset, demonstration = os.path.basename(file).split("_")

        demonstration = demonstration[:-4]  # drop ".csv"

        if demonstration in ["diversity", "random", "similarity", "stratified", "within", "zeroshot"]:

            dummy_preds, dummy_labels, demographics_filtered, dataset_overall_demographics, labels_dict = open_and_clean_file(file)

            label_values = list(labels_dict.values())
            n_rows = len(dummy_preds)

            scores = []

            for _ in range(resamples):
                # Resample this file's own rows. The previous version indexed
                # preds_to_bootstrap / labels_to_bootstrap, which are never set
                # in this cell and only existed as leftovers of another cell.
                sample_index = np.random.choice(n_rows, size=n_rows, replace=True)

                # sklearn's signature is (y_true, y_pred)
                score = f1_score(
                    dummy_labels[sample_index],
                    dummy_preds[sample_index],
                    average="macro",
                    labels=label_values,
                )

                scores.append(score*100)
            model_dataset_pairs[(model_name, dataset)].append(scores)

In [None]:
# Kruskal-Wallis H test across the demonstration strategies of each
# (model, dataset) pair collected above.
for key in model_dataset_pairs:

    model, dataset = key

    try:
        result = kruskal(*model_dataset_pairs[key])
    except ValueError:
        # kruskal rejects some inputs (e.g. fewer than two groups); report the
        # pair as failed, like the recall cell does, instead of aborting the loop.
        result = "failed"

    print(model, dataset, result)

# Kruskal-Wallis H Test (KWH) Over Demographic Recall

In [None]:
import csv

# For every results file: bootstrap a recall score per demographic group, run a
# Kruskal-Wallis H test across the groups, and write one row per file to
# recall3.csv.

# Only real sub-directories are visited; sorting keeps the order deterministic.
folders = sorted(
    path
    for path in (os.path.join(data_dir, x) for x in os.listdir(data_dir))
    if os.path.isdir(path)
)

# newline="" is what the csv module expects of files handed to csv.writer.
with open("recall3.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    for folder in folders:
        # Match the extension as a suffix: the code below strips the last 4 chars.
        files = sorted(os.path.join(folder, x) for x in os.listdir(folder) if x.endswith(".csv"))

        for file in files:
            dummy_preds, dummy_labels, demographics_filtered, dataset_overall_demographics, labels_dict = open_and_clean_file(file)

            model_name, dataset, demonstration = os.path.basename(file).split("_")
            demonstration = demonstration[:-4]  # drop ".csv"

            shots = 10 if demonstration != "zeroshot" else 0

            label_values = list(labels_dict.values())

            demographic_samples = []

            for dem in dataset_overall_demographics:
                # rows annotated with this demographic group
                index = [i for i, item in enumerate(demographics_filtered) if dem in item]

                preds_to_bootstrap = dummy_preds[index]
                labels_to_bootstrap = dummy_labels[index]

                n_rows = len(preds_to_bootstrap)

                scores = []

                for _ in range(resamples):
                    sample_index = np.random.choice(n_rows, size=n_rows, replace=True)

                    # sklearn's signature is (y_true, y_pred): rows are gold
                    # labels, columns are predictions. With the arguments the
                    # other way round the formula below yields precision.
                    cnf_matrix = confusion_matrix(
                        labels_to_bootstrap[sample_index],
                        preds_to_bootstrap[sample_index],
                        labels=label_values,
                    )

                    tp = np.diag(cnf_matrix).astype(float)
                    fn = cnf_matrix.sum(axis=1).astype(float) - tp
                    support = tp + fn

                    # Per-class recall; a class absent from the resample has an
                    # undefined recall, which is scored as 0.
                    recall = np.divide(tp, support, out=np.zeros_like(tp), where=support > 0)

                    if dataset == "aae" or dataset == "hatexplain-race":
                        # recall of the first class in labels_dict
                        recall = recall[0]
                    else:
                        # worst recall over all classes except the last one in
                        # labels_dict (the "fail________" code in the label map
                        # built by open_and_clean_file)
                        recall = np.min(recall[:-1])

                    scores.append(recall)

                demographic_samples.append(scores)
            try:
                csvwriter.writerow([model_name, dataset, shots, demonstration, kruskal(*demographic_samples)])
            except ValueError:
                # kruskal rejected the samples; record the failure and move on
                csvwriter.writerow([model_name, dataset, shots, demonstration, "failed"])