In [1]:
import pandas as pd
import numpy as np
import re
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
import csv

In [2]:
# Load the dataset CSV; later cells read its split, toxicity and identity columns
DATASET_CSV_PATH = 'CSVFiles/all_data_with_identities_50000.csv'
df = pd.read_csv(DATASET_CSV_PATH)

In [6]:
# Keep only the split marker, the toxicity label and the identity columns we evaluate on
identity_columns = ["male", "female", "LGBTQ", "christian", "muslim", "other_religions", "black", "white"]
newDF = df.loc[:, ["split", "toxicity"] + identity_columns]

# Remove every row that belongs to the training split
train_row_index = newDF[newDF['split'] == 'train'].index
newDF = newDF.drop(train_row_index)

# Threshold each identity column at 0.5 (a NaN compares False and so maps to the "not in group" value)
for identity in identity_columns:
    newDF[identity] = newDF[identity].apply(lambda x: np.round(x >= 0.5))

In [4]:
# Rows of the test split
test_data = newDF.loc[newDF['split'] == 'test']

# Rows of the validation split
validation_data = newDF.loc[newDF['split'] == 'val']

# Toxicity labels of each split as NumPy arrays (these are label arrays, not data loaders)
test_labels = np.array(test_data['toxicity'].values.tolist())
val_labels = np.array(validation_data['toxicity'].values.tolist())

In [20]:
def getPositivesAndNegativesGroup(group, preds, labels):
    """Count confusion-matrix cells for the members of one group.

    Only positions where ``group`` equals 1 contribute. Labels and
    predictions are compared against the literal values 1 and 0, so a
    position holding any other value is not counted in any cell.

    Args:
        group: Indexable sequence of group-membership flags.
        preds: Indexable sequence of predicted classes.
        labels: Indexable sequence of true classes; its length drives the loop.

    Returns:
        Tuple ``(true_positives, true_negatives, false_positives,
        false_negatives)``.
    """
    tp = tn = fp = fn = 0
    for idx in range(len(labels)):
        truth, guess = labels[idx], preds[idx]
        # Samples outside the group never count
        if group[idx] != 1:
            continue
        if truth == 1:
            if guess == 1:
                tp += 1
            elif guess == 0:
                fn += 1
        elif truth == 0:
            if guess == 0:
                tn += 1
            elif guess == 1:
                fp += 1

    return tp, tn, fp, fn

def CheckAccuracy(predictions, labels):
    """Return the fraction of positions where the prediction equals the label.

    Args:
        predictions: Indexable sequence of predicted values.
        labels: Indexable sequence of true values, indexed in parallel.

    Returns:
        The number of matching positions divided by ``len(predictions)``, as a float.

    Raises:
        ZeroDivisionError: If ``predictions`` is empty.
    """
    matches = sum(1 for idx in range(len(predictions)) if predictions[idx] == labels[idx])
    return float(matches) / len(predictions)
    
def checkAccDemoGroup(group, preds, toxicArray, zero_division=1.0):
    """Per-class accuracy of ``preds`` restricted to the members of one group.

    Args:
        group: Sequence of group-membership flags (1 marks a member).
        preds: Sequence of predicted classes, same length as ``group``.
        toxicArray: Sequence of true classes, same length as ``group``.
        zero_division: Accuracy reported for a class that has no samples in
            the group (previously this raised ``ZeroDivisionError``). The
            default of 1.0 mirrors the ``zero_division=1`` this notebook
            passes to sklearn's ``f1_score``, and is neutral for callers that
            weight the accuracy by the (zero) sample count or look for the
            lowest accuracy.

    Returns:
        Tuple ``(acc_toxic, acc_nonToxic, total_toxic, total_nonToxic)`` where
        the accuracies are rounded to 3 decimals and the totals are the number
        of group members whose true class is 1 / 0 respectively.

    Raises:
        AssertionError: If the three inputs differ in length.
    """
    assert len(group) == len(preds) and len(group) == len(toxicArray)
    true_positives, true_negatives, false_positives, false_negatives = getPositivesAndNegativesGroup(group, preds, toxicArray)
    total_toxic = true_positives + false_negatives
    total_nonToxic = true_negatives + false_positives
    # Guard both ratios: small groups can easily lack one of the two classes
    acc_toxic = true_positives / total_toxic if total_toxic else float(zero_division)
    acc_nonToxic = true_negatives / total_nonToxic if total_nonToxic else float(zero_division)
    return round(acc_toxic, 3), round(acc_nonToxic, 3), total_toxic, total_nonToxic

def checkF1DemoGroup(group, preds, toxicArray, zero_division=1.0):
    """F1 score of ``preds`` restricted to the members of one group.

    Args:
        group: Sequence of group-membership flags (1 marks a member).
        preds: Sequence of predicted classes, same length as ``group``.
        toxicArray: Sequence of true classes, same length as ``group``.
        zero_division: Score reported when F1 is undefined, i.e. the group has
            no true positives, false positives or false negatives at all. The
            default mirrors the ``zero_division=1`` this notebook passes to
            sklearn's ``f1_score``.

    Returns:
        Tuple ``(f1, total_group)``: the F1 score rounded to 3 decimals and
        the number of group members counted in the confusion matrix.

    Raises:
        AssertionError: If the three inputs differ in length.
    """
    assert len(group) == len(preds) and len(group) == len(toxicArray)
    true_positives, true_negatives, false_positives, false_negatives = getPositivesAndNegativesGroup(group, preds, toxicArray)
    total_group = true_positives + false_negatives + true_negatives + false_positives

    if true_positives == 0:
        # Without true positives the precision/recall ratios are 0 or undefined;
        # the original formula raised ZeroDivisionError here.
        if false_positives or false_negatives:
            group_f1 = 0.0
        else:
            group_f1 = float(zero_division)
    else:
        precision = true_positives / (true_positives + false_positives)
        recall = true_positives / (true_positives + false_negatives)
        # Named group_f1 so the sklearn f1_score import is not shadowed locally
        group_f1 = 2 * (precision * recall) / (precision + recall)
    return round(group_f1, 3), total_group

In [21]:
def _readPredColumn(path):
    """Return the first column of the CSV file at ``path`` as a list of ints."""
    # newline='' is what the csv module expects for files handed to csv.reader
    with open(path, 'r', newline='') as read_obj:
        # csv.reader yields [] for a blank line; skip those instead of failing on row[0]
        return [int(row[0]) for row in csv.reader(read_obj) if row]


def loadTestAndValPreds(seed, pathPrefix):
    """Load the saved test and validation predictions of one run.

    The files read are ``<pathPrefix>test_seed-<seed>_epoch-best_pred.csv``
    and ``<pathPrefix>val_seed-<seed>_epoch-best_pred.csv``; the first column
    of every non-blank row is parsed with ``int``.

    Args:
        seed: Value inserted into the file names after ``seed-``.
        pathPrefix: Leading part of both file paths.

    Returns:
        Tuple ``(test_preds, val_preds)`` of NumPy arrays.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a first-column value is not an integer literal.
    """
    suffix = '_seed-' + str(seed) + '_epoch-best_pred.csv'
    test_preds = np.array(_readPredColumn(pathPrefix + 'test' + suffix))
    val_preds = np.array(_readPredColumn(pathPrefix + 'val' + suffix))

    return test_preds, val_preds

In [8]:
def printEvalResults(pathPrefix, seeds=None):
    """Print per-seed and aggregate test-split metrics, overall and per group.

    For every seed the test predictions are loaded with
    ``loadTestAndValPreds`` and compared against the ``toxicity`` column of
    the notebook-level ``test_data`` frame. Every column of ``test_data``
    after the first two is treated as a group-membership indicator.

    The "average accuracy on toxic / non toxic" figures are weighted over the
    groups: each per-group accuracy is multiplied by that group's sample count
    and the sum is divided by the summed counts, so a sample belonging to
    several groups contributes once per group.

    Args:
        pathPrefix: Leading part of the prediction file paths.
        seeds: Iterable of seed values to evaluate. Defaults to 1..10; other
            seed sets can be passed here instead of editing the function.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    seeds = list(range(1, 11)) if seeds is None else list(seeds)
    if not seeds:
        raise ValueError('seeds must contain at least one seed')

    avg_accuracies = []
    avg_toxic_accuracies = []
    avg_nonToxic_accuracies = []
    avg_f1Scores = []
    global_worst_acc = 1
    global_worst_group_seed_acc = ''
    global_worst_f1 = 1
    global_worst_group_seed_f1 = ''
    toxicArray = np.array(test_data['toxicity'].values.tolist())

    for i in seeds:
        print('\n---------------- Seed ' + str(i) + '---------------\n')
        # Only the test predictions are reported here
        test_preds, _ = loadTestAndValPreds(i, pathPrefix)
        worst_acc = 1
        worst_f1 = 1
        worst_group_acc = ''
        worst_group_f1 = ''
        toxic_accuracies = []
        nonToxic_accuracies = []
        overall_toxic = 0
        overall_nonToxic = 0

        # Go through all but the first two columns and check accuracy
        for curCol in test_data.iloc[:,2:]:
            curColArray = np.array(test_data[curCol].values.tolist())
            cur_f1, total_group = checkF1DemoGroup(curColArray, test_preds, toxicArray)
            acc_toxic, acc_nonToxic, total_toxic, total_nonToxic = checkAccDemoGroup(curColArray, test_preds, toxicArray)
            # The separator widths differ per column name to keep the printed table aligned
            if curCol == 'christian':
                print(f'{curCol} \tacc on non_toxic: {acc_nonToxic:.3f} ({total_nonToxic}) \tacc on toxic: {acc_toxic:.3f} ({total_toxic})\tf1_score: {cur_f1:.3f} ({total_group})')
            elif curCol == 'other_religions':
                print(f'{curCol} acc on non_toxic: {acc_nonToxic:.3f} ({total_nonToxic})  \tacc on toxic: {acc_toxic:.3f} ({total_toxic})\tf1_score: {cur_f1:.3f} ({total_group})')
            else:
                print(f'{curCol} \t\tacc on non_toxic: {acc_nonToxic:.3f} ({total_nonToxic})  \tacc on toxic: {acc_toxic:.3f} ({total_toxic})\tf1_score: {cur_f1:.3f} ({total_group})')

            # Compare against the lower of the two class accuracies. The former
            # if/elif chain skipped the non-toxic check whenever the toxic
            # accuracy had just lowered the minimum, so a lower non-toxic
            # accuracy in the same group could be missed.
            group_worst_acc = min(acc_toxic, acc_nonToxic)
            if worst_acc > group_worst_acc:
                worst_acc = group_worst_acc
                worst_group_acc = curCol
            if worst_f1 > cur_f1:
                worst_f1 = cur_f1
                worst_group_f1 = curCol

            toxic_accuracies.append(acc_toxic*total_toxic)
            nonToxic_accuracies.append(acc_nonToxic*total_nonToxic)
            overall_toxic += total_toxic
            overall_nonToxic += total_nonToxic

        # Fold this seed's worst values into the across-seed minimum
        if global_worst_f1 > worst_f1:
            global_worst_f1 = worst_f1
            global_worst_group_seed_f1 = f' for demographic group {worst_group_f1} in seed {i}'
        if global_worst_acc > worst_acc:
            global_worst_acc = worst_acc
            global_worst_group_seed_acc = f' for demographic group {worst_group_acc} in seed {i}'

        avgSeedAcc = CheckAccuracy(test_preds, toxicArray)
        avgSeedF1 = f1_score(toxicArray, test_preds, zero_division=1)
        avgSeedAccToxic = sum(toxic_accuracies)/overall_toxic
        avgSeedAccNonToxic = sum(nonToxic_accuracies)/overall_nonToxic
        print(f'Average accuracy on non toxic for seed is: {avgSeedAccNonToxic:.3f}')
        print(f'Average accuracy on toxic for seed is: {avgSeedAccToxic:.3f}')
        print(f'Average accuracy for seed is: {avgSeedAcc:.3f}')
        print(f'Worst accuracy is {worst_acc:.3f} for demographic group {worst_group_acc}')
        print(f'Average f1 for seed is: {avgSeedF1:.3f}')
        print(f'Worst f1 is {worst_f1:.3f} for demographic group {worst_group_f1}')
        avg_accuracies.append(avgSeedAcc)
        avg_f1Scores.append(avgSeedF1)
        avg_nonToxic_accuracies.append(avgSeedAccNonToxic)
        avg_toxic_accuracies.append(avgSeedAccToxic)

    # Unweighted means of the per-seed figures
    overall_toxic_acc = sum(avg_toxic_accuracies)/len(avg_toxic_accuracies)
    overall_nonToxic_acc = sum(avg_nonToxic_accuracies)/len(avg_nonToxic_accuracies)
    overall_avg_acc = sum(avg_accuracies)/len(avg_accuracies)
    overall_avg_f1 = sum(avg_f1Scores)/len(avg_f1Scores)

    print('\n---------------- Overall evaluation---------------\n')
    print(f'Overall non toxic acc. is: {overall_nonToxic_acc:.3f}')
    print(f'Overall toxic acc. is: {overall_toxic_acc:.3f}')
    print(f'Overall avg. acc. is: {overall_avg_acc:.3f}')
    print(f'Overall worst acc. is: {global_worst_acc:.3f}{global_worst_group_seed_acc}')
    print(f'Overall avg. f1. is: {overall_avg_f1:.3f}')
    print(f'Overall worst f1. is: {global_worst_f1:.3f}{global_worst_group_seed_f1}')

In [9]:
# Print the per-seed and overall test evaluation for the runs under the 50000noDP log directory
printEvalResults('finalOwnWilds/logs/50000noDP/civilcomments_split-')


---------------- Seed 1---------------

male 		acc on non_toxic: 0.930 (86)  	acc on toxic: 0.626 (254)	f1_score: 0.759 (340)
female 		acc on non_toxic: 0.910 (122)  	acc on toxic: 0.600 (280)	f1_score: 0.732 (402)
LGBTQ 		acc on non_toxic: 1.000 (7)  	acc on toxic: 0.698 (162)	f1_score: 0.822 (169)
christian 	acc on non_toxic: 0.962 (78) 	acc on toxic: 0.623 (159)	f1_score: 0.759 (237)
muslim 		acc on non_toxic: 0.892 (37)  	acc on toxic: 0.737 (190)	f1_score: 0.838 (227)
other_religions acc on non_toxic: 0.824 (34)  	acc on toxic: 0.662 (68)	f1_score: 0.756 (102)
black 		acc on non_toxic: 0.600 (5)  	acc on toxic: 0.703 (175)	f1_score: 0.820 (180)
white 		acc on non_toxic: 0.667 (21)  	acc on toxic: 0.655 (267)	f1_score: 0.780 (288)
Average accuracy on non toxic for seed is: 0.900
Average accuracy on toxic for seed is: 0.657
Average accuracy for seed is: 0.786
Worst accuracy is 0.600 for demographic group female
Average f1 for seed is: 0.740
Worst f1 is 0.732 for demographic group f

muslim 		acc on non_toxic: 0.865 (37)  	acc on toxic: 0.758 (190)	f1_score: 0.850 (227)
other_religions acc on non_toxic: 0.794 (34)  	acc on toxic: 0.676 (68)	f1_score: 0.760 (102)
black 		acc on non_toxic: 0.600 (5)  	acc on toxic: 0.657 (175)	f1_score: 0.788 (180)
white 		acc on non_toxic: 0.762 (21)  	acc on toxic: 0.610 (267)	f1_score: 0.749 (288)
Average accuracy on non toxic for seed is: 0.900
Average accuracy on toxic for seed is: 0.654
Average accuracy for seed is: 0.791
Worst accuracy is 0.600 for demographic group black
Average f1 for seed is: 0.748
Worst f1 is 0.747 for demographic group female

---------------- Overall evaluation---------------

Overall non toxic acc. is: 0.888
Overall toxic acc. is: 0.666
Overall avg. acc. is: 0.793
Overall worst acc. is: 0.400 for demographic group black in seed 4
Overall avg. f1. is: 0.751
Overall worst f1. is: 0.712 for demographic group other_religions in seed 8


In [10]:
def getEvalResults(preds, labels, data):
    """Compute overall and worst-group metrics for one set of predictions.

    Every column of ``data`` after the first two is treated as a
    group-membership indicator. For each such column the per-class accuracies
    and the F1 score are obtained from ``checkAccDemoGroup`` and
    ``checkF1DemoGroup``.

    Args:
        preds: Predicted classes, aligned with the rows of ``data``.
        labels: True classes, aligned with the rows of ``data``.
        data: DataFrame whose columns from position 2 onwards are group flags.

    Returns:
        Tuple ``(avgAcc, avgF1, avgAccToxic, avgAccNonToxic, worst_group_acc,
        worst_group_f1, worst_acc, worst_f1)``. ``avgAcc`` and ``avgF1`` are
        computed over all rows; ``avgAccToxic`` / ``avgAccNonToxic`` are the
        per-group accuracies weighted by group sample count; the ``worst_*``
        values are the lowest per-group accuracy / F1 and the column names
        where they occurred.
    """
    worst_acc = 1
    worst_f1 = 1
    worst_group_acc = ''
    worst_group_f1 = ''
    toxic_accuracies = []
    nonToxic_accuracies = []
    overall_toxic = 0
    overall_nonToxic = 0

    # Go through all but the first two columns and check accuracy
    for curCol in data.iloc[:,2:]:
        curColArray = np.array(data[curCol].values.tolist())
        cur_f1, _total_group = checkF1DemoGroup(curColArray, preds, labels)
        acc_toxic, acc_nonToxic, total_toxic, total_nonToxic = checkAccDemoGroup(curColArray, preds, labels)

        # Compare against the lower of the two class accuracies. The former
        # if/elif chain skipped the non-toxic check whenever the toxic accuracy
        # had just lowered the minimum, so a lower non-toxic accuracy in the
        # same group could be missed.
        group_worst_acc = min(acc_toxic, acc_nonToxic)
        if worst_acc > group_worst_acc:
            worst_acc = group_worst_acc
            worst_group_acc = curCol
        if worst_f1 > cur_f1:
            worst_f1 = cur_f1
            worst_group_f1 = curCol

        # Weight each group's accuracy by its sample count for the averages below
        toxic_accuracies.append(acc_toxic*total_toxic)
        nonToxic_accuracies.append(acc_nonToxic*total_nonToxic)
        overall_toxic += total_toxic
        overall_nonToxic += total_nonToxic

    avgAcc = CheckAccuracy(preds, labels)
    avgF1 = f1_score(labels, preds, zero_division=1)
    avgAccToxic = sum(toxic_accuracies)/overall_toxic
    avgAccNonToxic = sum(nonToxic_accuracies)/overall_nonToxic

    return avgAcc, avgF1, avgAccToxic, avgAccNonToxic, worst_group_acc, worst_group_f1, worst_acc, worst_f1

In [11]:
def saveNoDPEvalResults(pathPrefix, seeds=None, outPath='CSVFiles/50000noDPEval.csv'):
    """Write one CSV row of test/validation metrics per seed for the non-DP runs.

    Labels and group columns come from the notebook-level ``test_data`` and
    ``validation_data`` frames; predictions are loaded with
    ``loadTestAndValPreds`` and scored with ``getEvalResults``.

    The hyper-parameter columns (n_epochs, Epsilon, loss, optimizer, learning
    rate, batch size) are constants written verbatim; they are not read from
    the runs.

    Args:
        pathPrefix: Leading part of the prediction file paths.
        seeds: Iterable of seed values to evaluate. Defaults to 1..10.
        outPath: Destination CSV file; it is overwritten.
    """
    if seeds is None:
        seeds = range(1, 11)
    val_labels = np.array(validation_data['toxicity'].values.tolist())
    test_labels = np.array(test_data['toxicity'].values.tolist())

    with open(outPath,'w',newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Seed','n_epochs','Epsilon','Loss function','Optimizer','Learning rate','Batch_size',
                         'F1-score test','F1-score test worst','F1-score val','F1-score val worst',
                         'Acc toxic test','Acc non-toxic test','Overall acc test','Worst group acc test',
                         'Acc toxic val','Acc non-toxic val','Overall acc val','Worst group acc val'])

        for i in seeds:
            test_preds, val_preds = loadTestAndValPreds(i, pathPrefix)
            avgSeedAcc_val, avgSeedF1_val, avgSeedAccToxic_val, avgSeedAccNonToxic_val, worst_group_acc_val, worst_group_f1_val, worst_acc_val, worst_f1_val = getEvalResults(val_preds, val_labels, validation_data)
            avgSeedAcc_test, avgSeedF1_test, avgSeedAccToxic_test, avgSeedAccNonToxic_test, worst_group_acc_test, worst_group_f1_test, worst_acc_test, worst_f1_test = getEvalResults(test_preds, test_labels, test_data)

            # Worst-group cells are formatted as "<value> (<group name>)"
            writer.writerow([i, 20, 0, 'Cross-entropy', 'AdamW', 1.00E-05,16,
                             avgSeedF1_test,f'{worst_f1_test:.3f} ({worst_group_f1_test})',avgSeedF1_val,f'{worst_f1_val:.3f} ({worst_group_f1_val})',
                             avgSeedAccToxic_test,avgSeedAccNonToxic_test,avgSeedAcc_test,f'{worst_acc_test:.3f} ({worst_group_acc_test})',
                             avgSeedAccToxic_val,avgSeedAccNonToxic_val,avgSeedAcc_val,f'{worst_acc_val:.3f} ({worst_group_acc_val})'])
            

In [19]:
def saveDPEvalResults(pathPrefix, seeds=None, outPath='CSVFiles/50000DPEvalEvery10.csv'):
    """Write one CSV row of test/validation metrics per seed for the DP runs.

    Labels and group columns come from the notebook-level ``test_data`` and
    ``validation_data`` frames; predictions are loaded with
    ``loadTestAndValPreds`` and scored with ``getEvalResults``.

    The hyper-parameter columns are constants written verbatim; they are not
    read from the runs.

    Args:
        pathPrefix: Leading part of the prediction file paths.
        seeds: Iterable of seed values to evaluate. Defaults to
            ``[1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90]``; other sets (e.g.
            100..1000 in steps of 100) can be passed here instead of editing
            the function.
        outPath: Destination CSV file; it is overwritten.
    """
    if seeds is None:
        seeds = [1,5,10,20,30,40,50,60,70,80,90]
    val_labels = np.array(validation_data['toxicity'].values.tolist())
    test_labels = np.array(test_data['toxicity'].values.tolist())

    with open(outPath,'w',newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Seed','n_epochs','Loss function','Optimizer','Learning rate','Batch Size',
                         'Virtual batch size','Target eps','Target delta','Sample rate','Dp max grad norm',
                         'F1-score test','F1-score test worst','F1-score val','F1-score val worst',
                         'Acc toxic test','Acc non-toxic test','Overall acc test','Worst group acc test',
                         'Acc toxic val','Acc non-toxic val','Overall acc val','Worst group acc val'])

        for i in seeds:
            test_preds, val_preds = loadTestAndValPreds(i, pathPrefix)
            avgSeedAcc_val, avgSeedF1_val, avgSeedAccToxic_val, avgSeedAccNonToxic_val, worst_group_acc_val, worst_group_f1_val, worst_acc_val, worst_f1_val = getEvalResults(val_preds, val_labels, validation_data)
            avgSeedAcc_test, avgSeedF1_test, avgSeedAccToxic_test, avgSeedAccNonToxic_test, worst_group_acc_test, worst_group_f1_test, worst_acc_test, worst_f1_test = getEvalResults(test_preds, test_labels, test_data)

            # NOTE(review): the seed value is written into both the 'Seed' and
            # the 'Target eps' columns — confirm the runs really used eps == seed.
            writer.writerow([i, 20, 'Cross-entropy', 'AdamW', 1.00E-05, 8,
                             16, i, 1e-05, 0.01, 1.2,
                             avgSeedF1_test,f'{worst_f1_test:.3f} ({worst_group_f1_test})',avgSeedF1_val,f'{worst_f1_val:.3f} ({worst_group_f1_val})',
                             avgSeedAccToxic_test,avgSeedAccNonToxic_test,avgSeedAcc_test,f'{worst_acc_test:.3f} ({worst_group_acc_test})',
                             avgSeedAccToxic_val,avgSeedAccNonToxic_val,avgSeedAcc_val,f'{worst_acc_val:.3f} ({worst_group_acc_val})'])

In [21]:
# Write the evaluation summary CSV for the runs under the 50000noDP log directory
saveNoDPEvalResults('finalOwnWilds/logs/50000noDP/civilcomments_split-')

In [18]:
# Write the evaluation summary CSV for the runs under the 50000Every100 log directory
# NOTE(review): this call makes saveDPEvalResults write 'CSVFiles/50000DPEvalEvery10.csv',
# while the log prefix says 'Every100' — confirm the output file name is the intended one
saveDPEvalResults('finalOwnWilds/logs/50000Every100/civilcomments_split-')