# Dynamic testing: AAA

### Prepare the AAA answer files (gold standard plus predictions) used to produce results.tsv

In [1]:
import os
import pandas as pd

# Folder that holds the datathon predictions and the generated answer files.
DATA_PATH = os.path.join(os.getcwd(), 'Adversifier', 'datathon_results')
print(DATA_PATH)

# Number of teams; their folders are named team_1 ... team_n.
n = 2
teams = ['team_{}'.format(idx) for idx in range(1, n + 1)]

# File stems (without the .tsv extension) that are read for every team.
test_names = [
    'quoting_a_to_n',
    'flip_n_to_a',
    'corr_a_to_a',
    'corr_n_to_n',
]
other = [
    'f1_o',
    'hashtag_check',
]

print(teams)

/Users/prl222/OneDrive-University/multidisciplinary-data-challenge/preparation/Adversifier/datathon_results
['team_1', 'team_2']


In [2]:
# Copy submissions
os.mkdir(os.path.join(os.getcwd(), 'Adversifier', 'datathon_results', 'predictions'))

# Submissions with answers are exported to 'answers' folder
os.mkdir(os.path.join(os.getcwd(), 'Adversifier', 'datathon_results', 'answers'))

In [3]:
for team in teams:
    if not os.path.exists(os.path.join(DATA_PATH, 'answers', team)):
        os.mkdir(os.path.join(DATA_PATH, 'answers', team))
    for test in test_names+other:
        res = pd.read_csv(os.path.join(DATA_PATH, 'predictions', team, test+'.tsv'), 
                        sep='\t', 
                        names=['text', 'pred'])
        f = pd.read_csv(os.path.join(os.getcwd(), 'Adversifier', 'mhs', 'aaa_files', test+'.tsv'), 
                        sep='\t', 
                        names=['text', 'gtruth'])
        f['pred'] = res['pred']
        f.to_csv(os.path.join(DATA_PATH, 'answers', team, test+'.tsv'),  
                 sep ='\t', index=False, header=False)

### evaluate answers

In [5]:
# Prerequisite (per the original note): open Docker, so that eval.py can be
# executed on each folder.
# Move into the Adversifier folder; the shell commands below address the
# script relative to it (../eval_variables.sh). A later cell switches back
# with os.chdir('../').
# NOTE: the path is relative, so re-running this cell in the same kernel
# session (without the matching chdir back) looks for Adversifier/Adversifier.
os.chdir('Adversifier')
!pwd
# NOTE(review): 777 grants read/write/execute to every user; the script only
# needs to be executable here -- consider a narrower mode.
!chmod 777 ../eval_variables.sh
!../eval_variables.sh

/Users/prl222/OneDrive-University/multidisciplinary-data-challenge/preparation/Adversifier


In [8]:
# The script runs the following command to generate results.tsv inside each team folder:
#!docker run --mount type=bind,source=$ANSWER_FILE_DIR,target=/aaa/output/answer_files aaa python3 eval.py --dataset_name $DATASET_NAME

### display table results

In [9]:
# For each team, get the AAA
results = []
for team in teams:
    res = pd.read_csv(os.path.join(DATA_PATH, 'answers', team, 'results.tsv'),  sep ='\t')
    res['team'] = team
    results.append(res)

results = pd.concat(results, axis=0, ignore_index=True)
# Sort by AAA column
results = results.sort_values(by=['aaa'], ascending=False)
results

Unnamed: 0,f1_o,quoting_a_to_n,corr_n_to_n,flip_n_to_a,corr_a_to_a,aaa,hashtag_check,team
1,49.39,49.06,50.34,49.89,49.65,49.73,50.05,team_2
0,47.21,100.0,100.0,0.0,0.0,0.0,48.0,team_1


# Static testing: HATECHECK

### Add all predictions to the HateCheck table

In [11]:
# export hatecheck.csv to answers
# Step back out of the Adversifier folder (entered earlier with
# os.chdir('Adversifier')) so the relative H_PATH resolves from its parent.
os.chdir('../')
H_PATH = 'hatecheck-experiments/Data/Test Suite'
hatecheck_df = pd.read_csv(os.path.join(H_PATH,'hatecheck_final_ACL.csv'), index_col=0)

# load model results
for team in teams:
    res = pd.read_csv(os.path.join(DATA_PATH, 'predictions', team, 'static.tsv'),
                      sep='\t',
                      names=['text', 'pred'])
    # Map the 1/0 predictions to the label strings compared against
    # label_gold. The result is assigned back explicitly: calling
    # replace(..., inplace=True) on res['pred'] is a chained in-place
    # operation that triggers a FutureWarning on recent pandas and does not
    # update `res` at all when Copy-on-Write is enabled.
    res['pred'] = res['pred'].replace(to_replace={1: 'hateful', 0: 'non-hateful'})
    # One prediction column per team, aligned to hatecheck_df by index.
    hatecheck_df[f'pred_{team}'] = res['pred']

hatecheck_df.to_csv(os.path.join(DATA_PATH, 'answers','hatecheck.csv'))
  

In [12]:
# Optional: reload the exported table from disk instead of reusing the
# in-memory frame.
#hatecheck_df = pd.read_csv(os.path.join(DATA_PATH, 'answers','hatecheck.csv'))
# Preview the first rows, including the per-team pred_<team> columns.
hatecheck_df.head()

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ,pred_team_1,pred_team_2
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,1,I hate [IDENTITY_P].,non-hateful,hateful
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,non-hateful
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,1,I hate [IDENTITY_P].,non-hateful,hateful
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful


### evaluate

In [13]:
# compute accuracy
func_accuracy_dict = {}
for m in teams:
    func_accuracy_dict[m] = []
    for func in pd.unique(hatecheck_df.functionality):
        n_cases = hatecheck_df[hatecheck_df.functionality==func].shape[0]
        n_correct = hatecheck_df[(hatecheck_df.functionality==func)&(hatecheck_df['label_gold']==hatecheck_df['pred_{}'.format(m)])].shape[0]
        func_accuracy_dict[m].append('{:.1%}'.format(n_correct/n_cases))
    
    # convert list to series
    func_accuracy_dict[m] = pd.Series(func_accuracy_dict[m])
    func_accuracy_dict[m].name = m

# create df from dict
func_accuracy_df = pd.Series(pd.unique(hatecheck_df.functionality))
func_accuracy_df.name = 'functionality'

for arc_data in func_accuracy_dict:
    func_accuracy_df = pd.concat([func_accuracy_df, pd.Series(func_accuracy_dict[arc_data])], axis =1)
func_accuracy_df  

Unnamed: 0,functionality,team_1,team_2
0,derog_neg_emote_h,50.7%,49.3%
1,derog_neg_attrib_h,46.4%,48.6%
2,derog_dehum_h,53.6%,52.1%
3,derog_impl_h,52.1%,43.6%
4,threat_dir_h,52.6%,51.9%
5,threat_norm_h,48.6%,54.3%
6,slur_h,50.7%,50.0%
7,slur_homonym_nh,56.7%,60.0%
8,slur_reclaimed_nh,55.6%,51.9%
9,profanity_h,56.4%,50.7%
