In [37]:
import pandas as pd
import numpy as np
import os

In [60]:
models = [
    'Llama-2-7b-hf_attack_results.csv',
    'Llama-2-13b-hf_attack_results.csv',
    'Mistral-7B-v0.1_attack_results.csv',
    'Mixtral-8x7B-v0.1_attack_results.csv',
    'gemma-7b_attack_results.csv',
    'vicuna-7b-v1.5_attack_results.csv'
]
seeds = [1,13,42]
retrievers = ['bm25', 'sbert', 'instructor']
attacks = ['swap_labels']

In [61]:
def get_metrics(df):
    clean_acc = df['correct'].value_counts()[True] / df['correct'].value_counts().sum()
    attack_acc = df['attack_correct'].value_counts()[True] / df['attack_correct'].value_counts().sum()
    asr = (clean_acc - attack_acc) / clean_acc
    return {
        'clean_acc': clean_acc,
        'attack_acc': attack_acc,
        'asr': asr
    }

In [62]:
results = []

for model in models:
    for seed in seeds:
        for attack in attacks:
            model_name = 'icl' if attack in ['textfooler', 'textbugger', 'bert_attack'] else 'icl_attack'
            path = f'./meta-llama/Llama-2-7b-hf/{attack}/{model_name}-seed-{seed}-shot-8/{model}'
            if not os.path.exists(path):
                continue
            df = pd.read_csv(f'./meta-llama/Llama-2-7b-hf/{attack}/{model_name}-seed-{seed}-shot-8/{model}')
            metrics = get_metrics(df)
            results.append({
                'model': model,
                'seed': seed,
                'attack': attack,
                **metrics
            })


In [63]:
results_df = pd.DataFrame(results)
# take the mean 
# results_df = results_df.groupby('model').mean().reset_index()
results_df
# save as json
# results_df.to_json('rte-meta-llama-13b-icl-swap-labels-fix-dist.json', orient='records')

Unnamed: 0,model,seed,attack,clean_acc,attack_acc,asr
0,Llama-2-7b-hf_attack_results.csv,1,swap_labels,0.714801,0.245487,0.656566


In [32]:
results = []
for model in models:
    for r in retrievers:
        df = pd.read_csv(f'./meta-llama/Llama-2-13b-hf/swap_labels/retrieval_icl-seed-1-shot-8_{r}_fix_dist/{model}')
        metrics = get_metrics(df)
        results.append({
            'model': model,
            'seed': seed,
            **metrics
        })


In [33]:
results_df = pd.DataFrame(results)
# take the mean 
results_df = results_df.groupby('model').mean().reset_index()
results_df.to_json('rte-meta-llama-13b-ricl-swap-labels-fix-dist.json', orient='records')
results_df

Unnamed: 0,model,seed,clean_acc,attack_acc,asr
0,Llama-2-13b-hf_attack_results.csv,42.0,0.747292,0.465704,0.379789
1,Llama-2-7b-hf_attack_results.csv,42.0,0.717208,0.599278,0.164324
2,Mistral-7B-v0.1_attack_results.csv,42.0,0.776173,0.620939,0.200093
3,Mixtral-8x7B-v0.1_attack_results.csv,42.0,0.77497,0.647413,0.164685
4,gemma-7b_attack_results.csv,42.0,0.76414,0.555957,0.27399
5,vicuna-7b-v1.5_attack_results.csv,42.0,0.772563,0.726835,0.059155
