In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Parsing ICL

In [None]:
# Experiment to summarise: plain ICL runs, one attack log per (seed, attack, shot).
dataset = 'mnli'
model_name = 'meta-llama/Llama-2-7b-hf'

# Root of the checkpoint tree; each attack has its own sub-directory below it.
results_path = f'./checkpoints/{dataset}/{model_name}/'

# attacks = ['swap_labels', 'swap_labels_fix_dist']
attacks = ['swap_labels']
seeds = [1, 13, 42]
shots = [2, 4, 8, 16]

# (text that identifies the log line, result column, whether the value carries a '%')
LOG_METRICS = (
    ('Original accuracy:', 'Clean Acc', True),
    ('Accuracy under attack:', 'Attack Acc', True),
    ('Attack success rate:', 'ASR', True),
    ('Average perturbed word %:', 'Perturbed Word Ratio', True),
    ('Average Attack Perplexity:', 'Attack Perplexity', False),
)


def parse_attack_log(lines):
    """Pull the summary metrics out of the lines of one attack log.

    Each line is checked against the markers in ``LOG_METRICS`` (first match
    wins); the text after the first ':' is stripped of whitespace, table
    borders ('|') and, for percentage rows, the '%' sign, then converted to
    float.

    Args:
        lines: iterable of log lines (strings).

    Returns:
        dict mapping result column name -> float, in the order the metrics
        were encountered. Metrics that never appear are simply absent.

    Raises:
        ValueError: if a matched line does not hold a parseable number.
    """
    metrics = {}
    for line in lines:
        for marker, column, is_percent in LOG_METRICS:
            if marker in line:
                value = line.split(':')[1].strip()
                if is_percent:
                    value = value.replace('%', '')
                metrics[column] = float(value.strip('| '))
                break
    return metrics


results = []
for seed in seeds:
    for attack in attacks:
        for shot in shots:
            file = f'{results_path}{attack}/icl_attack-seed-{seed}-shot-{shot}/logs_{attack}.txt'
            if not os.path.isfile(file):
                print(f'File {file} not found')
                continue

            with open(file) as f:
                lines = f.readlines()

            # Identifier columns first, then whatever metrics the log provided.
            result = {'seed': seed, 'shot': shot, 'attack': attack, 'method': 'icl'}
            result.update(parse_attack_log(lines))
            results.append(result)

In [None]:
# One row per parsed ICL run.
icl_df = pd.DataFrame(results)
# drop the method column (a constant string label, not a metric)
icl_df = icl_df.drop(columns=['method'])

# 'seed' only distinguishes repetitions of the same configuration. It is left out of
# the aggregation so the seed ids are not averaged and reported as if they were a metric.
icl_by_config = icl_df.drop(columns=['seed']).groupby(['shot', 'attack'])
icl_df_mean = icl_by_config.mean().round(2)
icl_df_std = icl_by_config.std().round(2)

# combine the mean and std into a single dataframe with plus minus sign
icl_df_mean_std = icl_df_mean.astype(str) + ' ± ' + icl_df_std.astype(str)
icl_df_mean_std

### Parsing kNN-ICL

In [None]:
# Experiment to summarise: kNN-ICL runs, one attack log per (seed, attack, shot).
dataset = 'mr'
model_name = 'meta-llama/Llama-2-7b-hf'

results_path = f'./checkpoints/{dataset}/{model_name}/'

attacks = ['irrelevant_sample']
seeds = [1, 13, 42]
shots = [2, 4, 8, 16]
results = []

# log-line marker -> (result column, whether the value carries a '%' to remove)
metric_markers = {
    'Original accuracy:': ('Clean Acc', True),
    'Accuracy under attack:': ('Attack Acc', True),
    'Attack success rate:': ('ASR', True),
    'Average perturbed word %:': ('Perturbed Word Ratio', True),
    'Average Attack Perplexity:': ('Attack Perplexity', False),
}

for seed in seeds:
    for attack in attacks:
        for shot in shots:
            file = f'{results_path}{attack}/knn_icl-seed-{seed}-shot-{shot}/logs_{attack}.txt'
            if not os.path.isfile(file):
                # Report the gap: a missing run silently shrinks the sample the
                # per-configuration mean/std are later computed over.
                print(f'File {file} not found')
                continue

            with open(file) as f:
                lines = f.readlines()

            result = {'seed': seed, 'shot': shot, 'attack': attack}
            for line in lines:
                for marker, (column, is_percent) in metric_markers.items():
                    if marker not in line:
                        continue
                    # Value is the text after the first ':', minus table borders/padding.
                    value = line.split(':')[1].strip()
                    if is_percent:
                        value = value.replace('%', '')
                    result[column] = float(value.strip('| '))
                    break  # at most one metric per line, first marker wins

            results.append(result)

In [None]:
# One row per parsed kNN-ICL run.
knn_df = pd.DataFrame(results)

# compute the mean and std for Clean Acc, Attack Acc, ASR, Perturbed Word Ratio, Attack Perplexity for knn_df
# 'seed' only distinguishes repetitions of the same configuration. It is left out of
# the aggregation so the seed ids are not averaged and reported as if they were a metric.
knn_by_config = knn_df.drop(columns=['seed']).groupby(['shot', 'attack'])
knn_df_mean = knn_by_config.mean().round(2)
knn_df_std = knn_by_config.std().round(2)

# combine the mean and std into a single dataframe with plus minus sign
knn_df_mean_std = knn_df_mean.astype(str) + ' ± ' + knn_df_std.astype(str)
knn_df_mean_std

### Parsing Retriever

In [None]:
# Retrieval-based ICL runs: one attack log per (retriever, attack, shot), single seed.
dataset = 'mnli'
model_name = 'meta-llama/Llama-2-7b-hf'
results_path = f'./checkpoints/{dataset}/{model_name}/'

retrievers = ['bm25', 'sbert', 'instructor']
attacks = ['textfooler', 'textbugger', 'bert_attack']
shots = [2, 4, 8, 16]
seed = 1

# (text identifying the log line, column to fill, strip '%' from the value?)
summary_fields = (
    ('Original accuracy:', 'Clean Acc', True),
    ('Accuracy under attack:', 'Attack Acc', True),
    ('Attack success rate:', 'ASR', True),
    ('Average perturbed word %:', 'Perturbed Word Ratio', True),
    ('Average Attack Perplexity:', 'Attack Perplexity', False),
)

results = []
for ret in retrievers:
    for attack in attacks:
        for shot in shots:
            file = f'{results_path}{attack}/retrieval_icl-seed-{seed}-shot-{shot}/logs_{attack}_{ret}.txt'
            if not os.path.isfile(file):
                # Missing run: echo the expected path and move on.
                print(file)
                continue

            with open(file) as handle:
                log_lines = handle.readlines()

            row = {'shot': shot, 'attack': attack, 'ret': ret}
            for text in log_lines:
                # First field whose marker occurs in the line, if any.
                hit = next((field for field in summary_fields if field[0] in text), None)
                if hit is None:
                    continue
                marker, column, has_percent = hit
                raw = text.split(':')[1].strip()
                if has_percent:
                    raw = raw.replace('%', '')
                row[column] = float(raw.strip('| '))

            results.append(row)

In [None]:
# Display the current contents of `results` as a table (one row per parsed log).
# The frame is not assigned to a name, so it is only shown, not kept.
pd.DataFrame(results)

In [None]:
# Compute the mean and std for Clean Acc, Attack Acc, ASR, Perturbed Word Ratio, Attack Perplexity for icl_df
# NOTE(review): this cell sits under the "Parsing Retriever" section but aggregates
# icl_df (the ICL table), not the retriever results shown just before — confirm that
# is intended.

# 'seed' only distinguishes repetitions of the same configuration. It is left out of
# the aggregation so the seed ids are not averaged and reported as if they were a metric.
icl_by_config = icl_df.drop(columns=['seed']).groupby(['shot', 'attack'])
icl_df_mean = icl_by_config.mean().round(2)
icl_df_std = icl_by_config.std().round(2)

# combine the mean and std into a single dataframe with plus minus sign
icl_df_mean_std = icl_df_mean.astype(str) + ' ± ' + icl_df_std.astype(str)
icl_df_mean_std

### kNN Analysis

In [None]:
# kNN analysis: one attack log per (seed, shot, k, attack), where only k < shot is considered.
dataset = 'sst2'
model_name = 'meta-llama/Llama-2-7b-hf'

results_path = f'./checkpoints/{dataset}/{model_name}/textfooler/'

# attacks = ['textbugger', 'textfooler']
attacks = ['textfooler']
ks = [1, 3, 7, 15, 31, 63, 127, 255, 511]
results = []

# The logs end with a summary table; the rows listed in `log_fields` are the ones read:
#
# +-------------------------------+--------+
# | Attack Results                |        |
# +-------------------------------+--------+
# | Number of successful attacks: | 534    |
# | Number of failed attacks:     | 178    |
# | Number of skipped attacks:    | 160    |
# | Original accuracy:            | 81.65% |
# | Accuracy under attack:        | 20.41% |
# | Attack success rate:          | 75.0%  |
# | Average perturbed word %:     | 8.36%  |
# | Average num. words per input: | 17.4   |
# | Avg num queries:              | 74.37  |
# | Adv confidence:               | 1.0    |
# | Average Original Perplexity:  | 61.62  |
# | Average Attack Perplexity:    | 86.19  |
# | Average Attack USE Score:     | 0.88   |
# +-------------------------------+--------+
#
# (text identifying the row, result column, whether the value carries a '%')
log_fields = (
    ('Original accuracy:', 'Clean Acc', True),
    ('Accuracy under attack:', 'Attack Acc', True),
    ('Attack success rate:', 'ASR', True),
    ('Average perturbed word %:', 'Perturbed Word Ratio', True),
    ('Average Attack Perplexity:', 'Attack Perplexity', False),
)

for seed in [1]:
    for shot in [16, 32, 64, 128, 256, 512]:
        for k in [x for x in ks if x < shot]:
            for attack in attacks:
                run_dir = f'{results_path}knn_icl-seed-{seed}-shot-{shot}/example-k-{k}/'
                file = f'{run_dir}logs_{attack}_test.txt'
                if not os.path.isfile(file):
                    # Fall back to the alternative log name used by some runs.
                    file = f'{run_dir}logs_{attack}_0.15_test.txt'

                # Only I/O failures mean "no log for this run"; anything else
                # (interrupts, parse errors) must surface instead of being masked.
                try:
                    with open(file, 'r') as f:
                        lines = f.readlines()
                except OSError:
                    print(f'file {file} not found')
                    continue

                result = {'seed': seed, 'shot': shot, 'k': k, 'attack': attack}
                for line in lines:
                    for marker, column, is_percent in log_fields:
                        if marker in line:
                            # Value is the text after the first ':', minus borders/padding.
                            value = line.split(':')[1].strip()
                            if is_percent:
                                value = value.replace('%', '')
                            result[column] = float(value.strip('| '))
                            break  # first matching marker wins

                results.append(result)


In [None]:
# Tabulate the parsed runs (one row per dict in `results`) and preview the first rows.
df = pd.DataFrame(results)
df.head()

In [None]:
# Write the table to 'results.csv' under the current `results_path` (whatever the most
# recently executed parsing cell set it to), without the row index.
df.to_csv(f'{results_path}results.csv', index=False)

In [None]:
# Perturbed Word Ratio against k, one line per shot count, for the TextFooler runs.
textfooler = df[df['attack'] == 'textfooler']
# keep unique (shot, k, metric) rows
textfooler = textfooler[['shot', 'k', 'Perturbed Word Ratio']].drop_duplicates()

# Treat k as an ordered categorical so the x-axis is evenly spaced but numerically
# sorted. The category order is derived from the k values actually present: a
# hard-coded list makes reorder_categories raise as soon as the data contain a k that
# is not listed (e.g. 511 for shot=512) or lack one that is.
k_order = [str(k) for k in sorted(textfooler['k'].unique())]
textfooler = textfooler.assign(
    k=pd.Categorical(textfooler['k'].astype(str), categories=k_order, ordered=True)
)

color_palette = sns.color_palette("rocket")

# grid
sns.set_style("darkgrid")

# line graph with k on the x-axis, Perturbed Word Ratio on the y-axis and shot as the hue
# `ci=None` is kept for older seaborn releases; newer ones spell it `errorbar=None`.
ax = sns.lineplot(data=textfooler, x='k', y='Perturbed Word Ratio', hue='shot',
                  palette=color_palette, ci=None, linewidth=2)
ax.legend(loc='lower right', title='Shot', fontsize=12)
ax.set_title('Perturbed Word Ratio (TextFooler)')

textfooler.head()

In [None]:
# Attack Perplexity against k, one line per shot count, for the TextFooler runs.
textfooler = df[df['attack'] == 'textfooler']
# keep unique (shot, k, metric) rows
textfooler = textfooler[['shot', 'k', 'Attack Perplexity']].drop_duplicates()

# Treat k as an ordered categorical so the x-axis is evenly spaced but numerically
# sorted. The category order is derived from the k values actually present: a
# hard-coded list makes reorder_categories raise as soon as the data contain a k that
# is not listed (e.g. 511 for shot=512) or lack one that is.
k_order = [str(k) for k in sorted(textfooler['k'].unique())]
textfooler = textfooler.assign(
    k=pd.Categorical(textfooler['k'].astype(str), categories=k_order, ordered=True)
)

color_palette = sns.color_palette("rocket")

# grid
sns.set_style("darkgrid")

# line graph with k on the x-axis, Attack Perplexity on the y-axis and shot as the hue
# `ci=None` is kept for older seaborn releases; newer ones spell it `errorbar=None`.
ax = sns.lineplot(data=textfooler, x='k', y='Attack Perplexity', hue='shot',
                  palette=color_palette, ci=None, linewidth=2)
ax.legend(loc='lower right', title='Shot', fontsize=12)
ax.set_title('Attack Perplexity (TextFooler)')

textfooler.head()