In [None]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Load the line_profiler extension (provides the %lprun magic).
%load_ext line_profiler

In [None]:
import numpy as np, random
import pandas as pd 
from travis_attack.config import Config
from travis_attack.insights import get_training_dfs
from travis_attack.utils import display_all

# Seed every random number generator this notebook draws from so that
# Restart & Run All reproduces the same bootstrap results:
#   - NumPy's global RNG (used by pandas' DataFrame.sample)
#   - the stdlib `random` module (used by random.choice in `bootstrap`)
seed = 100
np.random.seed(seed)
random.seed(seed)

In [None]:
## Run lists
# Build one list of run names per (dataset, training temperature, eval decode method) combination.
cfg = Config()
df_runs = pd.read_csv(f"{cfg.path_results}run_results.csv")
# Keep the rows from position 255 onwards. The cutoff was chosen by eyeballing the
# results file (usually the rows kept are part of a single sweep).
df_runs1 = df_runs.iloc[255:]
df_config = df_runs1.loc[
    :, ['run_name', 'seed', 'dataset_name', 'gen_params_train.temperature', 'decode_method_eval']
].drop_duplicates()
# Collapse to one row per configuration, collecting its run names into a list.
df_config = (
    df_config
    .groupby(['dataset_name', 'gen_params_train.temperature', 'decode_method_eval'])['run_name']
    .apply(list)
    .to_frame('run_names')
    .reset_index()
)
run_lists = df_config['run_names'].tolist()

In [None]:
def load_and_process_run_results(cfg, run_name, split="test"):
    """Load one run's results and reshape them to one row per original example.

    Parameters
    ----------
    cfg : object exposing ``path_checkpoints`` (prefix of the checkpoint directory).
    run_name : name of the run; also used as a prefix for the returned index labels.
    split : key of the frame to take from ``get_training_dfs``. For any split other
        than ``"test"`` only the rows of the lowest and highest epoch are kept.

    Returns
    -------
    DataFrame indexed by ``"{run_name}_{idx}"`` with one column per condition,
    named ``"<value column>_<condition>"``. Each cell is 1 if any row of that
    example/epoch had a positive ``is_adv_example`` sum, else 0.

    Notes
    -----
    Epoch 0 is labelled "baseline"; every other epoch is labelled "trained".
    """
    checkpoint_dir = f"{cfg.path_checkpoints}{run_name}/"
    results = get_training_dfs(checkpoint_dir, postprocessed=False)[split]
    if split != "test":
        # Only compare the first and the last recorded epoch.
        epochs = results['epoch']
        first_epoch, last_epoch = min(epochs), max(epochs)
        results = results[(epochs == first_epoch) | (epochs == last_epoch)]

    # One row per (original example, epoch): did it yield at least one adversarial example?
    adv_counts = results.groupby(['idx', 'epoch'])['is_adv_example'].sum()
    any_adv = adv_counts.map(lambda count: (count > 0) * 1).reset_index()

    # Label the conditions, then pivot them into columns.
    any_adv['condition'] = ["baseline" if epoch == 0 else "trained" for epoch in any_adv['epoch']]
    any_adv = any_adv.drop(columns='epoch')
    wide = any_adv.pivot(index='idx', columns=['condition'])

    # Flatten the (value, condition) column levels and make row labels unique across runs.
    wide.columns = ["_".join(levels) for levels in wide.columns.to_flat_index()]
    wide.index = [f"{run_name}_{example_idx}" for example_idx in wide.index]
    return wide

# Pick one configuration group and the data split to analyse.
split = "train"
run_list = run_lists[3]

# Load the per-example results of every run in the group and stack them into one frame.
df_l = []
for run_name in run_list:
    df_l.append(
        load_and_process_run_results(cfg, run_name, split=split)
    )
df_examples = pd.concat(df_l)
df_examples

Unnamed: 0,reward_pp_baseline,reward_pp_trained
polar-sweep-11_0,1,1
polar-sweep-11_150,0,0
polar-sweep-11_300,0,0
polar-sweep-11_450,1,1
polar-sweep-11_500,1,1
polar-sweep-11_900,1,1
polar-sweep-11_950,1,1
polar-sweep-11_1100,0,1
polar-sweep-11_1350,1,1
polar-sweep-11_1400,0,0


In [None]:
# Bootstrap
# Number of bootstrap resamples to draw.
b = 10_000
def bootstrap_from_pd(df, b, A_col, B_col): 
    """Paired bootstrap test on two columns of a DataFrame.

    Each row of ``df`` is one paired observation. The observed statistic is
    ``sum(A_col) - sum(B_col)``. The frame is resampled with replacement ``b``
    times (each resample has as many rows as ``df``), and the function returns
    the fraction of resamples whose difference exceeds twice the observed one.

    Parameters
    ----------
    df : pandas.DataFrame containing ``A_col`` and ``B_col``.
    b : number of bootstrap resamples; must be at least 1.
    A_col, B_col : names of the two columns to compare.

    Returns
    -------
    The empirical p-value ``count / b``.

    Raises
    ------
    ValueError
        If ``b`` is smaller than 1.

    Notes
    -----
    Prints the observed difference. Resampling uses ``DataFrame.sample`` without
    an explicit ``random_state``, so results follow NumPy's global RNG state.
    """
    if b < 1:
        raise ValueError("b must be a positive number of bootstrap resamples")
    # Size each resample from the frame that was passed in, not from a notebook global.
    n_rows = df.shape[0]
    diff_overall = df[A_col].sum() - df[B_col].sum()
    diff_l = []
    for _ in range(b):
        df_sample = df.sample(n=n_rows, replace=True)
        diff_l.append(df_sample[A_col].sum() - df_sample[B_col].sum())
    print(diff_overall)
    # Count the resamples whose difference is more than twice the observed difference.
    condition_met = sum((diff_sample > (2 * diff_overall)) * 1 for diff_sample in diff_l)
    p_val = condition_met / b
    return p_val

# Bootstrap p-value comparing the trained condition against the baseline condition.
bootstrap_from_pd(
    df_examples,
    b=b,
    A_col="is_adv_example_trained",
    B_col="is_adv_example_baseline",
)

2
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

0.0

In [None]:
# NOTE(review): in the cells shown here `condition_met` is only assigned inside
# bootstrap_from_pd, so it is not a notebook-level name. This cell appears to rely on
# leftover kernel state and would raise NameError after Restart & Run All — confirm.
condition_met

0

In [None]:
def bootstrap(set_X, b):
    """Return a one-sided empirical p-value from a paired bootstrap.

    Parameters
    ----------
    set_X : sequence of pairs ``(A, B)``, one pair per example, where the first
        element is algorithm A's result and the second is algorithm B's.
    b : number of bootstrap samples to draw.

    Returns
    -------
    ``s / b``, where ``s`` is the number of bootstrap samples whose A-minus-B
    difference is greater than twice the difference observed on ``set_X``.

    Notes
    -----
    Draws come from the stdlib ``random`` module's global generator.
    """
    # Observed delta: how much better algorithm A does than B on the full set.
    columns = list(zip(*set_X))
    observed_delta = np.sum(columns[0]) - np.sum(columns[1])

    n_pairs = len(set_X)
    resampled_deltas = []
    for _ in range(b):
        # Draw a bootstrap sample of size n by picking pairs at random, with replacement.
        draws = [random.choice(set_X) for _ in range(n_pairs)]
        total_a = sum(pair[0] for pair in draws)
        total_b = sum(pair[1] for pair in draws)
        # Delta of this sample: how much better A does than B on it.
        resampled_deltas.append(total_a - total_b)

    # Count the samples on which algorithm A accidentally did better than B.
    exceed_count = sum(1 for delta in resampled_deltas if delta > (2 * observed_delta))

    # One-sided empirical p-value.
    return exceed_count / b

In [None]:
# Toy input for `bootstrap`: four (A, B) pairs in which A and B always agree.
bstrap_input = [(1, 1)] * 2 + [(0, 0)] * 2
print(bootstrap(bstrap_input, b=100))

0.0


In [None]:
# Show the bootstrap input pairs currently bound in the kernel.
# NOTE(review): the saved output of this cell lists 10 pairs, but the visible
# assignment of `bstrap_input` has 4 — the output looks stale; confirm by re-running.
bstrap_input

[(1, 0),
 (1, 0),
 (0, 0),
 (1, 0),
 (1, 0),
 (1, 1),
 (1, 1),
 (0, 1),
 (1, 1),
 (1, 0)]