In [1]:
import os 
import json
from collections import defaultdict
import numpy as np

In [2]:
model_names = [
    'backpack-gpt2',
    'pythia-70m',
    'pythia-160m',
    'pythia-410m',
    'pythia-1b',
    'pythia-1.4b',
    'pythia-2.8b',
    'pythia-6.9b'
]
dnames = [
    'company', 
    'country', 
    'verbs', 
    'temporal', 
    'stereoset', 
    'gender'    
]
leagues = [1e-3, 1e-4, 1e-5]
subject_types = ['true', 'prefix']

In [3]:
fnames = []
for root, dirs, files in os.walk("sbatches_100723/test_scripts"):
    for fname in files:
        if 'noedit' not in fname:
            fnames.append(fname)
len(fnames)

258

In [4]:
exps = []
for fname in fnames:
    vals = fname[:-7].split('_')
    exps.append(vals[0] + '__' + vals[1] +'_' + vals[2] + '__' + vals[3] )
len(exps)

258

In [5]:
# load test results
results_dir = "log_memit_100723_test_results"
test_results = defaultdict(list)
for root, dirs, files in os.walk(results_dir):
    for fname in files:
        if 'noedit' in fname:
            continue
        vals = fname[:-5].split('__')
        exp_id = '__'.join(vals[:-1])
    
        with open(os.path.join(results_dir, fname), 'r') as fh:
            data = json.load(fh)
            test_results[exp_id].append(data)

for k in sorted(test_results.keys()):
    if len(test_results[k]) != 5:
        print("Warning: did not find 5 runs for", k, len(test_results[k]))


In [6]:
for k in sorted(exps):
    assert k in test_results.keys()
for k in test_results.keys():
    assert k in exps

In [7]:
len(test_results.keys())

258

In [8]:
from make_sweep import model_name_to_short

In [9]:
def get_test_results(model_name, league, dname, subject_type, verbose=False):
    exp_id = f'{model_name_to_short(model_name)}__{dname}-{subject_type}_subject__{league}'

    # get rid of runs that are out-of-league
    options = []
    for exp_run in test_results[exp_id]:
        league_cutoff = exp_run['noedit']['general_score'] * (1+league)
        if exp_run['edit']['general_score'] < league_cutoff:
            options.append(exp_run)
    if verbose:
        print(f"{exp_id} has {len(test_results[exp_id])} entries, of which {len(options)} are in-league")

    general_scores = [exp_run['edit']['general_score'] for exp_run in options]
    intervention_scores = [exp_run['edit']['intervention_score'] for exp_run in options]
    hard_negative_scores = [exp_run['edit']['hard_negative_score'] for exp_run in options]

    baseline_intervention = [exp_run['noedit']['intervention_score'] for exp_run in options]
    baseline_hard_negative = [exp_run['noedit']['hard_negative_score'] for exp_run in options]

    success_rate_change = np.array(baseline_intervention) - np.array(intervention_scores)
    hard_negative_score_change = np.array(hard_negative_scores) - np.array(baseline_hard_negative)
    return {
        'intervention_score': {
            'mean': np.mean(intervention_scores),
            'stdv': np.std(intervention_scores),
        },
        'success_rate_change': {
            'mean': np.mean(success_rate_change),
            'stdv': np.std(success_rate_change),
            'full_baseline_intervention': baseline_intervention,
            'full_intervention_scores': intervention_scores,
        },
        'hard_negative_score': {
            'mean': np.mean(hard_negative_scores),
            'stdv': np.std(hard_negative_scores),
        },
        'hard_negative_score_change': {
            'mean': np.mean(hard_negative_score_change),
            'stdv': np.std(hard_negative_score_change),
        },
        'n': len(general_scores),
    }


model_name = 'pythia-1.4b' # 'pythia-410m'
league = 1e-3
dname = 'gender' # 'country'
subject_type = 'prefix'

get_test_results(model_name, league, dname, subject_type)

{'intervention_score': {'mean': 0.8188888888888888,
  'stdv': 0.013310165056345523},
 'success_rate_change': {'mean': 0.05611111111111111,
  'stdv': 0.013310165056345523,
  'full_baseline_intervention': [0.875, 0.875, 0.875, 0.875, 0.875],
  'full_intervention_scores': [0.8305555555555556,
   0.8305555555555556,
   0.7944444444444444,
   0.8166666666666667,
   0.8222222222222222]},
 'hard_negative_score': {'mean': 1.923723404255319,
  'stdv': 0.003681935079547429},
 'hard_negative_score_change': {'mean': 0.01153922872340427,
  'stdv': 0.003681935079547429},
 'n': 5}

In [10]:
results = {}
for subject_type in subject_types:
    results[subject_type] = {}
    for model_name in model_names:
        results[subject_type][model_name] = {}
        for dname in dnames:
            results[subject_type][model_name][dname] = {}
            for league in leagues:
                results[subject_type][model_name][dname][league] = get_test_results(model_name, league, dname, subject_type)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [11]:
results['oracle'] = results['true']
del results['true']

In [12]:
model_name = 'pythia-6.9b'
dname = 'gender'
league = 1e-4
results['oracle'][model_name][dname][league]

{'intervention_score': {'mean': 0.5194444444444445,
  'stdv': 0.002777777777777768},
 'success_rate_change': {'mean': 0.3833333333333333,
  'stdv': 0.002777777777777768,
  'full_baseline_intervention': [0.9027777777777778, 0.9027777777777778],
  'full_intervention_scores': [0.5222222222222223, 0.5166666666666667]},
 'hard_negative_score': {'mean': 1.7455535239361701,
  'stdv': 0.0005402260638297518},
 'hard_negative_score_change': {'mean': 0.02055352393617016,
  'stdv': 0.0005402260638297518},
 'n': 2}

In [13]:
with open("memit_results.test.final.json", "w") as fh:
    json.dump(results, fh)

In [14]:
results.keys()

dict_keys(['prefix', 'oracle'])

In [15]:
results['prefix']['pythia-1.4b']['gender'][0.001]

{'intervention_score': {'mean': 0.8188888888888888,
  'stdv': 0.013310165056345523},
 'success_rate_change': {'mean': 0.05611111111111111,
  'stdv': 0.013310165056345523,
  'full_baseline_intervention': [0.875, 0.875, 0.875, 0.875, 0.875],
  'full_intervention_scores': [0.8305555555555556,
   0.8305555555555556,
   0.7944444444444444,
   0.8166666666666667,
   0.8222222222222222]},
 'hard_negative_score': {'mean': 1.923723404255319,
  'stdv': 0.003681935079547429},
 'hard_negative_score_change': {'mean': 0.01153922872340427,
  'stdv': 0.003681935079547429},
 'n': 5}

### Pre 10-07

In [None]:
raise # don't run below

In [57]:
import os 
import json
from collections import defaultdict
import numpy as np

In [64]:
# baseline_scores = {}
# for root, dirs, files in os.walk("log_memit_100223"):
#     for fname in files:
#         if 'noedit' in fname:
#             # print(fname)
#             model_name = fname.split('__')[0].split('noedit.')[1]
#             dataset_name = fname.split('__')[1].split('-')[0]
#             data = json.load(open(os.path.join(root, fname), 'r'))
            
#             if model_name not in baseline_scores:
#                 baseline_scores[model_name] = {}

#             baseline_scores[model_name][dataset_name] = data
# print(baseline_scores.keys())
# print(baseline_scores['pythia-6.9b']['gender'])
# with open("memit_results.noedit.val.v2.json", "w") as fh:
#     json.dump(baseline_scores, fh)
    
baseline_scores = {}
for root, dirs, files in os.walk("log_memit_100223_test_results"):
    for fname in files:
        if 'noedit' in fname:
            # print(fname)
            model_name = fname.split('__')[0].split('noedit.')[1]
            dataset_name = fname.split('__')[1].split('-')[0]
            data = json.load(open(os.path.join(root, fname), 'r'))
            
            if model_name not in baseline_scores:
                baseline_scores[model_name] = {}

            baseline_scores[model_name][dataset_name] = data
print(baseline_scores.keys())

with open("memit_results.noedit.test.v4.json", "w") as fh:
    json.dump(baseline_scores, fh)


dict_keys(['backpack-gpt2', 'pythia-160m', 'pythia-1.4b', 'pythia-6.9b', 'pythia-410m', 'pythia-1b', 'pythia-70m', 'pythia-2.8b'])


In [67]:
mname = 'pythia-70m' #  'pythia-6.9b' # 
dname = 'country' # 'company' # 'gender'

In [68]:
baseline_scores[mname][dname]

{'intervention_score': 0.9965694682675815,
 'general_score': 5.6242944101325145,
 'rest_of_prompt_score': 6.127375163172184,
 'hard_negative_score': 20.467924446895204}

In [69]:
with open("memit_results.noedit.test.v3.withWrap.json", "r") as fh:
    results_read = json.load(fh)
results_read[mname][dname]

{'intervention_score': 0.9965694682675815,
 'general_score': 5.6242944101325145,
 'rest_of_prompt_score': 6.127375163172184,
 'hard_negative_score': 20.467924446895204}

In [63]:
with open("memit_results.noedit.test.v2.json", "r") as fh:
    results_read = json.load(fh)
results_read[mname][dname]

{'intervention_score': 0.08611111111111111,
 'general_score': 5.328501239480092,
 'rest_of_prompt_score': 7.299414927261227,
 'hard_negative_score': 4.0950797872340425}

In [3]:
# load test results
test_results = defaultdict(list)
for root, dirs, files in os.walk("log_memit_100223_test_results"):
    for fname in files:
        if 'noedit' in fname:
            continue
        vals = fname[:-5].split('__')
        exp_id = '__'.join(vals[:-1])
    
        with open(os.path.join("log_memit_100223_test_results", fname), 'r') as fh:
            data = json.load(fh)
            test_results[exp_id].append(data)

for k in test_results:
    print(k, len(test_results[k]))


pythia-70m__country-prefix_subject__0.0001 5
pythia-2.8b__verbs-true_subject__1e-05 5
pythia-2.8b__country-true_subject__0.0001 5
pythia-160m__verbs-true_subject__1e-05 5
pythia-160m__temporal-prefix_subject__1e-05 5
pythia-160m__country-true_subject__0.0001 5
pythia-70m__temporal-prefix_subject__1e-05 5
pythia-160m__country-prefix_subject__0.0001 5
pythia-2.8b__gender-prefix_subject__1e-05 5
pythia-410m__company-prefix_subject__1e-05 5
pythia-1b__gender-prefix_subject__0.0001 5
pythia-70m__temporal-prefix_subject__0.0001 5
pythia-1b__verbs-prefix_subject__1e-05 5
pythia-2.8b__company-true_subject__0.0001 5
pythia-2.8b__verbs-prefix_subject__0.0001 5
pythia-160m__company-true_subject__0.0001 5
pythia-6.9b__stereoset-prefix_subject__0.0001 5
pythia-160m__verbs-prefix_subject__0.0001 5
pythia-6.9b__company-prefix_subject__0.0001 5
pythia-410m__gender-prefix_subject__1e-05 5
pythia-160m__gender-true_subject__0.0001 5
pythia-160m__gender-prefix_subject__1e-05 5
pythia-1.4b__temporal-true_s

In [4]:
def get_test_results(model_name, league, dname, subject_type, return_dicts=False):
    exp_id = f'{model_name}__{dname}-{subject_type}_subject__{league}'
    league_cutoff = baseline_scores[model_name][dname]['general_score'] * (1+league)

    # print("exp_id", exp_id)
    # print("league_cutoff", league_cutoff)

    general_scores = np.array([x['general_score'] for x in test_results[exp_id]])
    intervention_scores = np.array([x['intervention_score'] for x in test_results[exp_id]])
    hard_negative_scores = np.array([x['hard_negative_score'] for x in test_results[exp_id]])

    # print(general_scores.shape, intervention_scores.shape)

    # get rid of invalid entries
    indexer = general_scores < league_cutoff
    general_scores = general_scores[indexer]
    intervention_scores = intervention_scores[indexer]
    hard_negative_scores = hard_negative_scores[indexer]
    assert ((np.array(general_scores) < league_cutoff).all())

    baseline_intervention = baseline_scores[model_name][dname]['intervention_score'] * (1+league)
    baseline_hard_negative = baseline_scores[model_name][dname]['hard_negative_score']
    # print("baseline_intervention", baseline_intervention)
    
    # print(np.var(intervention_scores))
    
    # print(np.var(hard_negative_scores))


    # print('success delta:', baseline_intervention - np.mean(intervention_scores) )
    # print('hard negative score:', np.mean(hard_negative_scores))

    if return_dicts:
        return {
            'intervention_score': {
                'mean': np.mean(intervention_scores),
                'stdv': np.std(intervention_scores),
            },
            'success_rate_change': {
                'mean': baseline_intervention - np.mean(intervention_scores),
                'stdv': np.std(intervention_scores),
            },
            'hard_negative_score': {
                'mean': np.mean(hard_negative_scores),
                'stdv': np.std(hard_negative_scores),
            },
            'n': len(general_scores),
        }
    return baseline_intervention - np.mean(intervention_scores), np.mean(hard_negative_scores)

# model_name = 'pythia-1.4b' # 'pythia-410m'
# league = 1e-3
# dname = 'gender' # 'country'
# subject_type = 'prefix'

# get_test_results(model_name, league, dname, subject_type)

## make a table

In [6]:
print_table = False 
results = {}
hard_negs = {}
for subject_type in subject_types:
    results[subject_type] = {}
    if print_table:
        print('='*40)
        print(subject_type, 'subject')
    for model_name in model_names:
        results[subject_type][model_name] = {}
        if print_table:
            print('\n')
            print("model_name", model_name)
        for dname in dnames:
            results[subject_type][model_name][dname] = {}
            for league in leagues:
                results[subject_type][model_name][dname][league] = get_test_results(model_name, league, dname, subject_type, return_dicts=True)
                # print(f"{model_name}__{dname}-{s_type}__{league}")

            # if print_table:
            # print(dname, *success_deltas[model_name], sep='\t')
            # print(dname, *hard_negs[model_name], sep='\t')
            

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [7]:
results['true']['pythia-70m']

{'company': {0.001: {'intervention_score': {'mean': 1.0, 'stdv': 0.0},
   'success_rate_change': {'mean': 0.0009999999999998899, 'stdv': 0.0},
   'hard_negative_score': {'mean': 24.573002810891904,
    'stdv': 0.0017726167701983344},
   'n': 5},
  0.0001: {'intervention_score': {'mean': nan, 'stdv': nan},
   'success_rate_change': {'mean': nan, 'stdv': nan},
   'hard_negative_score': {'mean': nan, 'stdv': nan},
   'n': 0},
  1e-05: {'intervention_score': {'mean': nan, 'stdv': nan},
   'success_rate_change': {'mean': nan, 'stdv': nan},
   'hard_negative_score': {'mean': nan, 'stdv': nan},
   'n': 0}},
 'country': {0.001: {'intervention_score': {'mean': 0.9965694682675815,
    'stdv': 0.0},
   'success_rate_change': {'mean': 0.00271355060034284, 'stdv': 0.0},
   'hard_negative_score': {'mean': 14.523434034298205,
    'stdv': 0.004414291341329034},
   'n': 5},
  0.0001: {'intervention_score': {'mean': 0.9965694682675815, 'stdv': 0.0},
   'success_rate_change': {'mean': 0.00181509433962256

In [8]:
baseline_scores[model_name][dname]

{'intervention_score': 0.8888888888888888,
 'general_score': 2.41024465142832,
 'rest_of_prompt_score': 4.991421568627451,
 'hard_negative_score': 1.6733304936835107}

In [27]:
results['prefix']['backpack-gpt2']['company'][0.0001]

{'intervention_score': {'mean': 0.9703529411764705,
  'stdv': 0.0011527010554274173},
 'success_rate_change': {'mean': 0.0015089411764707128,
  'stdv': 0.0011527010554274173},
 'hard_negative_score': {'mean': 18.16626426836127,
  'stdv': 0.0008513854040222031},
 'n': 5}

In [18]:
with open("memit_test_results.json", "w") as fh:
    json.dump(results, fh)

In [19]:
with open("memit_test_results.noedit.json", "w") as fh:
    json.dump(baseline_scores, fh)

In [20]:
with open("memit_test_results.json", "r") as fh:
    results_read = json.load(fh)

In [25]:
with open("memit_test_results.noedit.json", "r") as fh:
    results_no_edit = json.load(fh)

In [26]:
results_no_edit[model_name][dataset_name]

{'intervention_score': 0.4638888888888889,
 'general_score': 2.41024465142832,
 'rest_of_prompt_score': 5.561131437059859,
 'hard_negative_score': 47.11877170138889}

In [28]:
!pwd

/juice4/scr4/sachen/backpack_project/backpack-guarantees/memit/notebooks


In [29]:
!ls *.json

memit_test_results.json  memit_test_results.noedit.json
