# Load MEMIT test results

In [1]:
# specify input params
script_dir = "sbatches_101023/test_scripts"
results_dir = "log_memit_101023_test_results"
# specify output path
results_dump_json = "memit_results.test.final.float16.json"

In [2]:
import os 
import json
from collections import defaultdict
import numpy as np

In [3]:
model_names = [
    # 'backpack-gpt2',
    'pythia-70m',
    'pythia-160m',
    'pythia-410m',
    'pythia-1b',
    'pythia-1.4b',
    'pythia-2.8b',
    'pythia-6.9b'
]
dnames = [
    'company', 
    'country', 
    'verbs', 
    'temporal', 
    'stereoset', 
    'gender'    
]
leagues = [1e-3, 1e-4, 1e-5]
subject_types = ['true', 'prefix']

In [4]:
fnames = []
for root, dirs, files in os.walk(script_dir):
    for fname in files:
        if 'noedit' not in fname:
            fnames.append(fname)
len(fnames)

230

In [5]:
exps = []
for fname in fnames:
    vals = fname[:-7].split('_')
    exps.append(vals[0] + '__' + vals[1] +'_' + vals[2] + '__' + vals[3] )
len(exps)

230

In [6]:
# load test results
test_results = defaultdict(list)
for root, dirs, files in os.walk(results_dir):
    for fname in files:
        if 'noedit' in fname:
            continue
        vals = fname[:-5].split('__')
        exp_id = '__'.join(vals[:-1])
    
        with open(os.path.join(results_dir, fname), 'r') as fh:
            data = json.load(fh)
            test_results[exp_id].append(data)

for k in sorted(test_results.keys()):
    if len(test_results[k]) != 5:
        print("Warning: did not find 5 runs for", k, len(test_results[k]))


In [7]:
for k in sorted(exps):
    assert k in test_results.keys()
for k in test_results.keys():
    assert k in exps

In [8]:
len(test_results.keys())

230

In [9]:
from make_sweep import model_name_to_short
def get_test_results(model_name, league, dname, subject_type, verbose=False):
    exp_id = f'{model_name_to_short(model_name)}__{dname}-{subject_type}_subject__{league}'

    # get rid of runs that are out-of-league
    options = []
    for exp_run in test_results[exp_id]:
        league_cutoff = exp_run['noedit']['general_score'] * (1+league)
        if exp_run['edit']['general_score'] < league_cutoff:
            options.append(exp_run)
    if verbose:
        print(f"{exp_id} has {len(test_results[exp_id])} entries, of which {len(options)} are in-league")

    general_scores = [exp_run['edit']['general_score'] for exp_run in options]
    intervention_scores = [exp_run['edit']['intervention_score'] for exp_run in options]
    hard_negative_scores = [exp_run['edit']['hard_negative_score'] for exp_run in options]

    baseline_intervention = [exp_run['noedit']['intervention_score'] for exp_run in options]
    baseline_hard_negative = [exp_run['noedit']['hard_negative_score'] for exp_run in options]

    success_rate_change = np.array(baseline_intervention) - np.array(intervention_scores)
    hard_negative_score_change = np.array(hard_negative_scores) - np.array(baseline_hard_negative)
    return {
        'intervention_score': {
            'mean': np.mean(intervention_scores),
            'stdv': np.std(intervention_scores),
        },
        'success_rate_change': {
            'mean': np.mean(success_rate_change),
            'stdv': np.std(success_rate_change),
            # 'full_baseline_intervention': baseline_intervention,
            # 'full_intervention_scores': intervention_scores,
        },
        'hard_negative_score': {
            'mean': np.mean(hard_negative_scores),
            'stdv': np.std(hard_negative_scores),
        },
        'hard_negative_score_change': {
            'mean': np.mean(hard_negative_score_change),
            'stdv': np.std(hard_negative_score_change),
        },
        'n': len(general_scores),
        'out_of': len(test_results[exp_id])
    }

In [10]:
results = {}
for subject_type in subject_types:
    results[subject_type] = {}
    for model_name in model_names:
        results[subject_type][model_name] = {}
        for dname in dnames:
            results[subject_type][model_name][dname] = {}
            for league in leagues:
                results[subject_type][model_name][dname][league] = get_test_results(model_name, league, dname, subject_type)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


In [11]:
results['oracle'] = results['true']
del results['true']

In [12]:
# peek at results for a specific set up
model_name = 'pythia-6.9b'
dname = 'gender'
league = 1e-5
results['oracle'][model_name][dname][league]

{'intervention_score': {'mean': 0.5666666666666667,
  'stdv': 0.0072435582280029175},
 'success_rate_change': {'mean': 0.3222222222222222,
  'stdv': 0.0072435582280029175},
 'hard_negative_score': {'mean': 1.6789426113696808,
  'stdv': 0.00294959467269748},
 'hard_negative_score_change': {'mean': 0.005612117686170137,
  'stdv': 0.00294959467269748},
 'n': 5,
 'out_of': 5}

In [13]:
with open(results_dump_json, "w") as fh:
    json.dump(results, fh)