In [157]:
import glob
import json
import os
import shutil
from collections import defaultdict
from os import path

import pandas as pd

In [187]:
# Where the performance files live, and the id prefix used to select them.
perf_dir = "/share/data/speech/shtoshni/research/litbank_coref/models/perf/"
slurm_id = "6101258"


def _run_index(perf_file):
    """Return the integer that follows the first '_' in the file's stem."""
    stem, _ext = path.splitext(path.basename(perf_file))
    return int(stem.split('_')[1])


# Numeric sort so that e.g. index 10 comes after index 9 rather than after 1.
matching_files = glob.glob(path.join(perf_dir, slurm_id + "*"))
files = sorted(matching_files, key=_run_index)
print(len(files))

100


In [159]:
# Parse every performance file into a dict, one entry per file in `files`.
# The context manager closes each handle promptly instead of leaving it to the
# garbage collector.
model_dict_list = []
for perf_file in files:
    with open(perf_file) as perf_f:
        model_dict_list.append(json.load(perf_f))

In [190]:
def _hashable_key(val):
    """Return `val` itself if hashable, else a hashable stand-in for comparison."""
    try:
        hash(val)
    except TypeError:
        # Lists/dicts cannot be put in a set; compare them through a canonical
        # JSON rendering, tagged with the type name so the stand-in cannot be
        # confused with a plain string value.
        return (type(val).__name__, json.dumps(val, sort_keys=True, default=repr))
    return val


def determine_varying_attributes(model_dict_list, ignore_attribs=('train', 'test', 'dev',
                                                                  'slurm_id', 'best_model_dir', 'data_dir')):
    """Return the attribute names whose value differs across the given dicts.

    Args:
        model_dict_list: Iterable of dicts mapping attribute name to value.
        ignore_attribs: Attribute names to leave out of the comparison.

    Returns:
        List of attribute names that take more than one distinct value, in the
        order in which the attributes are first encountered. Unhashable values
        (lists, dicts) are supported and compared by content.
    """
    attrib_to_vals = defaultdict(set)
    for model_dict in model_dict_list:
        for attrib, val in model_dict.items():
            if attrib in ignore_attribs:
                continue
            attrib_to_vals[attrib].add(_hashable_key(val))

    return [attrib for attrib, vals in attrib_to_vals.items() if len(vals) > 1]

In [191]:
# Keys looked up inside each run's 'test' results when building the table below.
perf_attribs = ['MUC', 'Bcub', 'CEAFE']

# Attributes that take more than one value across the loaded runs.
varying_attribs = determine_varying_attributes(model_dict_list)
print(varying_attribs)

['model_dir', 'mem_type', 'num_cells', 'sample_invalid', 'label_smoothing_wt']


In [192]:
# Build one record per run and construct the frame once at the end.
# `DataFrame.append` in a loop copies the whole frame on every iteration and
# was removed in pandas 2.0.
perf_columns = varying_attribs + ['devf', 'testf'] + perf_attribs

perf_records = []
for model_dict in model_dict_list:
    perf_dict = {attrib: model_dict[attrib] for attrib in varying_attribs}

    for perf_attrib in perf_attribs:
        if perf_attrib in model_dict['test']:
            attrib_dict = model_dict['test'][perf_attrib]
            # Stored as a (recall, precision, fscore) tuple in a single cell.
            perf_dict[perf_attrib] = (attrib_dict['recall'], attrib_dict['precision'], attrib_dict['fscore'])
        else:
            # Placeholder for a metric that is absent from this run's test results.
            perf_dict[perf_attrib] = '-'

    perf_dict['devf'] = model_dict['dev']['fscore']
    perf_dict['testf'] = model_dict['test']['fscore']
    perf_records.append(perf_dict)

perf_df = pd.DataFrame(perf_records, columns=perf_columns)

In [193]:
# Shorten two column names for display.
perf_df = perf_df.rename(columns={"label_smoothing_wt": "ls_wt", "sample_invalid": "samp"})

# For every (mem_type, num_cells) group keep the row with the highest dev F-score.
# idxmax() returns index *labels*, so select with .loc rather than positional .iloc;
# the two only coincide while perf_df has a default 0..n-1 index.
idx = perf_df.groupby(['mem_type', 'num_cells'])['devf'].idxmax()
dev_max = perf_df.loc[idx]
dev_max

Unnamed: 0,model_dir,mem_type,num_cells,samp,ls_wt,devf,testf,MUC,Bcub,CEAFE
84,/share/data/speech/shtoshni/research/litbank_c...,learned,5,0.75,0.0,74.0,73.3,"(76.4, 86.2, 81.0)","(66.4, 78.4, 71.9)","(62.0, 72.7, 66.9)"
85,/share/data/speech/shtoshni/research/litbank_c...,learned,10,0.75,0.0,77.1,76.8,"(81.7, 85.9, 83.8)","(72.8, 77.9, 75.3)","(67.0, 76.4, 71.4)"
66,/share/data/speech/shtoshni/research/litbank_c...,learned,20,0.5,0.1,78.1,78.2,"(83.2, 86.2, 84.7)","(74.8, 78.9, 76.8)","(70.0, 76.7, 73.2)"
67,/share/data/speech/shtoshni/research/litbank_c...,learned,30,0.5,0.1,78.2,78.5,"(83.8, 85.6, 84.7)","(76.1, 78.2, 77.1)","(70.4, 77.1, 73.6)"
44,/share/data/speech/shtoshni/research/litbank_c...,lru,5,0.75,0.0,69.8,69.6,"(72.0, 85.7, 78.3)","(60.1, 78.9, 68.2)","(57.0, 68.9, 62.4)"
45,/share/data/speech/shtoshni/research/litbank_c...,lru,10,0.75,0.0,75.9,75.5,"(80.1, 85.7, 82.8)","(70.5, 78.3, 74.2)","(66.0, 73.4, 69.5)"
26,/share/data/speech/shtoshni/research/litbank_c...,lru,20,0.5,0.1,78.2,77.8,"(82.8, 85.9, 84.3)","(74.8, 78.3, 76.5)","(68.0, 77.4, 72.4)"
15,/share/data/speech/shtoshni/research/litbank_c...,lru,30,0.25,0.1,78.6,78.5,"(84.0, 85.2, 84.6)","(76.2, 78.2, 77.2)","(72.1, 75.6, 73.8)"
0,/share/data/speech/shtoshni/research/litbank_c...,unbounded,20,0.25,0.1,78.4,78.1,"(84.6, 84.1, 84.3)","(77.2, 76.2, 76.7)","(72.5, 74.3, 73.4)"


### Get varying memory type and memory size configurations

In [197]:
# Group only to obtain the (mem_type, num_cells) MultiIndex; the group sizes
# themselves are not used.
z = dev_max.groupby(['mem_type', 'num_cells']).size()
multindex = z.index

# Parallel lists: position i of each list describes the i-th configuration.
mem_types, num_cells = (list(multindex.get_level_values(level)) for level in (0, 1))

print(mem_types, num_cells)

['learned', 'learned', 'learned', 'learned', 'lru', 'lru', 'lru', 'lru', 'unbounded'] [5, 10, 20, 30, 5, 10, 20, 30, 20]


### Get the location of each selected model's CoNLL and JSONL output files

In [207]:
# Which split's prediction files to collect.
SPLIT = 'dev'
# SPLIT = 'test'

# One (model_config, conll_file, json_file) triple per (mem_type, num_cells) pair.
model_config_to_conll_files = []
for mem_type, num_cell in zip(mem_types, num_cells):
    # Row(s) of dev_max for this configuration; the first match supplies model_dir.
    is_config = (dev_max['mem_type'] == mem_type) & (dev_max['num_cells'] == num_cell)
    model_dir = dev_max.loc[is_config, 'model_dir'].values[0]

    model_config = (mem_type, num_cell)
    conll_file = path.join(model_dir, f'{SPLIT}.conll')
    json_file = path.join(model_dir, f'{SPLIT}.log.jsonl')
    model_config_to_conll_files.append((model_config, conll_file, json_file))

### Copy each configuration's CoNLL and JSONL files to the output directory

In [208]:
# Destination directory for the collected prediction files.
output_dir = "../models/ontonotes_preds/"
# exist_ok avoids the check-then-create race of a separate path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# (model_config, conll_output_file) pairs for the files written below.
model_config_output_file_list = []
for model_config, conll_file, jsonl_file in model_config_to_conll_files:
    # Output names follow <mem_type>_<num_cells>_<SPLIT>.<ext>.
    output_stem = f'{model_config[0]}_{model_config[1]}_{SPLIT}'
    conll_output_file = path.join(output_dir, output_stem + '.conll')
    jsonl_output_file = path.join(output_dir, output_stem + '.jsonl')

    model_config_output_file_list.append((model_config, conll_output_file))

    # Byte-for-byte copies: no dependence on the default text encoding, and
    # copyfile raises instead of truncating when source and destination coincide.
    shutil.copyfile(conll_file, conll_output_file)
    shutil.copyfile(jsonl_file, jsonl_output_file)

In [194]:
# Render the dev/test F-scores as LaTeX table rows: CSV commas become column
# separators and every line is closed with a LaTeX row terminator.
score_csv = dev_max[['mem_type', 'devf', 'testf']].to_csv(index=False)
rows = score_csv.replace(',', ' & ').strip('\n').split('\n')

for row in rows:
    # A leading backslash is prepended to every row, header included.
    print("\\", row, "  \\\\", sep="")

\mem_type & devf & testf  \\
\learned & 74.0 & 73.3  \\
\learned & 77.1 & 76.8  \\
\learned & 78.1 & 78.2  \\
\learned & 78.2 & 78.5  \\
\lru & 69.8 & 69.6  \\
\lru & 75.9 & 75.5  \\
\lru & 78.2 & 77.8  \\
\lru & 78.6 & 78.5  \\
\unbounded & 78.4 & 78.1  \\


In [195]:
# Render the per-metric scores as LaTeX table rows. to_csv writes each tuple
# cell as a quoted "(a, b, c)" string; turning commas into ' & ' and dropping
# the quote/parenthesis wrappers spreads every tuple over three columns.
metric_csv = dev_max[['mem_type', 'MUC', 'Bcub', 'CEAFE', 'testf']].to_csv(index=False)
for old, new in ((',', ' & '), ('"(', ' '), (')"', ' ')):
    metric_csv = metric_csv.replace(old, new)
rows = metric_csv.strip('\n').split('\n')

for row in rows:
    # A leading backslash is prepended to every row, header included.
    print("\\", row, "  \\\\", sep="")

\mem_type & MUC & Bcub & CEAFE & testf  \\
\learned &  76.4 &  86.2 &  81.0  &  66.4 &  78.4 &  71.9  &  62.0 &  72.7 &  66.9  & 73.3  \\
\learned &  81.7 &  85.9 &  83.8  &  72.8 &  77.9 &  75.3  &  67.0 &  76.4 &  71.4  & 76.8  \\
\learned &  83.2 &  86.2 &  84.7  &  74.8 &  78.9 &  76.8  &  70.0 &  76.7 &  73.2  & 78.2  \\
\learned &  83.8 &  85.6 &  84.7  &  76.1 &  78.2 &  77.1  &  70.4 &  77.1 &  73.6  & 78.5  \\
\lru &  72.0 &  85.7 &  78.3  &  60.1 &  78.9 &  68.2  &  57.0 &  68.9 &  62.4  & 69.6  \\
\lru &  80.1 &  85.7 &  82.8  &  70.5 &  78.3 &  74.2  &  66.0 &  73.4 &  69.5  & 75.5  \\
\lru &  82.8 &  85.9 &  84.3  &  74.8 &  78.3 &  76.5  &  68.0 &  77.4 &  72.4  & 77.8  \\
\lru &  84.0 &  85.2 &  84.6  &  76.2 &  78.2 &  77.2  &  72.1 &  75.6 &  73.8  & 78.5  \\
\unbounded &  84.6 &  84.1 &  84.3  &  77.2 &  76.2 &  76.7  &  72.5 &  74.3 &  73.4  & 78.1  \\
