In [113]:
import os
from os import path
import glob
import pandas as pd
import json
from collections import defaultdict

In [114]:
# Location of the per-run performance JSON files and the job-id prefix that
# selects which of them to load.
perf_dir = "/share/data/speech/shtoshni/research/litbank_coref/models/perf/"
slurm_id = "6550136"  # alternative id: "6549853"


def _perf_file_index(perf_file):
    """Return the integer that follows the first underscore in the file's stem."""
    stem, _extension = path.splitext(path.basename(perf_file))
    return int(stem.split('_')[1])


# Every file whose name starts with the id, ordered numerically by that index
# (a plain lexicographic sort would place 10 before 2).
files = glob.glob(path.join(perf_dir, slurm_id + "*"))
files.sort(key=_perf_file_index)

print(len(files))

100


In [115]:
# Parse every perf file into a dict. The context manager closes each handle
# as soon as it has been read instead of leaving that to garbage collection.
model_dict_list = []
for file in files:
    with open(file) as perf_f:
        model_dict_list.append(json.load(perf_f))

In [116]:
def determine_varying_attributes(model_dict_list, ignore_attribs=('train', 'test', 'test_1', 'dev', 'dev_1', 'pretrained_mention_model',
                                                                  'conll_data_dir', 'slurm_id', 'best_model_dir', 'data_dir')):
    """Return the attributes that take more than one distinct value across runs.

    Args:
        model_dict_list: iterable of dicts mapping attribute name -> value.
        ignore_attribs: container of attribute names that are never reported.

    Returns:
        List of attribute names with at least two distinct values, in order of
        first appearance. Attributes whose values are dicts or are otherwise
        unhashable (e.g. lists) are never reported.
    """
    attrib_to_vals = defaultdict(set)
    for model_dict in model_dict_list:
        for attrib, val in model_dict.items():
            # Test the *value*: nested dicts cannot be compared through a set.
            if attrib in ignore_attribs or isinstance(val, dict):
                continue
            try:
                attrib_to_vals[attrib].add(val)
            except TypeError:
                # Other unhashable values (e.g. lists) are deliberately skipped.
                continue

    return [attrib for attrib, vals in attrib_to_vals.items() if len(vals) > 1]

In [117]:
# Optional per-metric columns for the results table; left empty here.
# perf_attribs = ['MUC' , 'Bcub', 'CEAFE']
perf_attribs = []

# Hyperparameters that differ between the loaded runs.
varying_attribs = determine_varying_attributes(model_dict_list)
print(varying_attribs)

['model_dir', 'cross_val_split', 'alpha_ontonotes', 'sample_ontonotes_prob']


### Load all dev and test f-scores

In [118]:
# Collect one record per run and build the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
perf_records = []
for model_dict in model_dict_list:
    perf_dict = {attrib: model_dict[attrib] for attrib in varying_attribs}

    # Optional per-metric (recall, precision, fscore) triples; '-' marks a
    # metric that is absent from this run's 'test' results.
    for perf_attrib in perf_attribs:
        if perf_attrib in model_dict['test']:
            attrib_dict = model_dict['test'][perf_attrib]
            perf_dict[perf_attrib] = (attrib_dict['recall'], attrib_dict['precision'], attrib_dict['fscore'])
        else:
            perf_dict[perf_attrib] = '-'

    perf_dict['devf'] = model_dict['dev_litbank']['fscore']
    perf_dict['fs'] = model_dict['test_litbank']['fscore']
    perf_records.append(perf_dict)

# Passing `columns` fixes the column order and keeps the columns present even
# when no runs were loaded.
perf_df = pd.DataFrame(perf_records, columns=(varying_attribs + ['devf', 'fs'] + perf_attribs))

### Compare mean dev f-scores across hyperparameter settings

In [119]:
# Shorter labels for verbose hyperparameter columns.
short_column_names = {"label_smoothing_wt": "ls_wt", "sample_invalid": "samp", "max_training_segments": "segs"}
perf_df = perf_df.rename(columns=short_column_names)

# Mean dev f-score for every (alpha_ontonotes, sample_ontonotes_prob) pair.
config_columns = ['alpha_ontonotes', 'sample_ontonotes_prob']
perf_df.groupby(config_columns)['devf'].mean()

alpha_ontonotes  sample_ontonotes_prob
0.00             0.00                     77.34
                 0.05                     77.70
                 0.10                     77.93
                 0.25                     77.50
0.10             0.05                     77.67
                 0.10                     77.67
                 0.25                     77.64
0.25             0.05                     77.52
                 0.10                     77.81
                 0.25                     77.63
Name: devf, dtype: float64

In [120]:
# Mean dev f-score for every (alpha_ontonotes, sample_ontonotes_prob) pair.
mean_devf_by_config = perf_df.groupby(['alpha_ontonotes', 'sample_ontonotes_prob'])['devf'].mean()
mean_devf_by_config

alpha_ontonotes  sample_ontonotes_prob
0.00             0.00                     77.34
                 0.05                     77.70
                 0.10                     77.93
                 0.25                     77.50
0.10             0.05                     77.67
                 0.10                     77.67
                 0.25                     77.64
0.25             0.05                     77.52
                 0.10                     77.81
                 0.25                     77.63
Name: devf, dtype: float64

In [121]:
# Show the runs where both OntoNotes hyperparameters are 0.0.
alpha_is_zero = perf_df['alpha_ontonotes'] == 0.0
sample_prob_is_zero = perf_df['sample_ontonotes_prob'] == 0.0
print(perf_df[alpha_is_zero & sample_prob_is_zero])

                                           model_dir cross_val_split  \
0  /share/data/speech/shtoshni/research/litbank_c...               0   
1  /share/data/speech/shtoshni/research/litbank_c...               1   
2  /share/data/speech/shtoshni/research/litbank_c...               2   
3  /share/data/speech/shtoshni/research/litbank_c...               3   
4  /share/data/speech/shtoshni/research/litbank_c...               4   
5  /share/data/speech/shtoshni/research/litbank_c...               5   
6  /share/data/speech/shtoshni/research/litbank_c...               6   
7  /share/data/speech/shtoshni/research/litbank_c...               7   
8  /share/data/speech/shtoshni/research/litbank_c...               8   
9  /share/data/speech/shtoshni/research/litbank_c...               9   

   alpha_ontonotes  sample_ontonotes_prob  devf    fs  
0              0.0                    0.0  76.9  74.7  
1              0.0                    0.0  75.6  79.2  
2              0.0                    0

### Get location of all conll output files

In [122]:
# Which split's prediction files to gather.
SPLIT = 'dev'


# One (config, conll_files, json_files) entry per hyperparameter pair, with
# the per-split files listed in cross_val_split order 0..9.
model_config_to_conll_files = []
config_groups = perf_df.groupby(['alpha_ontonotes', 'sample_ontonotes_prob']).groups
for pair_val, indices in config_groups.items():
    # Skip configurations that do not have exactly 10 runs.
    if len(indices) != 10:
        continue
    print(pair_val)

    alpha_matches = perf_df['alpha_ontonotes'] == pair_val[0]
    sample_prob_matches = perf_df['sample_ontonotes_prob'] == pair_val[1]
    config_df = perf_df.loc[alpha_matches & sample_prob_matches]

    # Model directory of the first row found for each split index.
    split_model_dirs = [
        config_df.loc[config_df['cross_val_split'] == split_idx]['model_dir'].values[0]
        for split_idx in range(10)
    ]
    conll_files = [path.join(split_dir, f'{SPLIT}_litbank.conll') for split_dir in split_model_dirs]
    json_files = [path.join(split_dir, f'{SPLIT}_litbank.log.jsonl') for split_dir in split_model_dirs]

    model_config_to_conll_files.append((pair_val, conll_files, json_files))

(0.0, 0.0)
(0.0, 0.05)
(0.0, 0.1)
(0.0, 0.25)
(0.1, 0.05)
(0.1, 0.1)
(0.1, 0.25)
(0.25, 0.05)
(0.25, 0.1)
(0.25, 0.25)


### Concatenate the cross-validation CoNLL and JSONL files for each configuration

In [123]:
output_dir = "../models/litbank_preds/"
# exist_ok=True replaces the separate path.exists() check, which could race
# with another process creating the directory in between.
os.makedirs(output_dir, exist_ok=True)


def _concat_text_files(input_files, output_file):
    """Write the contents of `input_files`, in order, to `output_file` (overwriting it)."""
    with open(output_file, "w") as output_w:
        for input_file in input_files:
            with open(input_file) as input_r:
                output_w.writelines(input_r)


# (config, concatenated CoNLL path) pairs.
model_config_output_file_list = []
for model_config, conll_files, jsonl_files in model_config_to_conll_files:
    file_prefix = f'{model_config[0]}_{model_config[1]}_{SPLIT}'
    conll_output_file = path.join(output_dir, file_prefix + '.conll')
    jsonl_output_file = path.join(output_dir, file_prefix + '.jsonl')

    model_config_output_file_list.append((model_config, conll_output_file))

    # Merge the per-split files into one file per configuration.
    _concat_text_files(conll_files, conll_output_file)
    _concat_text_files(jsonl_files, jsonl_output_file)
                    
        

#### Set up the coref evaluation script path and the gold CoNLL file

In [124]:
import sys
import subprocess
import re

# Gold CoNLL file for the selected SPLIT (machine-specific absolute path).
gold_conll = f"/home/shtoshni/Research/litbank_coref/data/litbank/all.{SPLIT}.conll"
# Perl scorer script that get_coref_score runs through `perl`.
scorer_path = "/home/shtoshni/Research/litbank_coref/lrec2020-coref/reference-coreference-scorers/scorer.pl"

def get_coref_score(metric, path_to_scorer, gold=None, preds=None):
    """Run the Perl coreference scorer and return its summary scores.

    Args:
        metric: metric name passed to the scorer as its first argument.
        path_to_scorer: path of the scorer script, executed with `perl`.
        gold: path of the gold CoNLL file.
        preds: path of the predicted CoNLL file.

    Returns:
        Tuple (recall, precision, f1) of floats parsed from the scorer's
        "Coreference:" summary line.

    Raises:
        subprocess.CalledProcessError: if the scorer exits with a non-zero status.
        ValueError: if the summary line is missing or has an unexpected format.
    """
    output = subprocess.check_output(["perl", path_to_scorer, metric, preds, gold]).decode("utf-8")
    lines = output.split("\n")
    # The summary is read from the third line from the end of the output.
    summary = lines[-3] if len(lines) >= 3 else ""
    # Raw string: the escaped parentheses are regex syntax, and the tab
    # separators between the fields are spelled out as \t.
    matcher = re.search(r"Coreference: Recall: \(.*?\) (.*?)%\tPrecision: \(.*?\) (.*?)%\tF1: (.*?)%", summary)
    if matcher is None:
        # Previously this fell through to `return` with the score variables unbound.
        raise ValueError(
            f"Could not parse scorer output for metric {metric!r}; expected summary line, got: {summary!r}"
        )
    recall, precision, f1 = (float(matcher.group(idx)) for idx in (1, 2, 3))
    return recall, precision, f1

In [125]:
# Print one table row per configuration: the two hyperparameter values, then
# recall / precision / f-score for each metric, then the mean f-score.
metrics = ['MUC', 'Bcub', 'CEAFE']
for model_config, conll_file in model_config_output_file_list:
    first_val, second_val = model_config[0], model_config[1]
    # The row label starts with a literal backslash.
    print(f"\\{first_val} & {second_val}", end="")

    fscore_list = []
    for metric in metrics:
        # The scorer is called with the lower-cased metric name.
        scores = get_coref_score(metric.lower(), scorer_path, gold_conll, conll_file)
        recall, precision, fscore = scores
        print(f" & {recall:.1f} & {precision:.1f} & {fscore:.1f} ", end="")
        fscore_list.append(fscore)

    mean_fscore = sum(fscore_list) / len(fscore_list)
    print(f"& {mean_fscore: .1f}")


\0.0 & 0.0 & 90.1 & 87.2 & 88.6  & 79.7 & 75.0 & 77.3  & 67.6 & 64.8 & 66.2 &  77.4
\0.0 & 0.05 & 90.6 & 87.3 & 89.0  & 80.2 & 75.7 & 77.8  & 66.9 & 66.0 & 66.4 &  77.7
\0.0 & 0.1 & 90.8 & 87.5 & 89.1  & 80.7 & 75.7 & 78.1  & 67.0 & 66.3 & 66.7 &  78.0
\0.0 & 0.25 & 90.6 & 87.3 & 88.9  & 80.1 & 75.2 & 77.6  & 66.5 & 65.7 & 66.1 &  77.5
\0.1 & 0.05 & 90.4 & 87.4 & 88.9  & 79.8 & 75.8 & 77.7  & 66.7 & 66.3 & 66.5 &  77.7
\0.1 & 0.1 & 90.7 & 87.1 & 88.9  & 80.5 & 75.1 & 77.7  & 67.6 & 65.5 & 66.5 &  77.7
\0.1 & 0.25 & 90.6 & 87.4 & 89.0  & 80.0 & 75.6 & 77.8  & 67.0 & 65.5 & 66.3 &  77.7
\0.25 & 0.05 & 90.6 & 87.3 & 89.0  & 80.4 & 75.0 & 77.6  & 67.0 & 65.2 & 66.1 &  77.5
\0.25 & 0.1 & 90.5 & 87.4 & 88.9  & 79.8 & 75.7 & 77.7  & 67.2 & 66.6 & 66.9 &  77.8
\0.25 & 0.25 & 90.3 & 87.5 & 88.9  & 79.6 & 75.6 & 77.5  & 67.3 & 65.6 & 66.5 &  77.6
