In [1]:
import json
import os
from collections import Counter
from os import path

import numpy as np
from scipy.stats import spearmanr

from metrics import CorefEvaluator

In [2]:
# Directory holding the prediction logs (one JSONL file per model configuration).
# Set the COREF_LOG_DIR environment variable to point the notebook at another
# location; the original hard-coded path is kept as the fallback default.
log_dir = os.environ.get(
    "COREF_LOG_DIR",
    "/home/shtoshni/Research/litbank_coref/models/ontonotes_preds",
)

# Model variants to analyse, and the memory sizes used for every model other than
# the unbounded ones. Sizes are strings because they are spliced into file names.
models = ["unbounded", "unbounded_no_ignore", "learned", "lru"]
num_cells = ["5", "10", "20"]


In [3]:
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the file to read; every non-blank line must hold
            one JSON value.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly so decoding does not depend on the machine's locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly instead of materialising it with readlines().
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. an extra newline at the end of the file).
            if line:
                records.append(json.loads(line))

    return records

In [4]:
# Load the dev-set prediction logs for every model configuration.
# Each unbounded variant has a single log (the "20" in its file name is just the
# default num_cells); every other model has one log per memory size.
unbounded_log_names = {
    'unbounded': "unbounded_20_dev.jsonl",
    'unbounded_no_ignore': "unbounded_20_no_ignore_dev.jsonl",
}

model_to_logs = {}
for model in models:
    if model in unbounded_log_names:
        log_names = {model: unbounded_log_names[model]}
    else:
        log_names = {f"{model}_{num_cell}": f"{model}_{num_cell}_dev.jsonl"
                     for num_cell in num_cells}

    # Keys are inserted in the same order as before, so later cells print the
    # configurations in an unchanged order.
    for config_name, log_name in log_names.items():
        model_to_logs[config_name] = load_jsonl(path.join(log_dir, log_name))
        

In [5]:
# Tally how often each action label is predicted by every model configuration.
# Counter comes from the notebook's top import cell.
for model, log_data in model_to_logs.items():
    # Read the label at index 1 of each entry (the same indexing the later cells
    # use) instead of unpacking zip(*...), which raises ValueError for a document
    # that has no predicted actions.
    action_counter = Counter(
        entry[1]
        for example in log_data
        for entry in example["pred_actions"]
    )

    print(model, action_counter)
        

unbounded Counter({'i': 53774, 'c': 14468, 'o': 5505})
unbounded_no_ignore Counter({'o': 59323, 'c': 14424})
learned_5 Counter({'i': 55158, 'c': 12864, 'o': 5617, 'n': 108})
learned_10 Counter({'i': 55775, 'c': 13607, 'o': 4365})
learned_20 Counter({'i': 55020, 'c': 13826, 'o': 4901})
lru_5 Counter({'i': 54612, 'c': 12162, 'o': 5690, 'n': 1283})
lru_10 Counter({'i': 54975, 'c': 13431, 'o': 5213, 'n': 128})
lru_20 Counter({'i': 54951, 'c': 13850, 'o': 4927, 'n': 19})


### Mentions Ignored: Learned vs. LRU

In [6]:
# For every model configuration, count the predicted actions labelled 'n' in each
# document and report the per-document average.
for model, logs in model_to_logs.items():
    # The previous version of this cell also parsed the memory size into a global
    # named `num_cells`. The value was never used here, and the assignment replaced
    # the list of memory sizes from the configuration cell, so re-running the
    # log-loading cell afterwards failed. It is intentionally not reproduced.
    n_action_counts = [
        sum(1 for action in log["pred_actions"] if action[1] == 'n')
        for log in logs
    ]

    print ('{}, avg: {:.1f}'.format(model, np.mean(n_action_counts)))

unbounded, avg: 0.0
unbounded_no_ignore, avg: 0.0
learned_5, avg: 0.3
learned_10, avg: 0.0
learned_20, avg: 0.0
lru_5, avg: 3.7
lru_10, avg: 0.4
lru_20, avg: 0.1


### Number of Entities in Memory 

In [7]:
# Per-document memory usage: the number of 'o' actions, capped at the memory size
# for every model whose name does not contain 'unbounded'.
for model, logs in model_to_logs.items():
    # Names of the capped configurations end in their memory size (e.g. "lru_10").
    # Parse it once per model into a local name, so the `num_cells` list from the
    # configuration cell is not overwritten with an int.
    cell_limit = None if 'unbounded' in model else int(model.split('_')[-1])

    mem_usage = []
    for log in logs:
        o_action_count = sum(1 for action in log["pred_actions"] if action[1] == 'o')
        if cell_limit is not None:
            o_action_count = min(o_action_count, cell_limit)
        mem_usage.append(o_action_count)

    print ('{}, avg: {:.1f}, max: {}'.format(model, np.mean(mem_usage), max(mem_usage)))
    


unbounded, avg: 16.0, max: 83
unbounded_no_ignore, avg: 173.0, max: 962
learned_5, avg: 4.6, max: 5
learned_10, avg: 7.8, max: 10
learned_20, avg: 12.1, max: 20
lru_5, avg: 4.6, max: 5
lru_10, avg: 7.9, max: 10
lru_20, avg: 11.9, max: 20


## Spearman Correlations

In [9]:
def mention_to_cluster(clusters, threshold=1):
    """Freeze clusters into tuples and index them by mention.

    Clusters with fewer than ``threshold`` mentions are dropped.

    Args:
        clusters: Iterable of clusters, each a sequence of mentions (sequences).
        threshold: Minimum number of mentions a cluster needs to be kept.

    Returns:
        A pair ``(kept_clusters, lookup)``. ``kept_clusters`` is a list of
        clusters, each a tuple of mention tuples. ``lookup`` maps every mention
        tuple to the cluster tuple that contains it (the last such cluster when
        a mention occurs in several).
    """
    kept_clusters = []
    lookup = {}
    for raw_cluster in clusters:
        if len(raw_cluster) < threshold:
            continue
        frozen = tuple(tuple(span) for span in raw_cluster)
        kept_clusters.append(frozen)
        for span in frozen:
            lookup[span] = frozen
    return kept_clusters, lookup


def _score_example(example):
    """Score one logged document with its own CorefEvaluator.

    Returns a tuple ``(doc_len, num_ents, fscore, example)``: the last entry of
    ``subtoken_map`` plus one, the number of gold clusters kept, the third
    element of ``get_prf()`` scaled by 100, and the example itself.
    """
    # Only clusters with at least two mentions are passed to the evaluator.
    predicted_clusters, mention_to_predicted = mention_to_cluster(
        example["predicted_clusters"], threshold=2)
    gold_clusters, mention_to_gold = mention_to_cluster(
        example["clusters"], threshold=2)

    # A fresh evaluator per document keeps the score document-level.
    evaluator = CorefEvaluator()
    evaluator.update(predicted_clusters, gold_clusters,
                     mention_to_predicted, mention_to_gold)

    doc_len = example["subtoken_map"][-1] + 1
    example_fscore = evaluator.get_prf()[2] * 100.0
    return doc_len, len(gold_clusters), example_fscore, example


# One list of per-document results for every model configuration.
model_perf_per_example = {
    model: [_score_example(example) for example in log_data]
    for model, log_data in model_to_logs.items()
}
        

### Correlations with Document Length and Number of Entities

In [10]:
# Spearman rank correlation of the per-document F-score with document length and
# with the number of gold entities, for every model configuration.
for model, perf_list in model_perf_per_example.items():
    # Transpose the per-document tuples into columns; the raw examples are unused.
    doc_len_list, num_ent_list, fscore, _ = zip(*perf_list)

    # spearmanr returns (correlation, p-value); only the correlation is reported.
    doc_len_corr = spearmanr(doc_len_list, fscore)[0]
    num_ent_corr = spearmanr(num_ent_list, fscore)[0]

    print('{} doc len {:.2f} num ent {:.2f}'.format(model, doc_len_corr, num_ent_corr))

unbounded doc len -0.31 num ent -0.28
unbounded_no_ignore doc len -0.28 num ent -0.25
learned_5 doc len -0.36 num ent -0.37
learned_10 doc len -0.34 num ent -0.33
learned_20 doc len -0.34 num ent -0.31
lru_5 doc len -0.37 num ent -0.41
lru_10 doc len -0.29 num ent -0.30
lru_20 doc len -0.31 num ent -0.29


In [11]:
# For each memory size, list the three documents on which the learned and the LRU
# models differ most in per-document F-score, in either direction.
# The loop variable is deliberately not called `num_cells`, so the list of memory
# sizes from the configuration cell is not overwritten.
for cell_count in [5, 10, 20]:
    perf_learned_list = model_perf_per_example[f'learned_{cell_count}']
    perf_lru_list = model_perf_per_example[f'lru_{cell_count}']

    # zip() would silently drop unmatched documents, so check the lengths first.
    assert len(perf_learned_list) == len(perf_lru_list)

    learned_better = []
    lru_better = []
    for perf_learned, perf_lru in zip(perf_learned_list, perf_lru_list):
        # Both logs must list the same documents in the same order.
        assert perf_learned[3]['doc_key'] == perf_lru[3]['doc_key']

        # Index 2 is the per-document F-score, index 3 the logged example.
        diff_learned = perf_learned[2] - perf_lru[2]
        if diff_learned > 0:
            learned_better.append((diff_learned, perf_learned[3]))
        elif diff_learned < 0:
            lru_better.append((-diff_learned, perf_lru[3]))

    print(f'Num cells {cell_count}')
    for heading, candidates in (('Learned better docs', learned_better),
                                ('LRU better docs', lru_better)):
        print(heading)
        # Largest F-score gap first; show only the top three documents.
        ranked = sorted(candidates, key=lambda x: x[0], reverse=True)
        for (diff_score, example) in ranked[:3]:
            print(f'{example["doc_key"]}, {diff_score:.1f}')
    print()

Num cells 5
Learned better docs
nw/wsj/24/wsj_2401_0, 33.3
bn/voa/01/voa_0110_0, 28.0
bn/voa/00/voa_0040_0, 25.5
LRU better docs
nw/xinhua/01/chtb_0170_0, 20.1
nw/xinhua/02/chtb_0210_0, 15.6
nw/xinhua/01/chtb_0130_0, 13.0

Num cells 10
Learned better docs
nw/wsj/24/wsj_2401_0, 33.3
bn/voa/01/voa_0100_0, 30.0
bn/cnn/02/cnn_0230_0, 21.8
LRU better docs
nw/wsj/24/wsj_2440_0, 13.2
bc/msnbc/00/msnbc_0000_1, 11.8
nw/wsj/24/wsj_2423_0, 11.0

Num cells 20
Learned better docs
bn/cnn/00/cnn_0060_0, 22.7
bn/voa/00/voa_0090_0, 16.8
nw/xinhua/00/chtb_0050_0, 16.6
LRU better docs
nw/xinhua/01/chtb_0130_0, 14.2
nw/xinhua/00/chtb_0030_0, 13.5
bc/msnbc/00/msnbc_0000_10, 12.6

