In [19]:
import os
from os import path
import numpy as np
import json

from metrics import CorefEvaluator
from scipy.stats import spearmanr

In [20]:
# Directory that holds the per-run prediction logs (files named log_<N>.jsonl).
# Set the LITBANK_LOG_DIR environment variable to point at another copy of the
# logs; the original hard-coded location is kept as the fallback so existing
# setups keep working unchanged.
log_dir = os.environ.get(
    "LITBANK_LOG_DIR",
    "/home/shtoshni/Research/litbank_coref/models/litbank_logs",
)

# Model families analysed in this notebook.
models = ["unbounded", "learned", "lru"]
# Cell-count settings for the 'learned' and 'lru' families. Kept as strings
# because they are only interpolated into model names such as 'learned_5'.
num_cells = ["5", "10", "20"]


In [21]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: path of the file to read; each non-blank line must hold
            one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    # JSON text is UTF-8 by convention; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the file is streamed line by line
        # instead of being materialised with readlines().
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would make json.loads fail, so skip them.
            if not line:
                continue
            data.append(json.loads(line))

    return data


def mention_to_cluster(clusters, threshold=2):
    """Freeze clusters into tuples and index every mention by its cluster.

    Args:
        clusters: sequence of clusters, each a sequence of mentions; every
            mention is itself an iterable that gets converted to a tuple.
        threshold: minimum number of mentions a cluster needs to be kept.

    Returns:
        A pair ``(kept_clusters, lookup)``. ``kept_clusters`` is a list of
        clusters as tuples of mention tuples; ``lookup`` maps each mention
        tuple to the cluster tuple containing it. When a mention occurs in
        several kept clusters, the last such cluster wins.
    """
    frozen_clusters = []
    for cluster in clusters:
        if len(cluster) >= threshold:
            frozen_clusters.append(tuple(tuple(mention) for mention in cluster))

    lookup = {
        mention: cluster
        for cluster in frozen_clusters
        for mention in cluster
    }
    return frozen_clusters, lookup

def get_cluster_spans(coref_chains):
    """Compute the overall extent of every coreference chain.

    Args:
        coref_chains: iterable of chains, each an iterable of
            ``(span_start, span_end)`` mention pairs.

    Returns:
        A list with one ``(min_start, max_end)`` tuple per chain: the
        smallest start and the largest end over the chain's mentions. A chain
        without mentions yields the sentinel pair ``(1e9, 0)``.
    """
    def _extent(chain):
        # Materialise once so a chain that is a one-shot iterator still works.
        mentions = [(start, end) for (start, end) in chain]
        # The trailing sentinels reproduce the initial values of the running
        # min/max: 1e9 as the start upper bound, 0 as the end lower bound.
        earliest = min([start for start, _ in mentions] + [1e9])
        latest = max([end for _, end in mentions] + [0])
        return (earliest, latest)

    return [_extent(chain) for chain in coref_chains]

def get_max_active_chains(cluster_spans):
    """Find the largest number of chains that are active at the same position.

    A chain counts as active at every position from its ``span_start`` to its
    ``span_end``, both inclusive.

    Args:
        cluster_spans: non-empty sequence of ``(span_start, span_end)`` pairs,
            one per chain, using non-negative integer positions.

    Returns:
        ``(max_active, position)``: the maximum number of simultaneously
        active chains and the first position at which that maximum occurs
        (NumPy integer scalars).

    Raises:
        ValueError: if ``cluster_spans`` is empty.
    """
    last_mention_idx = max(span_end for _, span_end in cluster_spans)
    num_clusters = len(cluster_spans)
    # Positions run from 0 to last_mention_idx inclusive, so the grid needs
    # last_mention_idx + 1 columns. With one column fewer the final position
    # is silently clipped by the slice assignment below and never counted.
    num_positions = last_mention_idx + 1

    # 0/1 indicator: active_chains[c, t] == 1 iff chain c is active at position t
    active_chains = np.zeros((num_clusters, num_positions), dtype=np.int32)
    for idx, (span_start, span_end) in enumerate(cluster_spans):
        active_chains[idx, span_start:span_end + 1] = 1

    # Sum over chains (axis 0) to get the number of active chains per position.
    active_chain_sum = np.sum(active_chains, axis=0)
    assert num_positions == len(active_chain_sum)  # Just to make sure summing over right dimension

    return np.max(active_chain_sum), np.argmax(active_chain_sum)

In [22]:
# Load logs here
#
# Maps a model name ('unbounded', 'learned_<cells>', 'lru_<cells>') to the
# concatenated records of its ten log files. File numbering as hard-coded in
# this notebook:
#   unbounded    -> log_1 .. log_10
#   learned_<c>  -> starts at 21 + (position of <c> in num_cells), then every 3rd file
#   lru_<c>      -> starts at 81 + (position of <c> in num_cells), then every 3rd file
model_to_logs = {}
for model in models:
    if model == 'unbounded':
        records = model_to_logs['unbounded'] = []
        for log_id in range(1, 11):
            records.extend(load_jsonl(path.join(log_dir, f"log_{log_id}.jsonl")))

    elif model in ('learned', 'lru'):
        base_offset = 20 if model == 'learned' else 80
        for cell_pos, num_cell in enumerate(num_cells):
            model_name = f'{model}_{num_cell}'
            records = model_to_logs[model_name] = []
            first_log_id = base_offset + cell_pos + 1
            # Ten files per configuration, interleaved with a stride of 3.
            for log_id in range(first_log_id, first_log_id + 30, 3):
                records.extend(load_jsonl(path.join(log_dir, f"log_{log_id}.jsonl")))

In [34]:
from collections import Counter

# Tally, per model, how often each predicted action label occurs over all
# logged documents. Every entry of example["pred_actions"] is a pair whose
# second element is the action label.
for model, log_data in model_to_logs.items():
    action_counter = Counter()
    for example in log_data:
        _, pred_actions = zip(*example["pred_actions"])
        action_counter.update(pred_actions)

    print(model, action_counter)
        

unbounded Counter({'i': 40502, 'c': 21281, 'o': 9687})
learned_5 Counter({'i': 41953, 'c': 19113, 'o': 8573, 'n': 1831})
learned_10 Counter({'i': 42925, 'c': 20350, 'o': 8195})
learned_20 Counter({'i': 42934, 'c': 20575, 'o': 7961})
lru_5 Counter({'i': 40953, 'c': 17699, 'n': 8319, 'o': 4499})
lru_10 Counter({'i': 40807, 'c': 19150, 'o': 8067, 'n': 3446})
lru_20 Counter({'i': 41249, 'c': 20134, 'o': 9392, 'n': 695})


In [23]:
## Calculate Statistics for docs
# doc_key -> [max number of simultaneously active chains,
#             cluster sizes sorted in descending order]
# Computed from the "clusters" field of the unbounded model's log records.
doc_key_to_stat = {}
for example in model_to_logs['unbounded']:
    doc_clusters = example["clusters"]
    peak_active, _ = get_max_active_chains(get_cluster_spans(doc_clusters))
    cluster_sizes_desc = sorted(map(len, doc_clusters), reverse=True)
    doc_key_to_stat[example["doc_key"]] = [peak_active, cluster_sizes_desc]

In [24]:
# model name -> list of (doc_len, num_ents, score, example) tuples, one per
# logged document, in log order.
model_perf_per_example = {}
for model, log_data in model_to_logs.items():
    perf_list = []
    for example in log_data:
        # A fresh evaluator per document gives a per-document score.
        evaluator = CorefEvaluator()

        # threshold=1 keeps singleton clusters on both sides.
        pred_clusters, pred_lookup = mention_to_cluster(
            example["predicted_clusters"], threshold=1)
        gold_clusters, gold_lookup = mention_to_cluster(
            example["clusters"], threshold=1)

        evaluator.update(pred_clusters, gold_clusters, pred_lookup, gold_lookup)

        perf_list.append((
            example["subtoken_map"][-1] + 1,   # doc_len: last subtoken_map entry + 1
            len(gold_clusters),                # num_ents: number of gold clusters
            # third element of get_prf(), scaled to a percentage
            # (presumably the F-score -- confirm against metrics.CorefEvaluator)
            evaluator.get_prf()[2] * 100.0,
            example,
        ))

    model_perf_per_example[model] = perf_list
        

In [25]:
# For each cell count, list the documents where the 'learned' model beats the
# 'lru' model by the widest score margin, and vice versa (top 3 each).
#
# The loop variable is deliberately NOT called `num_cells`: that name is the
# module-level list of cell-count strings used when loading the logs, and
# rebinding it here would break a re-run of the loading cell.
for cell_count in [5, 10, 20]:
    perf_learned_list = model_perf_per_example[f'learned_{cell_count}']
    perf_lru_list = model_perf_per_example[f'lru_{cell_count}']
    # Entries are compared pairwise by position, so both lists must line up.
    assert len(perf_learned_list) == len(perf_lru_list)

    learned_better = []
    lru_better = []
    for perf_learned, perf_lru in zip(perf_learned_list, perf_lru_list):
        # Each entry is (doc_len, num_ents, score, example).
        assert perf_learned[3]['doc_key'] == perf_lru[3]['doc_key']

        diff_learned = perf_learned[2] - perf_lru[2]
        if diff_learned > 0:
            learned_better.append((diff_learned, perf_learned[3]))
        elif diff_learned < 0:
            lru_better.append((-diff_learned, perf_lru[3]))
        # Ties are reported on neither side.

    print(f'Num cells {cell_count}')
    print('Learned')
    learned_better = sorted(learned_better, key=lambda x: x[0], reverse=True)
    for (diff_score, example) in learned_better[:3]:
        print(example["doc_key"], diff_score)
    print('LRU')
    lru_better = sorted(lru_better, key=lambda x: x[0], reverse=True)
    for (diff_score, example) in lru_better[:3]:
        print(example["doc_key"], diff_score)
    print()

Num cells 5
Learned
8867_the_magnificent_ambersons_brat_0 56.62002043933275
1695_the_man_who_was_thursday_a_nightmare_brat_0 32.18289714555622
6593_history_of_tom_jones_a_foundling_brat_0 30.993628443966927
LRU
27_far_from_the_madding_crowd_brat_0 2.445963041747447
711_allan_quatermain_brat_0 0.2874241387266352
766_david_copperfield_brat_0 0.05632215920590511

Num cells 10
Learned
8867_the_magnificent_ambersons_brat_0 53.86864083109797
1695_the_man_who_was_thursday_a_nightmare_brat_0 29.38083563288459
76_adventures_of_huckleberry_finn_brat_0 25.365012970136277
LRU
174_the_picture_of_dorian_gray_brat_0 4.801842814451007
940_the_last_of_the_mohicans_a_narrative_of_1757_brat_0 2.629418497926274
367_country_of_the_pointed_firs_brat_0 0.30790983877787426

Num cells 20
Learned
8867_the_magnificent_ambersons_brat_0 38.46343589614495
76_adventures_of_huckleberry_finn_brat_0 18.9291788372921
514_little_women_brat_0 14.422134104461918
LRU
215_the_call_of_the_wild_brat_0 4.55510604288159
367_coun

### Mentions Ignored: Learned vs. LRU

In [26]:
# Per model: maximum and mean number of predicted actions labelled 'n' per
# document.
# NOTE(review): 'n' presumably marks a mention that was ignored, going by the
# section heading -- confirm against the model's action vocabulary.
#
# The count is reported uncapped, so the cell count encoded in the model name
# is not needed here. It is no longer parsed into `num_cells`, which would
# clobber the module-level list used when loading the logs.
for model, logs in model_to_logs.items():
    n_action_counts = [
        sum(1 for action in log["pred_actions"] if action[1] == 'n')
        for log in logs
    ]

    print ('{}, max: {}, avg: {:.1f}'.format(model, max(n_action_counts), np.mean(n_action_counts)))

unbounded, max: 0, avg: 0.0
learned_5, max: 56, avg: 18.3
learned_10, max: 0, avg: 0.0
learned_20, max: 0, avg: 0.0
lru_5, max: 196, avg: 83.2
lru_10, max: 178, avg: 34.5
lru_20, max: 86, avg: 7.0


## Spearman Correlations

### Correlations with document length and number of entities

In [27]:
# Spearman rank correlation of the per-document score with (a) document
# length and (b) number of gold clusters, reported per model.
for model, perf_list in model_perf_per_example.items():
    doc_lens, ent_counts, fscores, _ = zip(*perf_list)

    rho_doc_len = spearmanr(doc_lens, fscores)[0]
    rho_num_ent = spearmanr(ent_counts, fscores)[0]
    print('{} doc len {:.2f} num ent {:.2f}'.format(model, rho_doc_len, rho_num_ent))

unbounded doc len -0.09 num ent -0.28
learned_5 doc len 0.01 num ent -0.23
learned_10 doc len -0.04 num ent -0.28
learned_20 doc len -0.02 num ent -0.35
lru_5 doc len 0.04 num ent -0.40
lru_10 doc len -0.07 num ent -0.40
lru_20 doc len -0.09 num ent -0.28


In [28]:
### Correlation with the maximum number of simultaneously active chains

In [29]:
# Spearman rank correlation between the per-document score and the document's
# maximum number of simultaneously active chains (first entry of
# doc_key_to_stat[doc_key]), reported per model.
for model, perf_list in model_perf_per_example.items():
    _, _, fscores, examples = zip(*perf_list)
    active_chain_list = [
        doc_key_to_stat[example["doc_key"]][0] for example in examples
    ]

    print('{} Active chains {:.2f}'.format(model, spearmanr(active_chain_list, fscores)[0]))

unbounded Active chains -0.18
learned_5 Active chains -0.44
learned_10 Active chains -0.35
learned_20 Active chains -0.28
lru_5 Active chains -0.24
lru_10 Active chains -0.20
lru_20 Active chains -0.27


In [30]:
# Spearman rank correlation between the per-document score and the total
# number of mentions in the document's k largest clusters, for several k.
# The second entry of doc_key_to_stat[doc_key] holds the cluster sizes in
# descending order, so [:k] selects the k largest. Note this is a sum, not
# an average.
for k in [2, 5, 10]:
    for model, perf_list in model_perf_per_example.items():
        _, _, fscores, examples = zip(*perf_list)
        topk_mention_totals = [
            sum(doc_key_to_stat[example["doc_key"]][1][:k])
            for example in examples
        ]

        print('{} Top-{} clusters {:.2f}'.format(model, k, spearmanr(topk_mention_totals, fscores)[0]))
    print()

unbounded Top-2 clusters 0.62
learned_5 Top-2 clusters 0.54
learned_10 Top-2 clusters 0.61
learned_20 Top-2 clusters 0.65
lru_5 Top-2 clusters 0.61
lru_10 Top-2 clusters 0.66
lru_20 Top-2 clusters 0.63

unbounded Top-5 clusters 0.54
learned_5 Top-5 clusters 0.41
learned_10 Top-5 clusters 0.50
learned_20 Top-5 clusters 0.56
lru_5 Top-5 clusters 0.50
lru_10 Top-5 clusters 0.60
lru_20 Top-5 clusters 0.55

unbounded Top-10 clusters 0.49
learned_5 Top-10 clusters 0.34
learned_10 Top-10 clusters 0.43
learned_20 Top-10 clusters 0.50
lru_5 Top-10 clusters 0.42
lru_10 Top-10 clusters 0.55
lru_20 Top-10 clusters 0.49



In [31]:
# Per model: maximum and mean per-document count of predicted actions labelled
# 'o'. For every model except 'unbounded' the count is capped at the cell
# count encoded in the model name.
# NOTE(review): presumably each 'o' action occupies one memory cell, which
# would make the capped count the number of cells in use -- confirm.
#
# The parsed cell count is stored in `cell_limit` rather than `num_cells`, so
# the module-level `num_cells` list used when loading the logs is not clobbered.
for model, logs in model_to_logs.items():
    mem_usage = []
    for log in logs:
        o_action_count = sum(1 for action in log["pred_actions"] if action[1] == 'o')

        if 'unbounded' not in model:
            cell_limit = int(model.split('_')[-1])
            o_action_count = min(o_action_count, cell_limit)

        mem_usage.append(o_action_count)

    print ('{}, max: {}, avg: {:.1f}'.format(model, max(mem_usage), np.mean(mem_usage)))

unbounded, max: 198, avg: 96.9
learned_5, max: 5, avg: 5.0
learned_10, max: 10, avg: 10.0
learned_20, max: 20, avg: 20.0
lru_5, max: 5, avg: 5.0
lru_10, max: 10, avg: 10.0
lru_20, max: 20, avg: 20.0


In [32]:
# Per model: maximum and mean per-document count of predicted actions labelled
# 'o', capped at the model's cell count for every model except 'unbounded'.
# NOTE(review): presumably this approximates how many memory cells a document
# ends up using -- confirm against the model's action vocabulary.
#
# `cell_limit` (not `num_cells`) holds the cell count parsed from the model
# name, so the module-level `num_cells` list is left intact for re-runs of
# the log-loading cell.
for model, logs in model_to_logs.items():
    mem_usage = []
    for log in logs:
        o_action_count = sum(1 for action in log["pred_actions"] if action[1] == 'o')

        if 'unbounded' not in model:
            cell_limit = int(model.split('_')[-1])
            o_action_count = min(o_action_count, cell_limit)

        mem_usage.append(o_action_count)

    print ('{}, max: {}, avg: {:.1f}'.format(model, max(mem_usage), np.mean(mem_usage)))

unbounded, max: 198, avg: 96.9
learned_5, max: 5, avg: 5.0
learned_10, max: 10, avg: 10.0
learned_20, max: 20, avg: 20.0
lru_5, max: 5, avg: 5.0
lru_10, max: 10, avg: 10.0
lru_20, max: 20, avg: 20.0
