In [1]:
import numpy as np
import re
from scipy import stats
from collections import defaultdict
from examine import parse_bleu, parse_examples, compute_bleu
from functools import partial
from mlrunner.examine import Examiner, latest_log


def prepare_examples(command, path, experiment):
    """Return the parsed examples for ``command``, using the experiment cache.

    The examples are stored in ``experiment.cache`` under the key
    ``"<command>-examples"``. On a cache miss the latest log for
    ``command`` under ``path`` is located with ``latest_log`` and parsed with
    ``parse_examples``; a non-empty result is cached before being returned.

    Returns ``None`` when no log is found or when parsing yields nothing.
    """
    cache_key = "{}-examples".format(command)

    # Fast path: already parsed for this experiment.
    if cache_key in experiment.cache:
        return experiment.cache[cache_key]

    log_file = latest_log(command, path)
    if log_file is None:
        return None

    parsed = parse_examples(log_file)
    if not parsed:
        # Empty results are not cached, so a later call will try again.
        return None

    experiment.cache[cache_key] = parsed
    return parsed


def get_bleu(command, path, experiment, caches):
    """Compute BLEU from the decoded examples of ``command`` and record it.

    Examples come from ``prepare_examples``. For each example the prediction
    is ``e["D"]`` and the reference is ``e["T"]`` when present, otherwise the
    tokens in ``e["C"]`` joined by single spaces. The score returned by
    ``compute_bleu`` is stored in ``experiment.metric[command]``.

    This stays best-effort: if the examples are unavailable, or scoring fails,
    no metric is recorded. A failure is reported through a warning instead of
    being silently discarded, so a blank cell in the results table can be
    traced back to its cause.

    ``caches`` is accepted for signature compatibility and is not used here.
    Always returns ``None``.
    """
    import warnings

    examples = prepare_examples(command, path, experiment)
    if examples is None:
        return
    try:
        preds = [e["D"] for e in examples]
        refs = [e["T"] if "T" in e else " ".join(e["C"]) for e in examples]
        experiment.metric[command] = compute_bleu(preds, refs)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        warnings.warn(
            "get_bleu: could not compute BLEU for {!r} under {!r}: {!r}".format(
                command, path, err
            )
        )


def get_test_bleu(command, path, experiment, caches):
    """Record the BLEU score reported in the latest log of ``command``.

    The newest log for ``command`` under ``path`` is located with
    ``latest_log`` and handed to ``parse_bleu``. When both succeed, the score
    is stored in ``experiment.metric[command]``; otherwise nothing is
    recorded. ``caches`` is unused. Always returns ``None``.
    """
    log_file = latest_log(command, path)
    if log_file is not None:
        score = parse_bleu(log_file)
        if score is not None:
            experiment.metric[command] = score
    return None



# Permutation sensitivity: constrained (Table 2)

In [5]:
# Rand
# with data augmentation: baseX -> augX
# shuffle subwords: sbase -> shuf
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*rand.*base.*")
# Group the "testc" BLEU of each experiment by the setting tag found in its
# name (base0, base2, ..., or sbase).
results = defaultdict(list)
for k, exp in examiner.experiments.items():
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    key = re.findall(r"(base\d|sbase)", k)
    if key and "testc" in exp.metric:
        results[key[0]].append(exp.metric["testc"])
# One row per setting: tag, mean BLEU, standard deviation
# (ndarray.std with its default ddof, i.e. the population std).
for key, value in sorted(results.items()):
    value = np.array(value)
    print("{}\t{:.2f}\t{:.3f}\t".format(key, round(value.mean(), 2), round(value.std(), 3)))

base0	40.45	0.133	
base2	40.86	0.159	
base4	40.73	0.088	
base6	40.73	0.157	
base8	40.94	0.104	
sbase	39.58	0.135	



In [7]:
# no input position embedding
# corresponding to npos
examiner = Examiner()
examiner.add(partial(get_bleu, "testpc"))
examiner.exam(output="output_decode/", regex=".*rand.*base0.*")
# Collect the "testpc" BLEU of every experiment whose name contains "base0".
results = defaultdict(list)
for k, exp in examiner.experiments.items():
    key = re.findall("base0", k)
    if not key or "testpc" not in exp.metric:
        continue
    results[key[0]].append(exp.metric["testpc"])
# Print tag, mean BLEU and standard deviation for each collected tag.
for key, value in results.items():
    value = np.array(value)
    mean_bleu = round(value.mean(), 2)
    std_bleu = round(value.std(), 3)
    print("{}\t{:.2f}\t{:.3f}\t".format(key, mean_bleu, std_bleu))

base0	40.04	0.050	


In [8]:
# Bart
# with data augmentation: baseX -> augX
# shuffle subwords: sbase -> shuf
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*bart.*base.*")
# Group the "testc" BLEU of each experiment by the setting tag found in its
# name (base0, base2, ..., or sbase).
results = defaultdict(list)
for k, exp in examiner.experiments.items():
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    key = re.findall(r"(base\d|sbase)", k)
    if key and "testc" in exp.metric:
        results[key[0]].append(exp.metric["testc"])
# One row per setting: tag, mean BLEU, standard deviation
# (ndarray.std with its default ddof, i.e. the population std).
for key, value in sorted(results.items()):
    value = np.array(value)
    print("{}\t{:.2f}\t{:.3f}\t".format(key, round(value.mean(), 2), round(value.std(), 3)))


base0	56.21	0.185	
base2	56.78	0.110	
base4	56.97	0.164	
base6	56.76	0.180	
base8	56.91	0.155	
sbase	56.45	0.133	


# Permutation sensitivity: unconstrained (Table 6)

In [16]:
# Rand
# with data augmentation: baseX -> augX
# shuffle subwords: sbase -> shuf
examiner = Examiner()
examiner.add(partial(get_test_bleu, "test"))
examiner.exam(output="output_decode/", regex=".*rand.*base.*")
# Group the "test" BLEU (as parsed from the logs by get_test_bleu) of each
# experiment by the setting tag found in its name (base0, ..., or sbase).
results = defaultdict(list)
for k, exp in examiner.experiments.items():
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    key = re.findall(r"(base\d|sbase)", k)
    if key and "test" in exp.metric:
        results[key[0]].append(exp.metric["test"])
# One row per setting: tag, mean BLEU, standard deviation
# (ndarray.std with its default ddof, i.e. the population std).
for key, value in sorted(results.items()):
    value = np.array(value)
    print("{}\t{:.2f}\t{:.3f}\t".format(key, round(value.mean(), 2), round(value.std(), 3)))


base0	39.39	0.151	
base2	39.50	0.118	
base4	40.08	0.091	
base6	39.73	0.053	
base8	40.02	0.152	
sbase	36.52	0.186	


In [15]:
# no input position embedding
# corresponding to npos
examiner = Examiner()
examiner.add(partial(get_test_bleu, "testp"))
examiner.exam(output="output_decode/", regex=".*rand.*base0.*")
# Collect the "testp" BLEU of every experiment whose name contains "base0".
results = defaultdict(list)
for k, exp in examiner.experiments.items():
    key = re.findall("base0", k)
    if not key or "testp" not in exp.metric:
        continue
    results[key[0]].append(exp.metric["testp"])
# Print tag, mean BLEU and standard deviation for each collected tag.
for key, value in sorted(results.items()):
    value = np.array(value)
    mean_bleu = round(value.mean(), 2)
    std_bleu = round(value.std(), 3)
    print("{}\t{:.2f}\t{:.3f}\t".format(key, mean_bleu, std_bleu))


base0	36.98	0.018	


In [17]:
# bart
examiner = Examiner()
examiner.add(partial(get_test_bleu, "test"))
examiner.exam(output="output_decode/", regex=".*bart.*base")
# Group the "test" BLEU (as parsed from the logs by get_test_bleu) of each
# experiment by the setting tag found in its name (baseN or sbase).
results = defaultdict(list)
for k, exp in examiner.experiments.items():
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    key = re.findall(r"(base\d|sbase)", k)
    if key and "test" in exp.metric:
        results[key[0]].append(exp.metric["test"])
# One row per setting: tag, mean BLEU, standard deviation
# (ndarray.std with its default ddof, i.e. the population std).
for key, value in sorted(results.items()):
    value = np.array(value)
    print("{}\t{:.2f}\t{:.3f}\t".format(key, round(value.mean(), 2), round(value.std(), 3)))

# Inject different parts of a dependency tree: Table 3 & Table 4

In [19]:
# Score "testc" with get_bleu and show the result as a table.
# NOTE(review): Examiner comes from mlrunner.examine; presumably exam() runs
# the registered callbacks on every experiment under output_decode/ whose name
# matches the regex, and table() returns one row per experiment — confirm.
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*64.*Split_valid")
examiner.table()

Unnamed: 0,data,max-tokens,mode,testc
0,bart,4096,pos,57.63
1,bart,4096,base,56.14
2,rand,1024,all,90.3
3,rand,1024,ldep,90.27
4,bart,4096,udep,90.51
5,bart,4096,brac,56.64
6,rand,1024,base,40.36
7,rand,1024,brac,40.58
8,rand,1024,pos,42.4
9,bart,4096,all,91.43


# Conditional vs. unconditional modeling: Figure 4

In [20]:
# unconditional
# Score "testc" with get_bleu for experiments whose name matches
# "Mode_ubase-" and show the result as a table.
# NOTE(review): Examiner semantics live in mlrunner.examine and are not
# visible here — confirm how the regex is applied to experiment names.
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*Mode_ubase-.*")
examiner.table()

Unnamed: 0,beam-size,data,max-tokens,testc
0,32,bart,4096,42.85
1,128,bart,1024,47.31
2,512,bart,256,51.08
3,64,bart,2048,45.33
4,32,rand,2048,36.3
5,128,rand,2048,39.25
6,256,rand,1536,40.22
7,1024,rand,1024,42.02
8,2048,bart,1024,54.05
9,4096,rand,1024,


In [21]:
# conditional
# Score "testc" with get_bleu for experiments whose name matches
# "Mode_base-" and show the result as a table.
# NOTE(review): Examiner semantics live in mlrunner.examine and are not
# visible here — confirm how the regex is applied to experiment names.
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*Mode_base-.*")
examiner.table()

Unnamed: 0,beam-size,data,max-tokens,split,testc
0,64,bart,4096,valid,56.14
1,2048,bart,1024,test,
2,5,bart,4096,test,54.7
3,4096,bart,1024,test,
4,10,bart,4096,test,55.59
5,128,rand,2048,test,39.58
6,32,rand,2048,test,39.29
7,256,bart,512,test,56.68
8,1024,bart,1024,test,56.75
9,256,rand,1536,test,39.61


# Partial tree linearization: Table 5

In [22]:
# base
# Score "testc" with get_bleu for experiments whose name matches
# "Mode_part-" or "Mode_bnp-" and show the result as a table.
# NOTE(review): Examiner semantics live in mlrunner.examine and are not
# visible here — confirm how the regex is applied to experiment names.
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*Mode_(part|bnp)-.*")
examiner.table()

Unnamed: 0,data,mode,split,testc
0,rand,part,test5,80.09
1,rand,part,test,42.0
2,rand,part,test8,80.5
3,rand,bnp,test,58.77
4,bart,part,test5,90.84
5,bart,part,test6,57.17
6,rand,bnp,test1,71.95
7,bart,part,test7,73.64
8,rand,part,test1,55.23
9,rand,bnp,test4,72.62


# Graphs

In [30]:
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from multiset import Multiset
# Shared plotting configuration; the figure cells below read these globals.

# Ten hex colours; traces pick their line colour from this list by index.
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]
# Values passed as plotly `marker_symbol`; the cells below use markers[1] and
# markers[3]. Presumably 0=circle, 4=x, 18=hexagram, 2=diamond — TODO confirm
# against plotly's marker symbol table.
markers = [0, 4, 18, 2]
# Directory for exported figures (used by the commented-out write_image
# calls); created up front so saving cannot fail on a missing directory.
output = "plots"
os.makedirs(output, exist_ok=True)

# Errors of unconstrained decoding: Figure 3

In [25]:


def compute_extra_miss(preds, refs):
    """Return the corpus-level ``(extra_rate, missed_rate)`` of predictions.

    Each prediction and its reference are split on whitespace and compared as
    bags of tokens (multiplicity counts):

    * *extra* tokens occur more often in the prediction than in the reference;
    * *missed* tokens occur more often in the reference than in the prediction.

    Both totals are divided by the total number of reference tokens. The extra
    rate is therefore also relative to the reference length, not to the
    generated length.

    Args:
        preds: iterable of predicted sentences (strings).
        refs: iterable of reference sentences (strings), aligned with
            ``preds``; pairs are formed with ``zip``.

    Returns:
        ``(extra / n_ref_tokens, missed / n_ref_tokens)``. When there are no
        reference tokens at all the rates are undefined and ``(nan, nan)`` is
        returned instead of raising ``ZeroDivisionError``.
    """
    # Local import keeps the function independent of notebook cell order.
    from collections import Counter

    extra = 0
    missed = 0
    all_ref = 0
    for p, r in zip(preds, refs):
        p_counts = Counter(p.split())
        r_counts = Counter(r.split())
        # Counter subtraction keeps only positive counts, i.e. the multiset
        # difference.
        extra += sum((p_counts - r_counts).values())
        missed += sum((r_counts - p_counts).values())
        all_ref += sum(r_counts.values())
    if all_ref == 0:
        return float("nan"), float("nan")
    return extra / all_ref, missed / all_ref

def compute_len_ratio(preds, refs):
    """Return the mean of per-sentence length ratios ``len(pred) / len(ref)``.

    Lengths are whitespace-token counts. Pairs are formed with ``zip``. An
    empty input or an empty reference raises ``ZeroDivisionError``.
    """
    per_sentence = [
        len(pred.split()) / len(ref.split())
        for pred, ref in zip(preds, refs)
    ]
    return sum(per_sentence) / len(per_sentence)

def get_l2metrics(examples):
    """Compute error rates and length ratios per reference-length bucket.

    An example's length is ``len(e['C'])`` when it has a ``'C'`` entry,
    otherwise the number of whitespace tokens in ``e['T']``. Examples are
    grouped into buckets of width five (``length // 5``); every length of 50
    or more falls into bucket 10.

    For each non-empty bucket the predictions (``e["D"]``) are scored against
    the references (``e["T"]`` if present, else ``" ".join(e["C"])``) with
    ``compute_extra_miss`` and ``compute_len_ratio``.

    Returns:
        Three lists — extra rates, miss rates, length ratios — ordered by
        ascending bucket index. Buckets with no examples are absent from the
        lists.
    """
    buckets = defaultdict(list)
    for example in examples:
        if 'C' in example:
            length = len(example['C'])
        else:
            length = len(example["T"].split())
        buckets[min(length // 5, 10)].append(example)

    scores = {}
    for bucket, members in buckets.items():
        preds = [m["D"] for m in members]
        refs = [m["T"] if "T" in m else " ".join(m["C"]) for m in members]
        extra, miss = compute_extra_miss(preds, refs)
        scores[bucket] = (extra, miss, compute_len_ratio(preds, refs))

    ordered = [scores[bucket] for bucket in sorted(scores)]
    return (
        [row[0] for row in ordered],
        [row[1] for row in ordered],
        [row[2] for row in ordered],
    )



In [28]:
# Score "test" with get_bleu for experiments whose name matches
# "_base-Beam_5" and show the result as a table.
# get_bleu fills each experiment's cache with a "test-examples" entry (via
# prepare_examples); the plotting cell below reads that entry back.
# NOTE(review): Examiner semantics live in mlrunner.examine and are not
# visible here — confirm how the regex is applied to experiment names.
examiner = Examiner()
examiner.add(partial(get_bleu, "test"))
examiner.exam(output="output_decode/", regex=".*_base-Beam_5.*")
examiner.table()

Unnamed: 0,beam-size,data,max-tokens,test
0,5,bart,4096,54.29
1,5,rand,2048,38.53
2,512,rand,1024,39.04
3,512,bart,256,54.84


In [32]:


# For each (model, beam) pair: print corpus-level missing / redundant token
# rates and the mean length ratio, then plot the same three metrics per
# output-length bucket (error rates on the left axis, length ratio on the right).
key = "Data_{}-Mode_base-Beam_{}-Split_test"
# Bucket labels for the x axis, one per bucket returned by get_l2metrics.
# NOTE(review): get_l2metrics buckets by length // 5 and omits empty buckets,
# so these labels assume all eleven buckets are populated — verify.
x = [' 1-5 ', ' 6-10', '11-15', '16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '50-  ']
for model in ["rand", "bart"]:
    for beam in [5, 512]:
        fig = make_subplots(specs=[[{"secondary_y": True}]])
        # Examples cached by get_bleu("test") when the examiner was run.
        examples = examiner.experiments[key.format(model, beam)].cache['test-examples']
        preds = [e["D"] for e in examples]
        refs = [e["T"] if "T" in e else " ".join(e["C"]) for e in examples]
        ratio = compute_len_ratio(preds, refs)
        extra, miss = compute_extra_miss(preds, refs)
        # "\\%" prints a literal backslash-percent, exactly as before; a bare
        # "\%" in a non-raw literal is an invalid escape sequence.
        print("{}, {}: miss {}\\%, extra {}\\%, ratio {}.".format(model, beam, round(miss*100, 2), round(extra*100, 2), round(ratio, 3)) )
        # Per-bucket metrics; kept separate from the scalar `ratio` above.
        extras, misses, ratios = get_l2metrics(examples)

        fig.add_trace(go.Scatter(x=x, y=misses, name="missing", line=dict(color=colors[0]), marker_symbol=markers[1], marker_size=8))
        fig.add_trace(go.Scatter(x=x, y=extras, name="redundant", line=dict(color=colors[1], dash='dot'), marker_symbol=markers[1], marker_size=8))
        # The length ratio is drawn against the secondary (right) y axis.
        fig.add_trace(go.Scatter(x=x, y=ratios, name="length ratio", line=dict(color=colors[2], dash='dash'), marker_symbol=markers[1], marker_size=8, yaxis="y2"))

        fig.update_layout(font_size=14)
        fig.update_layout(xaxis=dict(title=dict(text='Output length', standoff=3, font_family="Serif"), showgrid=True, showline=True, tickfont=dict(size=13, family="Serif")),
                          yaxis=dict(title=dict(text='Error rate', standoff=3, font_family="Serif"), tickfont=dict(family="Serif"), tickformat=".2f", showgrid=True, showline=True, range=[0, 0.14]),
                          yaxis2=dict(title=dict(text='Length Ratio', standoff=3, font_family="Serif"), tickfont=dict(family="Serif"), tickformat=".2f", showgrid=True, showline=True, range=[0.90, 1.04], anchor="x", overlaying="y", side="right")
                          )

        fig.update_layout(legend=dict(yanchor="top", xanchor="left", y=1, x=0.03, bgcolor='rgba(0,0,0,0)'))
        fig.update_layout(template="simple_white", width=600, height=400, margin=dict(l=5, r=5, t=5, b=5))
        fig.show()
        # fig.write_image(os.path.join(output, "unconstrained_{}_beam{}.pdf".format(model, beam)))

rand, 5: miss 5.74\%, extra 3.3\%, ratio 0.981.


rand, 512: miss 5.42\%, extra 3.02\%, ratio 0.982.


bart, 5: miss 5.3\%, extra 4.16\%, ratio 0.99.


bart, 512: miss 4.89\%, extra 3.85\%, ratio 0.991.


# Simulate unconditional modeling: Figure 4

In [38]:
examiner = Examiner()
examiner.add(partial(get_bleu, "testc"))
examiner.exam(output="output_decode/", regex=".*Mode_[u]?base-Beam.*test")
# Results table ordered by data, then mode, then beam size; the figure cell
# below depends on this ordering when it builds each curve.
df = examiner.table().sort_values(["data", "mode", "beam-size"])
df

Unnamed: 0,beam-size,data,max-tokens,mode,testc
4,5,bart,4096,base,54.7
6,10,bart,4096,base,55.59
25,32,bart,4096,base,56.33
37,64,bart,2048,base,56.38
33,128,bart,1024,base,56.63
16,256,bart,512,base,56.68
38,512,bart,256,base,56.63
18,1024,bart,1024,base,56.75
3,2048,bart,1024,base,
5,4096,bart,1024,base,


In [46]:
# Plot BLEU against beam size, one curve per (model, mode) combination.
# Each row of `df` becomes a plain tuple and is indexed by position below.
# Per the table displayed above the columns are
# (beam-size, data, max-tokens, mode, testc), so r[0] is the beam size,
# r[1] the model, r[3] the mode and r[-1] the BLEU score.
# NOTE(review): positional indexing breaks silently if the column order of
# examiner.table() changes — confirm or switch to column names.
results = [tuple(e) for e in df.iloc]
fig = go.Figure()
# Legend names for the two decoding modes.
mapping = {
    "base": "cond",
    "ubase": "uncond"
}
# The marker shape distinguishes the mode; colour and dash distinguish the model.
for mode, marker in zip(['base', 'ubase'], [markers[1], markers[3]]):
    for model, dash, color  in zip(['bart', 'rand'], [None, "dot"], colors):
        x = [r[0] for r in results if r[1] == model and r[3] == mode]
        y = [r[-1] for r in results if r[1] == model and r[3] == mode]
        # Only the first eight points of each curve are drawn.
        fig.add_trace(go.Scatter(x=x[:8], y=y[:8], name="{}-{}".format(model.upper(), mapping[mode]), line=dict(color=color, dash=dash), marker_symbol=marker, marker_size=8))
fig.update_layout(font_size=14)
# NOTE(review): `tickvals=x` uses the `x` left over from the last loop
# iteration (mode "ubase", model "rand"), untruncated — confirm that these are
# the intended tick positions for all four curves.
fig.update_layout(xaxis=dict(title=dict(text='Beam size', standoff=3, font_family="Serif"), type="log", showgrid=True, showline=True,
                             tickfont=dict(family="Serif"), tickvals=x),
                  yaxis=dict(title=dict(text='BLEU', standoff=3, font_family="Serif"), tickfont=dict(family="Serif"), tickformat=".0f", showgrid=True, showline=True,
                             range=[25, 65]
                             ),
                  )
fig.update_layout(legend=dict(yanchor="top", xanchor="left", y=1, x=0.03, bgcolor='rgba(0,0,0,0)',orientation="h"))
fig.update_layout(template="simple_white", width=600, height=400, margin=dict(l=5, r=5, t=5, b=5))
fig.show()
# fig.write_image(os.path.join(output, "unconditional.pdf"))
