## Preliminaries
loading stuff, defining helpful functions etc.

n.b: code repetition etc not representative, because in a more-or-less throwaway ipynb. 

In [1]:
import json


def load_jsonl(f, max_l=-1):
    with open(f) as fh:
        lines = fh.readlines()
    return [json.loads(s) for s in (lines if max_l < 0 else lines[:max_l])]

In [2]:
def get_used_vocab_size(data):
    return list(dict.fromkeys(word for d in data for word in d['message']).keys())


In [3]:
import numpy as np

data_small = load_jsonl('exp1.5-best-diag.jsonl')

seen_small = np.loadtxt('egg/zoo/basic_games/data_generation_scripts/exp1.5-train-l300-r0.25-s42.txt', dtype=int)
unseen_small = np.loadtxt('egg/zoo/basic_games/data_generation_scripts/exp1.5-eval-l100-r0.25-s42.txt', dtype=int)

for d in data_small:
    d['input'] = (d['input'][1], d['input'][0])
    if any((x, y) == d['input'] for x, y in seen_small):
        d['seen'] = True
    else:
        d['seen'] = False

used_vocab_small = get_used_vocab_size(data_small)
normalised_vocab_small = dict((u, str(i)) for i, u in enumerate(u for u in used_vocab_small if u != 0))
normalised_vocab_small[0] = '.'


In [4]:
data_large = load_jsonl('exp3-best-diag.jsonl')

seen_large = np.loadtxt('egg/zoo/basic_games/data_generation_scripts/exp3-train-l900-r0.25-s42.txt', dtype=int)
unseen_large = np.loadtxt('egg/zoo/basic_games/data_generation_scripts/exp3-eval-l300-r0.25-s42.txt', dtype=int)

for d in data_large:
    d['input'] = (d['input'][1], d['input'][0])
    if any((x, y) == d['input'] for x, y in seen_large):
        d['seen'] = True
    else:
        d['seen'] = False

used_vocab_large = get_used_vocab_size(data_large)
normalised_vocab_large = dict((u, str(i)) for i, u in enumerate(u for u in used_vocab_large if u != 0))
normalised_vocab_large[0] = '.'

In [5]:
def normalise(message, vocab):
    return ' '.join(vocab[m] for m in message)

Some helpful grouper functions

In [6]:
import tabulate
from operator import itemgetter
from itertools import groupby


def group_by_sum(data):
    key = itemgetter('label')
    grouped = groupby(sorted(data, key=key), key=key)
    grouped_by = {}
    for label, group in grouped:
        grouped_by[label] = list(group)
    return grouped_by


def group_by_summand(data):
    max_summand = max(d['input'][0] for d in data)
    grouped_by = {}
    for i in range(max_summand):
        label = i
        group = [d for d in data if label in d['input']]
        grouped_by[label] = list(group)
    return grouped_by

## Analysis

Correlation between performance on training and evaluation data.

We see moderate correlation between performance on the training/evaluation set broken down by
label (sum)
and no statistically significant correlation when grouped by numbers appearing in input (summand).

In [7]:
from scipy.stats import pearsonr


def correlation_seen_unseen(groups):
    accs_train = []
    accs_test = []
    for label, group in groups.items():
        num_seen = sum(d['seen'] for d in group)
        num_unseen = sum(not d['seen'] for d in group)
        if num_seen and num_unseen:
            accs_train.append(sum(d['correct'] for d in group if d['seen']) / num_seen)
            accs_test.append(sum(d['correct'] for d in group if not d['seen']) / num_unseen)
    return pearsonr(accs_train, accs_test)


print("Grouped by label:")
corr, p_val = correlation_seen_unseen(group_by_sum(data_small))
print(f"Correlation, small ds: {corr:.2f}, P-Value, {p_val:.2f}")
corr, p_val = correlation_seen_unseen(group_by_sum(data_large))
print(f"Correlation, large ds: {corr:.2f}, P-Value, {p_val:.2f}")

print("Grouped by input:")
corr, p_val = correlation_seen_unseen(group_by_summand(data_small))
print(f"Correlation small ds: {corr:.2f}, P-Value, {p_val:.2f}")
corr, p_val = correlation_seen_unseen(group_by_summand(data_large))
print(f"Correlation, large ds: {corr:.2f}, P-Value, {p_val:.2f}")

Grouped by label:
Correlation, small ds: 0.46, P-Value, 0.01
Correlation, large ds: 0.58, P-Value, 0.00
Grouped by input:
Correlation small ds: 0.26, P-Value, 0.27
Correlation, large ds: 0.18, P-Value, 0.27


Used vocabulary. The larger dataset uses more tokens in the vocabulary. This appears reasonable,
given the increased dataset size with the same fixed message length (5 + <eos> in both cases).

In [8]:
print(len(normalised_vocab_small))
print(len(normalised_vocab_large))

7
11


Symmetry: Investigation of behaviour on input pairs of the form  $((x,y),(y,x))$

In [9]:
def group_by_symmetry(data):
    groups = {}
    for d in data:
        x, y = d['input']
        if x != y:
            if (x, y) in groups:
                print((x, y))
                print(groups)
                raise ValueError("wat")
            if (y, x) in groups:
                groups[(y, x)].append(d)
            else:
                groups[(x, y)] = [d]
    return groups

For the small dataset: how many pairs were consistently predicted correctly/incorrectly and
how many were not predicted consistently?

In [10]:
groups_small = group_by_symmetry(data_small)
assert len(groups_small) == 190
print("Both pairs predicted correctly:",
      sum(d1['correct'] == d2['correct'] == True for _, (d1, d2) in groups_small.items()))
print("Both pairs predicted incorrectly:",
      sum(d1['correct'] == d2['correct'] == False for _, (d1, d2) in groups_small.items()))
print("pairs predicted inconsinstently:", sum(d1['correct'] != d2['correct'] for _, (d1, d2) in groups_small.items()))

Both pairs predicted correctly: 138
Both pairs predicted incorrectly: 15
pairs predicted inconsinstently: 37


For the large dataset: how many pairs were consistently predicted correctly/incorrectly and
how many were not predicted consistently?

The large dataset is different from the small dataset in the sense that it does not
contain all possible input pairs up to $n_{max}$. This means that there are input pairs which
symmetric counter-pairs are not contained in the dataset. Hence the split in `symmetrics` and
`asymmetrics`, i.e. pairs with/without counter-parts.

In [11]:
groups_large = group_by_symmetry(data_large)
large_symmetrics = {k: v for k, v in groups_large.items() if len(v) == 2}
large_asymmetrics = {k: v for k, v in groups_large.items() if len(v) == 1}
print("Both pairs predicted correctly:",
      sum(d1['correct'] == d2['correct'] == True for _, (d1, d2) in large_symmetrics.items()))
print("Both pairs predicted incorrectly:",
      sum(d1['correct'] == d2['correct'] == False for _, (d1, d2) in large_symmetrics.items()))
print("pairs predicted inconsinstently:",
      sum(d1['correct'] != d2['correct'] for _, (d1, d2) in large_symmetrics.items()))

Both pairs predicted correctly: 263
Both pairs predicted incorrectly: 46
pairs predicted inconsinstently: 124


Here, we build a three-by-three table for the symmetric pairs, containing the following
information:

Seen, unseen and 50/50 denote whether the pairs were exclusively in training/eval sets or split
between both. Similarly, Correct, wrong and 50/50 means whether both pairs were predicted
correctly, wrongly or exactly one was predicted correctly.

In [12]:
from IPython.display import HTML, display


def three_by_three_table(groups):
    table = np.zeros((3, 3))
    for label, (d1, d2) in groups.items():
        x = 0 if d1['seen'] == d2['seen'] == True else 1 if d1['seen'] == d2['seen'] == False else 2
        y = 0 if d1['correct'] == d2['correct'] == True else 1 if d1['correct'] == d2['correct'] == False else 2
        table[x, y] += 1
    return table


rows = iter(['Both seen', 'Both unseen', '50/50'])
print('For the small dataset:')
tabulate.tabulate(map(lambda x: [next(rows)] + x, three_by_three_table(groups_small).tolist()),
                  headers=["Both Correct", "Both Wrong", "50/50"],
                  tablefmt='html')

For the small dataset:


Unnamed: 0,Both Correct,Both Wrong,50/50
Both seen,99,6,8
Both unseen,2,4,8
50/50,37,5,21


In [13]:
rows = iter(['Both seen', 'Both unseen', '50/50'])
print('For the large dataset:')
tabulate.tabulate(map(lambda x: [next(rows)] + x, three_by_three_table(large_symmetrics).tolist()),
                  headers=["Both Correct", "Both Wrong", "50/50"], tablefmt='html')

For the large dataset:


Unnamed: 0,Both Correct,Both Wrong,50/50
Both seen,185,19,37
Both unseen,7,7,7
50/50,71,20,80


Same as above, but for asymmetric inputs that do not have a symmetric counter-part.
Naturally, only calculated for the large dataset.

In [14]:
def two_by_two_table(groups):
    table = np.zeros((2, 2))
    for label, [d1] in groups.items():
        x = 0 if d1['seen'] else 1
        y = 0 if d1['correct'] else 1
        table[x, y] += 1
    return table


rows = iter(['Seen', 'Unseen'])
tabulate.tabulate(map(lambda x: [next(rows)] + x, two_by_two_table(large_asymmetrics).tolist()),
                  headers=["Correct", "Wrong"], tablefmt='html')

Unnamed: 0,Correct,Wrong
Seen,179,47
Unseen,28,54


The large dataset gives us the opportunity to compare the performance on inputs where
the symmetric counterpart was observed during training and where it is not possible, because
the symmetric input was not part of the dataset.
The output of this cell describes the following contingency table:

where 50/50 means that two symmetric inputs are split between train/evaluation splits and unseen means
that an input is in the evaluation set. Correct and Wrong denotes whether the predictions for the
unseen example is correct.

In [15]:
def two_by_two_table_sym_asym(sym_groups, asym_groups, ignore_seen_wrong=False):
    fifty_fifties = [(d1, d2) for (_, (d1, d2)) in sym_groups.items() if d1['seen'] != d2['seen']]
    sym_correct = 0
    sym_wrong = 0
    asym_correct = 0
    asym_wrong = 0
    for d1, d2 in fifty_fifties:
        seen, unseen = d1 if d1['seen'] else d2, d2 if d1['seen'] else d1
        #print(seen['correct'])
        #print(unseen['correct'])
        assert seen['seen']
        assert not unseen['seen']
        if not ignore_seen_wrong:
            #print('unseen correct', int(unseen['correct']))
            sym_correct += unseen['correct']
        elif seen['correct'] and unseen['correct']:
            #print('unseen correct')
            sym_correct += 1
        if seen['correct'] and not unseen['correct']:
            #print('unseen wrong')
            sym_wrong += 1

    for label, [d1] in asym_groups.items():
        if not d1['seen']:
            asym_correct += d1['correct']
            asym_wrong += not d1['correct']
    return [[sym_correct, sym_wrong], [asym_correct, asym_wrong]]


rows = iter(['50/50 (symmetric)', 'Unseen (asymmetric)'])
tabulate.tabulate(map(lambda x: [next(rows)] + x, two_by_two_table_sym_asym(large_symmetrics, large_asymmetrics)),
                  headers=["Unseen Correct", "Unseen Wrong"], tablefmt='html')

Unnamed: 0,Unseen Correct,Unseen Wrong
50/50 (symmetric),80,71
Unseen (asymmetric),28,54


This allows us to perform Fisher's exact test to investigate whether the results are
statistically significant. We see ($p<=0.05$), that the networks have higher prediction
performance on inputs where the symmetric counter-parts were observed in training before.

In [16]:
from scipy.stats import fisher_exact

_, p_value = fisher_exact(two_by_two_table_sym_asym(large_symmetrics, large_asymmetrics, ignore_seen_wrong=False),
                          alternative='greater')
print(f"P-Value: {p_value:.3f}")

P-Value: 0.004


## Synonyms

In [17]:
def get_num_synonyms(groups, seen_only=False, size_only=True):
    synonym_groups = {}
    for label, group in groups.items():
        examples = [d for d in group if d['correct']]
        if seen_only:
            examples = [d for d in examples if d['seen']]
        if examples:
            synonym_groups[label] = set(tuple(d['message']) for d in examples)
            if size_only:
                synonym_groups[label] = len(synonym_groups[label])
    return synonym_groups

In [18]:
import math
from scipy.stats import t


def get_mean_var_ci(sample, alpha=0.025):
    sample = np.array(list(sample))
    t_ci = t.ppf(1 - alpha, df=len(sample) - 1)
    return sample.mean(), sample.var(), t_ci * sample.std() / math.sqrt(len(sample))

Average number of synonymous messages (that led to correct predictions) in the whole dataset.

In [19]:
syns_small = get_num_synonyms(group_by_sum(data_small))
syns_large = get_num_synonyms(group_by_sum(data_large))
mean, var, ci = get_mean_var_ci(syns_small.values())
print(f"avg # synonyms in small ds: {mean:.2f}  +/- {ci:.2f}")
mean, var, ci = get_mean_var_ci(syns_large.values())
print(f"avg # synonyms in large ds: {mean:.2f}  +/- {ci:.2f}")

avg # synonyms in small ds: 1.59  +/- 0.37
avg # synonyms in large ds: 1.63  +/- 0.24


Average number of synonymous messages (that led to correct predictions) in the training set only.
Interestingly the number is somewhat lower (but not statistically significant at $p=0.05$).
It's interesting, because what it means is that some messages were produced by the sender
that were not observed by the receiver during
training, but the receiver was still able to produce the correct prediction.

In [20]:
syns_small_seen = get_num_synonyms(group_by_sum(data_small), seen_only=True)
syns_large_seen = get_num_synonyms(group_by_sum(data_large), seen_only=True)
mean, var, ci = get_mean_var_ci(syns_small_seen.values())
print(f"avg # synonyms in small train data: {mean:.2f}  +/- {ci:.2f}")
mean, var, ci = get_mean_var_ci(syns_large_seen.values())
print(f"avg # synonyms in large train data: {mean:.2f}  +/- {ci:.2f}")

from scipy.stats import ttest_rel

_, p_value = ttest_rel(list(syns_small.values()), list(syns_small_seen.values()))
print("P-Value for small dataset: ", p_value)
_, p_value = ttest_rel(list(syns_large.values()), list(syns_large_seen.values()))
print("P-Value for large dataset: ", p_value)

avg # synonyms in small train data: 1.50  +/- 0.29
avg # synonyms in large train data: 1.59  +/- 0.24
P-Value for small dataset:  0.08309875128247367
P-Value for large dataset:  0.15906635012795697


There is no correlation between predictive performance and the number of synonyms when grouped
by the label (sum of inputs), for either dataset.

In [21]:
def correlation_num_synonyms_correct(groups, test_only=False):
    num_syns = []
    accs_test = []
    for label, group in groups.items():
        num_seen = sum(d['seen'] for d in group)
        num_unseen = sum(not d['seen'] for d in group)
        if num_seen and num_unseen:
            num_synonyms = get_num_synonyms({label: group}).get(label, None)
            if num_synonyms is not None:
                num_syns.append(num_synonyms)
                if test_only:
                    accs_test.append(sum(d['correct'] for d in group if not d['seen']) / num_unseen)
                else:
                    accs_test.append(sum(d['correct'] for d in group) / len(group))
    return pearsonr(num_syns, accs_test)


print(correlation_num_synonyms_correct(group_by_sum(data_small)))
print(correlation_num_synonyms_correct(group_by_sum(data_large)))

(0.10766526492862784, 0.5782712044403318)
(0.1076507466077131, 0.42969492445549634)


## Misc

There is a moderate correlation between the number of training pairs for a sum and the capability
to learn that sum, for the large dataset this correlation persists also for inputs unseen during
training.

In [22]:
def correlation_by_train_size(groups, test_only=False):
    train_set_sizes = []
    accs_test = []
    for label, group in groups.items():
        num_seen = sum(d['seen'] for d in group)
        num_unseen = sum(not d['seen'] for d in group)
        if num_seen and (num_unseen or not test_only):
            train_set_sizes.append(num_seen)
            if test_only:
                accs_test.append(sum(d['correct'] for d in group if not d['seen']) / num_unseen)
            else:
                accs_test.append(sum(d['correct'] for d in group) / len(group))
    return pearsonr(train_set_sizes, accs_test)


corr, p_val = correlation_by_train_size(group_by_sum(data_small))
print(f"Correlation, small ds: {corr:.2f}, P-Value, {p_val:.3f}")

corr, p_val = correlation_by_train_size(group_by_sum(data_large))
print(f"Correlation, big ds: {corr:.2f}, P-Value, {p_val:.3f}")

Correlation, small ds: 0.52, P-Value, 0.001
Correlation, big ds: 0.70, P-Value, 0.000


In [23]:
corr, p_val = correlation_by_train_size(group_by_sum(data_small), test_only=True)
print(f"Correlation, small ds: {corr:.2f}, P-Value, {p_val:.3f}")

corr, p_val = correlation_by_train_size(group_by_sum(data_large), test_only=True)
print(f"Correlation, big ds: {corr:.2f}, P-Value, {p_val:.3f}")

Correlation, small ds: 0.24, P-Value, 0.177
Correlation, big ds: 0.46, P-Value, 0.000


The average edit distance between synonymous messages (that led to correct predictions) is around 2,
which corresponds to e.g. flipping `[a, b]` to `[b, a]`. This largely corresponds to anecdotal
observations (see end of notebook).


In [24]:
import textdistance
import itertools


def get_avg_edit_distance_synonyms(groups):
    distances = []
    for label, synonyms in groups.items():
        for x, y in ((m1, m2) for m1, m2 in itertools.product(synonyms, repeat=2) if m1 != m2):
            distances.append(textdistance.levenshtein.distance(x, y))
    return distances


synonyms_small = get_num_synonyms(group_by_sum(data_small), size_only=False)
mean, var, ci = get_mean_var_ci(get_avg_edit_distance_synonyms(synonyms_small))
print(f"avg edit distance for synonyms, small ds: {mean:.2f}  +/- {ci:.2f}")
mean, var, ci = get_mean_var_ci(
    get_avg_edit_distance_synonyms(get_num_synonyms(group_by_sum(data_large), size_only=False)))
print(f"avg edit distance for synonyms, large ds: {mean:.2f}  +/- {ci:.2f}")

avg edit distance for synonyms, small ds: 2.47  +/- 0.36
avg edit distance for synonyms, large ds: 2.02  +/- 0.18


Average distance between messages of next higher sum.

In [49]:
def get_distances(syn_groups):
    #print(syn_groups)
    keys = sorted(list(syn_groups.keys()))
    #print(keys)
    results = []
    for g1, g2 in ((syn_groups.get(k,[]), syn_groups.get(k+1,[])) for k in keys):
        try:
            min_distance = min(textdistance.levenshtein.distance(m1, m2) for m1 in g1 for m2 in g2)
            results.append(min_distance)
        except ValueError:
            pass
    return results


distances_small = get_distances(get_num_synonyms(group_by_sum(data_small), False, size_only=False))
mean, var, ci = get_mean_var_ci(distances_small)
print(f"avg minimum edit distance between messages of two sums differing in at most 1, small ds: {mean:.2f}  +/- {ci:.2f}")
distances_large = get_distances(get_num_synonyms(group_by_sum(data_large), False, size_only=False))
mean, var, ci = get_mean_var_ci(distances_large)
print(f"avg minimum edit distance between messages of two sums differing in at most 1, small ds: {mean:.2f}  +/- {ci:.2f}")

avg minimum edit distance between messages of two sums differing in at most 1, small ds: 1.50  +/- 0.26
avg minimum edit distance between messages of two sums differing in at most 1, small ds: 1.42  +/- 0.20


The average distance between the expected label and the predicted label is 1 for those examples
that were not predicted correctly.

In [26]:
def get_errors(groups, test_only=False):
    errors = []
    for label, group in groups.items():
        if test_only:
            group = [d for d in group if not d['seen']]
        errors.extend(abs(label - d['output']) for d in group if not d['correct'])
    return errors


mean, var, ci = get_mean_var_ci(get_errors(group_by_sum(data_small)))
print(f"avg error: {mean:.2f}  +/- {ci:.2f}")
mean, var, ci = get_mean_var_ci(get_errors(group_by_sum(data_large)))
print(f"avg error: {mean:.2f}  +/- {ci:.2f}")

avg error: 1.07  +/- 0.06
avg error: 1.21  +/- 0.06


## Visualisation

In [27]:

def inspect_by_sum(data, vocab):
    for label, group in group_by_sum(data).items():
        seen = [d for d in group if d['seen']]
        unseen = [d for d in group if not d['seen']]
        print(f"{label}: {len(group)} examples")
        print(f"seen: {sum(d['correct'] for d in seen)}/{len(seen)}")
        print(
            tabulate.tabulate([(s['input'], normalise(s['message'], vocab), s['correct'], s['output']) for s in seen]))
        print(f"unseen: {sum(d['correct'] for d in unseen)}/{len(unseen)}")
        #print(f"unseen: {sum(d['correct'] for d in unseen)}/{len(unseen)}")
        print(tabulate.tabulate(
            [(s['input'], normalise(s['message'], vocab), s['correct'], s['output']) for s in unseen]))
        print("----" * 20)

In [28]:
def inspect_by_summand(data, vocab):
    for label, group in group_by_summand(data).items():
        seen = [d for d in group if d['seen']]
        unseen = [d for d in group if not d['seen']]
        print(f"{label}: {len(group)} examples")
        print(f"seen: {sum(d['correct'] for d in seen)}/{len(seen)}")
        print(
            tabulate.tabulate([(s['input'], normalise(s['message'], vocab), s['correct'], s['output']) for s in seen]))
        print(f"unseen: {sum(d['correct'] for d in unseen)}/{len(unseen)}")
        #print(f"unseen: {sum(d['correct'] for d in unseen)}/{len(unseen)}")
        print(tabulate.tabulate(
            [(s['input'], normalise(s['message'], vocab), s['correct'], s['output']) for s in unseen]))
        print("----" * 20)


Grouping and visualising the datasets by label (sum).

In [29]:
print("Small ds")
inspect_by_sum(data_small, normalised_vocab_small)

Small ds
0: 1 examples
seen: 0/1
------  -----------  -----  -
(0, 0)  0 0 0 0 0 .  False  1
------  -----------  -----  -
unseen: 0/0

--------------------------------------------------------------------------------
1: 2 examples
seen: 2/2
------  -----------  ----  -
(0, 1)  0 0 0 0 0 .  True  1
(1, 0)  0 0 0 0 0 .  True  1
------  -----------  ----  -
unseen: 0/0

--------------------------------------------------------------------------------
2: 3 examples
seen: 0/1
------  -----------  -----  -
(2, 0)  0 0 0 0 0 .  False  1
------  -----------  -----  -
unseen: 0/2
------  -----------  -----  -
(0, 2)  0 0 0 0 0 .  False  1
(1, 1)  0 0 0 0 1 .  False  3
------  -----------  -----  -
--------------------------------------------------------------------------------
3: 4 examples
seen: 4/4
------  -----------  ----  -
(0, 3)  0 0 0 0 1 .  True  3
(1, 2)  0 0 0 0 1 .  True  3
(2, 1)  0 0 0 0 1 .  True  3
(3, 0)  0 0 0 0 1 .  True  3
------  -----------  ----  -
unseen: 0/0

-----------

In [30]:
print("Large ds")
inspect_by_sum(data_large, normalised_vocab_large)

Large ds
0: 1 examples
seen: 0/1
------  -----------  -----  -
(0, 0)  0 . 0 0 . .  False  2
------  -----------  -----  -
unseen: 0/0

--------------------------------------------------------------------------------
1: 2 examples
seen: 0/1
------  -----------  -----  -
(0, 1)  0 . 0 0 . .  False  2
------  -----------  -----  -
unseen: 0/1
------  -----------  -----  -
(1, 0)  0 . 0 0 . .  False  2
------  -----------  -----  -
--------------------------------------------------------------------------------
2: 3 examples
seen: 2/2
------  -----------  ----  -
(0, 2)  0 . 0 0 . .  True  2
(1, 1)  0 . 0 0 . .  True  2
------  -----------  ----  -
unseen: 0/1
------  -----------  -----  -
(2, 0)  0 0 . 0 0 .  False  7
------  -----------  -----  -
--------------------------------------------------------------------------------
3: 3 examples
seen: 0/1
------  -----------  -----  -
(1, 2)  0 . 0 0 . .  False  2
------  -----------  -----  -
unseen: 0/2
------  -----------  -----  -
(0, 3) 

Grouping and visualising the datasets by summand appearing in input.

In [31]:
print("Small ds")
inspect_by_summand(data_small, normalised_vocab_small)

Small ds
0: 39 examples
seen: 23/30
-------  -----------  -----  --
(0, 0)   0 0 0 0 0 .  False   1
(0, 1)   0 0 0 0 0 .  True    1
(0, 10)  0 1 0 0 1 .  True   10
(0, 11)  0 1 0 1 0 .  True   11
(0, 12)  0 1 0 1 1 .  True   12
(0, 13)  1 0 1 0 0 .  False  15
(0, 14)  1 0 1 1 0 .  False  15
(0, 15)  1 0 1 0 0 .  True   15
(0, 17)  1 1 0 1 0 .  True   17
(0, 19)  1 1 1 2 3 .  True   19
(0, 3)   0 0 0 0 1 .  True    3
(0, 4)   0 0 0 0 1 .  False   3
(0, 5)   0 0 0 1 0 .  True    5
(0, 7)   0 0 1 0 0 .  True    7
(0, 8)   0 0 1 0 1 .  True    8
(1, 0)   0 0 0 0 0 .  True    1
(2, 0)   0 0 0 0 0 .  False   1
(3, 0)   0 0 0 0 1 .  True    3
(4, 0)   0 0 0 0 1 .  False   3
(5, 0)   0 0 0 1 0 .  True    5
(7, 0)   0 0 1 0 0 .  True    7
(8, 0)   0 0 1 0 1 .  True    8
(9, 0)   0 0 1 1 0 .  True    9
(11, 0)  0 1 0 1 0 .  True   11
(13, 0)  0 1 1 0 0 .  True   13
(14, 0)  0 1 1 1 0 .  True   14
(16, 0)  1 0 1 1 0 .  False  15
(17, 0)  1 1 0 1 0 .  True   17
(18, 0)  1 1 1 0 1 .  True   18
(19,

In [32]:
print("Large ds")
inspect_by_summand(data_large, normalised_vocab_large)

Large ds
0: 58 examples
seen: 34/43
-------  -----------  -----  --
(0, 0)   0 . 0 0 . .  False   2
(0, 1)   0 . 0 0 . .  False   2
(0, 10)  1 0 0 0 0 .  False  12
(0, 12)  1 0 0 0 0 .  True   12
(0, 13)  1 1 0 0 0 .  True   13
(0, 14)  1 1 1 1 1 .  True   14
(0, 15)  1 1 1 1 1 .  False  14
(0, 17)  2 1 0 0 0 .  True   17
(0, 18)  2 1 1 0 0 .  True   18
(0, 19)  2 1 1 1 0 .  True   19
(0, 2)   0 . 0 0 . .  True    2
(0, 22)  2 1 1 3 1 .  True   22
(0, 24)  2 1 3 4 0 .  True   24
(0, 26)  2 3 4 0 0 .  True   26
(0, 28)  2 3 4 1 1 .  True   28
(0, 29)  2 3 4 2 1 .  True   29
(0, 30)  2 3 4 2 4 .  True   30
(0, 32)  2 3 4 3 4 .  True   32
(0, 34)  2 3 5 6 3 .  True   34
(0, 35)  2 3 5 6 5 .  True   35
(0, 37)  2 3 5 7 5 .  True   37
(0, 38)  2 3 5 7 7 .  True   38
(0, 5)   0 0 . 0 0 .  False   7
(0, 7)   0 0 . 0 0 .  True    7
(0, 9)   0 0 . 0 0 .  False   7
(4, 0)   0 0 . 0 0 .  False   7
(6, 0)   0 0 . 0 0 .  False   7
(14, 0)  1 1 1 1 1 .  True   14
(15, 0)  1 1 1 1 1 .  False  14
(16,