# Analysis code

Code for stats + figures for paper.

Run this notebook from the `scripts` folder. Before doing so, make sure all
experiment artifacts have been downloaded into the `exp` folder, which sits next
to `scripts` (the code reads it as `../exp`). Its layout should look like

``` text
exp/
  bestrq.csa2/
    version_0101/
  bestrq.csa4/
  ...
```

Figures are saved as PDF files in the `resources` folder (`../resources` relative to `scripts`) rather than displayed inline.

## Preamble (always run)

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload

# imports
import pandas as pd
import seaborn as sns
import seaborn.objects as so
import pingouin as pg

from matplotlib import pyplot as plt
from pathlib import Path

sns.set_theme('paper', font_scale=2)

plt.ioff()

from exp_utils import *

WIDTHS = (2, 4, 8, 16, 32, 64, 128)
STEPS = (1, 3, 6, 12, 24)

## ZeroSpeech ABX-LS analysis

In [2]:
# load zrc data
# collate_data is called with its defaults here (presumably provided by the
# wildcard import from exp_utils -- confirm). As the preview shows, the
# resulting frame carries `zrc.*` evaluation columns next to the model and
# training configuration columns. `dfz` is the input of every cell in this
# section.
dfz = collate_data()
dfz.head(10)

Unnamed: 0,zrc.subset,zrc.speaker_mode,zrc.context_mode,zrc.granularity,zrc.score,zrc.pooling,zrc.seed,zrc.pca_style,name,feat_type,...,training.best_rq_loss.mask_prob,training.best_rq_loss.mask_width,training.best_rq_loss.codebook_size,training.best_rq_loss.codebook_dim,training.best_rq_loss.offset,training.best_rq_loss.speaker_regex,training.best_rq_loss.prediction_type,training.shuffle,training.max_epochs,training.early_stopping_patience
0,dev-clean,within,within,triphone,0.1784,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
1,dev-clean,across,within,triphone,0.3216,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
2,dev-other,within,within,triphone,0.2102,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
3,dev-other,across,within,triphone,0.3732,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
4,test-clean,within,within,triphone,0.1809,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
5,test-clean,across,within,triphone,0.3077,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
6,test-other,within,within,triphone,0.2259,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
7,test-other,across,within,triphone,0.3804,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
8,dev-clean,within,within,phoneme,0.1268,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
9,dev-clean,across,within,phoneme,0.2534,none,3459,full,bestrq.csa128/version_0101,fbank-80,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10


In [3]:
# plot phoneme-granularity ABX scores across subsets and context widths,
# faceted by speaker mode and coloured by context mode

fig = plt.figure(figsize=[14, 4.8])
df = filter_data_equal(dfz, {
    "zrc.pca_style": "full",
    "conv.norm_type": "none",
    "context_type": "csa",
    "training.cpc_loss.prediction_steps": 12,
    "training.cpc_loss.gutted_steps": 0,
    "training.cpc_loss.negative_samples": 128,
    "training.loss_type": "cpc",
    "zrc.granularity": "phoneme",
    'training.cpc_loss.averaging_penalty': 0,
    "csa.dim_feedforward": 1024,
    "csa.num_layers": 1,
    "training.max_epochs": 200,
    "train_part": "100",
})
# copy: the columns of this frame are edited below, and the edits must not
# touch (or warn about touching) the shared dfz frame
df = filter_data_in(
    df, {"csa.max_width": WIDTHS}
).copy()
check_data(
    df,
    "csa.max_width", "zrc.subset", "zrc.score",
    "zrc.context_mode", "zrc.speaker_mode",
)

# 5 seeds * 7 context widths * 2 context modes * 2 speaker modes * 4 partitions
assert len(df) == 5 * 7 * 2 * 2 * 4

# relabel for the legend / facet titles
df['zrc.context_mode'] = df['zrc.context_mode'].map({'within': 'within', 'any': 'without'})

df["zrc.speaker_mode"] = df['zrc.speaker_mode'].map({
    'within': "within speaker",
    "across": "across speaker"
})

# add dev vs test distinction to dodge on graph
df['dvt'] = df['zrc.subset'].map({
        'test-clean': 'test-*',
        'test-other': 'test-*',
        'dev-clean':  'dev-*',
        'dev-other':  'dev-*'
})

# clean vs other
df['cvo'] = df['zrc.subset'].map({
        'test-clean': '*-clean',
        'test-other': '*-other',
        'dev-clean':  '*-clean',
        'dev-other':  '*-other'
})

# offset the dev/test distinction ever-so-slightly to reduce overplotting
df.loc[df['dvt'] == 'dev-*', 'csa.max_width'] *= 0.95
df.loc[df['dvt'] == 'test-*', 'csa.max_width'] *= 1.05

# ensure each point in a vertical slice gets its own error bar
df['sbyc'] = df.agg("{0[zrc.subset]}-{0[zrc.context_mode]}".format, axis=1)

plot = (
    so.Plot(df, x="csa.max_width", y="zrc.score", color="zrc.context_mode", marker="cvo", fill='dvt', group='sbyc')
    .facet("zrc.speaker_mode")
    .limit(y=(0.05, 0.26))
    .add(so.Dot(pointsize=7), so.Agg())
    .add(so.Range(), so.Est(errorbar="se"), legend=False)  # 1 standard error
    .scale(
        x=(
            so.Continuous(trans="log")
            .tick(at=list(WIDTHS))  # ticks at the nominal (un-offset) widths
            .label(like='d', base=2)
        ),
        y=(
            so.Continuous()
            .tick(at=[0.05, 0.1, 0.15, 0.2, 0.25])
            .label(like='.0%')
        ),
    )
    .label(x="width", y="ABX error rate")
    .on(fig)
)

plotter = plot.plot()

# Swap the first legend on the figure for one compact legend per entry of
# the plotter's legend contents, laid out side by side.
# NOTE(review): _legend_contents is a private seaborn attribute; items [1] and
# [2] of each entry are passed on as legend handles and labels -- re-check
# after seaborn upgrades.
legend_contents = plotter._legend_contents
fig.legends.pop(0)
for i, legend_content in enumerate(legend_contents):
    # horizontal anchor (figure coordinates) of the i-th legend
    if i == 0:
        x = 0.15
    elif i == 1:
        x = 0.52
    else:
        x = 0.9
    fig.legend(
        legend_content[1], legend_content[2],
        ncol=2, fontsize='x-small', loc='upper center', columnspacing=1,
        handletextpad=0.1, bbox_to_anchor=(x, 0.25))

fig.tight_layout()
fig.savefig('../resources/libriabx_context_subset_phoneme.pdf')
plt.close(fig)

In [4]:
# determine significance of context windows
# (tests whether the per-model mean ABX score differs between context widths)

df = filter_data_equal(dfz, {
    "zrc.pca_style": "full",
    "conv.norm_type": "none",
    "context_type": "csa",
    "training.cpc_loss.prediction_steps": 12,
    "training.cpc_loss.gutted_steps": 0,
    "training.cpc_loss.negative_samples": 128,
    "training.loss_type": "cpc",
    "zrc.granularity": "phoneme",
    'training.cpc_loss.averaging_penalty': 0,
    "training.max_epochs": 200,
    "csa.dim_feedforward": 1024,
    "csa.num_layers": 1,
    # "zrc.speaker_mode": "within",
    "train_part": '100',
    # "zrc.context_mode": "within",
})
df = filter_data_in(
    df, {"csa.max_width": WIDTHS}
)
check_data(
    df,
    "csa.max_width", "zrc.subset", "zrc.score", "zrc.speaker_mode", "zrc.context_mode"
)

# 5 seeds * 7 context widths * 2 context modes * 2 speaker modes * 4 partitions
# (speaker and context mode are deliberately left unfiltered above)
assert len(df) == 5 * 7 * 4 * 4

# collapse to one row per model: the mean score over all of its evaluations
df = df[['name', 'csa.max_width', 'zrc.score']].groupby('name', as_index=False, observed=True, sort=True).mean(numeric_only=True)
# treat the width as a categorical factor rather than a number
df['csa.max_width'] = df['csa.max_width'].astype('int').astype('category')

# omnibus test across widths, followed by the per-width normality and
# equal-variance checks that back it up
pg.print_table(pg.anova(data=df, dv='zrc.score', between='csa.max_width'))
pg.print_table(pg.normality(data=df, dv='zrc.score', group='csa.max_width'))
pg.print_table(pg.homoscedasticity(data=df, dv='zrc.score', group='csa.max_width'))

# non-parametric pairwise comparisons between widths (the printed table
# reports U statistics)
# NOTE(review): only uncorrected p-values (p-unc) are printed; consider whether
# a multiple-comparison correction is needed before quoting them
pg.print_table(pg.pairwise_tests(data=df, dv='zrc.score', between='csa.max_width', parametric=False, return_desc=True))



ANOVA SUMMARY

Source           ddof1    ddof2      F    p-unc    np2
-------------  -------  -------  -----  -------  -----
csa.max_width        6       28  6.026    0.000  0.564

    W    pval  normal
-----  ------  --------
0.874   0.282  True
0.932   0.609  True
0.832   0.145  True
0.862   0.236  True
0.954   0.765  True
0.874   0.284  True
0.969   0.869  True

    W    pval  equal_var
-----  ------  -----------
0.826   0.560  True


POST HOC TESTS

Contrast         A    B    mean(A)    std(A)    mean(B)    std(B)  Paired    Parametric      U-val  alternative      p-unc    hedges
-------------  ---  ---  ---------  --------  ---------  --------  --------  ------------  -------  -------------  -------  --------
csa.max_width    2    4      0.156     0.003      0.142     0.013  False     False          22.000  two-sided        0.056     1.338
csa.max_width    2    8      0.156     0.003      0.151     0.012  False     False          14.000  two-sided        0.841     0.495
csa.max_w

In [5]:
# libriABX score by context width (columns) and prediction steps (rows), drawn
# as two heatmaps sharing a y axis:
# - left, "averaged": runs with gutted_steps == 0
# - right, "last": single-step runs plus runs whose gutted_steps equals
#   prediction_steps - 1
# Only the left panel carries the y axis label; neither panel has a colour bar.
fig, axs = plt.subplots(1, 2, figsize=(14, 4.8), sharey=True)
heatmap_kws = dict(annot=True, fmt=".0%", vmin=0.1, vmax=0.5, cbar=False, square=True)

# --- left panel: "averaged"
df = filter_data_in(
    filter_data_equal(dfz, {
        "zrc.pca_style": "full",
        "conv.norm_type": "none",
        "context_type": "csa",
        "training.loss_type": "cpc",
        "training.cpc_loss.gutted_steps": 0,
        'training.cpc_loss.averaging_penalty': 0,
        "zrc.granularity": "phoneme",
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        # "zrc.speaker_mode": "within",
        # "zrc.context_mode": "within",
        "training.max_epochs": 200,
        "train_part": "100",
    }),
    {
        "csa.max_width": WIDTHS,
        "training.cpc_loss.prediction_steps": STEPS,
    },
)
check_data(
    df,
    "csa.max_width", "zrc.subset", "zrc.score",
    "zrc.context_mode", "zrc.speaker_mode",
    "training.cpc_loss.prediction_steps"
)
# mean score per (steps, width) cell, largest step count on top
df = df.pivot_table(
    values="zrc.score",
    index="training.cpc_loss.prediction_steps",
    columns="csa.max_width",
).sort_values(by='training.cpc_loss.prediction_steps', ascending=False)
plot = sns.heatmap(df, ax=axs[0], **heatmap_kws)
plot.set(xlabel='context width', ylabel='prediction steps', title='averaged')
plot.set_xticklabels([str(int(x)) for x in sorted(df.columns)])

# --- right panel: "last"
df = filter_data_in(
    filter_data_equal(dfz, {
        "zrc.pca_style": "full",
        "conv.norm_type": "none",
        "context_type": "csa",
        "training.loss_type": "cpc",
        'training.cpc_loss.averaging_penalty': 0,
        "training.cpc_loss.negative_samples": 128,
        "zrc.granularity": "phoneme",
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        "training.max_epochs": 200,
        "train_part": "100",
    }),
    {"csa.max_width": WIDTHS},
)
pred_steps = df['training.cpc_loss.prediction_steps']
gutted_steps = df['training.cpc_loss.gutted_steps']
# keep n-step runs with n - 1 gutted steps (n = 2..24), plus all 1-step runs
idx = pred_steps == 1
for n in range(2, 25):
    idx = idx | ((pred_steps == n) & (gutted_steps == (n - 1)))
df = df.loc[idx]
check_data(
    df,
    "csa.max_width", "zrc.subset", "zrc.score",
    "zrc.context_mode", "zrc.speaker_mode",
    "training.cpc_loss.prediction_steps",
    "training.cpc_loss.gutted_steps",
)
df = df.pivot_table(
    values="zrc.score",
    index="training.cpc_loss.prediction_steps",
    columns="csa.max_width",
).sort_values(by='training.cpc_loss.prediction_steps', ascending=False)
plot = sns.heatmap(df, ax=axs[1], **heatmap_kws)
plot.set(xlabel='context width', ylabel=None, title='last')
plot.set_xticklabels([str(int(x)) for x in sorted(df.columns)])
plot.set_yticklabels([str(int(x)) for x in sorted(df.index, reverse=True)])

fig.tight_layout()
fig.savefig("../resources/libriabx_prediction_steps_vs_width.pdf")
plt.close(fig)

In [6]:
# aggregate prediction steps over context widths and print as table
# rows: "averaged" (gutted_steps == 0) vs "last" (single-step runs plus runs
# with gutted_steps == prediction_steps - 1); columns: prediction steps;
# values: mean ABX score in percent

df = filter_data_equal(dfz, {
    "zrc.pca_style": "full",
    "conv.norm_type": "none",
    "context_type": "csa",
    "training.loss_type": "cpc",
    'training.cpc_loss.averaging_penalty': 0,
    "zrc.granularity": "phoneme",
    "csa.dim_feedforward": 1024,
    "csa.num_layers": 1,
    "training.max_epochs": 200,
    "train_part": "100",
})

df_1 = filter_data_in(
    df, {
        "csa.max_width": WIDTHS,
        "training.cpc_loss.prediction_steps": STEPS,
        "training.cpc_loss.gutted_steps": (0,),
    }
)
check_data(
    df_1,
    "csa.max_width", "zrc.subset", "zrc.score",
    "zrc.context_mode", "zrc.speaker_mode",
    "training.cpc_loss.prediction_steps"
)
# Relabel every row with the table row it contributes to. assign() builds a
# new frame, so nothing is written into a slice of dfz, and the new label does
# not have to be one of the existing categories of the 'name' column.
df_1 = df_1.assign(name='averaged')

df_2 = filter_data_in(
    df, {
        "csa.max_width": WIDTHS,
        "training.cpc_loss.prediction_steps": STEPS,
    }
)
# keep n-step runs with n - 1 gutted steps, plus all 1-step runs
idx = df_2['training.cpc_loss.prediction_steps'] == 1
for n in range(2, max(STEPS) + 1):
    idx |= (df_2['training.cpc_loss.prediction_steps'] == n) & (df_2['training.cpc_loss.gutted_steps'] == (n - 1))
df_2 = df_2.loc[idx]
check_data(
    df_2,
    "csa.max_width", "zrc.subset", "zrc.score",
    "zrc.context_mode", "zrc.speaker_mode",
    "training.cpc_loss.prediction_steps",
    "training.cpc_loss.gutted_steps",
)
df_2 = df_2.assign(name='last')

df = pd.concat([df_1, df_2])
# integer step counts make for cleaner column headers
df['training.cpc_loss.prediction_steps'] = df['training.cpc_loss.prediction_steps'].astype('int')
df = pd.pivot_table(df, values='zrc.score', columns='training.cpc_loss.prediction_steps', index='name')

(df * 100).style.format(precision=1)

training.cpc_loss.prediction_steps,1,3,6,12,24
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
averaged,43.8,26.1,16.6,15.7,20.6
last,43.8,17.5,14.4,16.0,31.6


In [7]:
# mean phoneme ABX score (in percent) per table row and context width
# (the "main" row can also be derived from the figure above)

# table row -> names of the experiment versions averaged into that row
row2names = {
    "main":  {f'cpc.csa{w}/version_{v:04d}' for w in WIDTHS for v in range(101, 106)},
    "long train": {f'cpc.csa{w}/version_0012' for w in WIDTHS},
    "960h": {f'cpc.csa{w}/version_0003' for w in WIDTHS},
    "2-layer": {f'cpc.csa{w}/version_1001' for w in WIDTHS},
    "4-layer": {f'cpc.csa{w}/version_1101' for w in WIDTHS},
    "conv (fixed size)": {f"cpc.cconv{w}/version_0101" for w in WIDTHS},
    "conv (fixed H_2)": {f"cpc.cconv{w}/version_0201" for w in WIDTHS},
    "BEST-RQ": {f"bestrq.csa{w}/version_0101" for w in WIDTHS},
    "last@6": {f"cpc.csa{w}/version_0701" for w in WIDTHS},
    "avg@6": {f"cpc.csa{w}/version_0201" for w in WIDTHS},
}
# inverse lookup: experiment name -> table row
name2row = {
    name: row
    for row, names in row2names.items()
    for name in names
}

df = filter_data_equal(dfz, {"zrc.granularity": "phoneme"})
df = filter_data_in(df, {'name': set(name2row)}).copy()

# rows without a csa width (the conv models) take their kernel size as width
width = df['csa.max_width']
df['csa.max_width'] = width.mask(width.isna(), df['cconv.kernel_size']).astype('int')

# swap each experiment name for its table row label
df['name'] = df['name'].cat.remove_unused_categories().apply(name2row.__getitem__)

df = df.pivot_table(values='zrc.score', index='name', columns='csa.max_width')

(df * 100).style.format(precision=1)

csa.max_width,2,4,8,16,32,64,128
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2-layer,13.7,15.5,15.8,12.6,13.6,16.8,13.0
4-layer,13.8,13.0,14.1,15.8,15.8,16.8,17.1
960h,,,17.2,,,,18.5
BEST-RQ,27.6,24.1,24.5,26.6,26.1,28.0,25.2
avg@6,18.9,17.2,14.3,16.9,14.3,17.5,17.3
conv (fixed H_2),15.0,14.6,16.9,14.2,50.0,22.3,43.7
conv (fixed size),17.0,14.6,16.8,17.0,19.0,19.4,20.7
last@6,14.0,13.9,13.1,15.4,13.3,14.9,16.0
long train,13.4,13.0,14.0,14.4,14.3,14.7,15.0
main,15.6,14.2,15.1,15.3,14.8,16.9,17.7


In [8]:
# mean ABX error rates of the models we trained downstream ASR systems for,
# i.e. for every context width, the repeat trial with the minimum mean ABX

df = filter_data_in(
    filter_data_equal(dfz, {
        "zrc.pca_style": "full",
        "conv.norm_type": "none",
        "context_type": "csa",
        "training.cpc_loss.prediction_steps": 12,
        "training.cpc_loss.gutted_steps": 0,
        "training.cpc_loss.negative_samples": 128,
        "training.loss_type": "cpc",
        "zrc.granularity": "phoneme",
        'training.cpc_loss.averaging_penalty': 0,
        "training.max_epochs": 200,
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        "train_part": "100",
    }),
    {"csa.max_width": WIDTHS},
)
check_data(
    df,
    "csa.max_width", "zrc.subset", "zrc.score", "zrc.context_mode", "zrc.speaker_mode",
)

# one row per model: mean over its evaluations; all-NaN groups are dropped
df = (
    df[['name', 'csa.max_width', 'zrc.score']]
    .groupby(['name'])
    .mean()
    .dropna()
)
# 5 seeds * 7 context widths
assert len(df) == 5 * 7

# for each width, the name of the model with the lowest mean score
min_idxs = [
    df.loc[df['csa.max_width'] == w, 'zrc.score'].idxmin()
    for w in WIDTHS
]
# integer widths and scores in percent, for display
min_df_100 = df.loc[min_idxs].assign(**{
    'csa.max_width': lambda best: best['csa.max_width'].astype('int'),
    'zrc.score': lambda best: best['zrc.score'] * 100,
})
min_df_100.style.format(precision=1)

Unnamed: 0_level_0,csa.max_width,zrc.score
name,Unnamed: 1_level_1,Unnamed: 2_level_1
cpc.csa2/version_0105,2,15.3
cpc.csa4/version_0102,4,12.6
cpc.csa8/version_0101,8,13.3
cpc.csa16/version_0103,16,13.8
cpc.csa32/version_0104,32,13.6
cpc.csa64/version_0101,64,16.2
cpc.csa128/version_0101,128,15.9


In [9]:
# our implementations of CPC-Small
# experiment name -> row label used in the printed tables
name2row = {
    'cpc.small/version_1': 'ours',
    'cpc.small/version_2': 'ours (w/o norm)'
}

df = collate_data(model_yaml_glob='cpc.small/version_[12]/model.yaml')
# copy before editing columns, so the assignments below act on an independent
# frame instead of a slice of the collated data
df = filter_data_equal(df, {'zrc.pca_style': 'full'}).copy()
df['name'] = df['name'].cat.remove_unused_categories().apply(lambda x: name2row[x])
# scores as percentages
df['zrc.score'] *= 100

# mean phoneme rates
df_ = filter_data_equal(df, {'zrc.granularity': 'phoneme'})
print(df_[['name', 'zrc.score']].groupby(['name']).mean())

# rates comparable to zs 2021
# (triphone granularity on the dev subsets, split by subset and speaker mode)
df_ = filter_data_equal(df, {'zrc.granularity': 'triphone'})
df_ = filter_data_in(df_, {'zrc.subset': ('dev-clean', 'dev-other')})
df_ = df_[['name', 'zrc.subset', 'zrc.speaker_mode', 'zrc.score']].groupby(['name', 'zrc.subset', 'zrc.speaker_mode']).mean().dropna()
df_.style.format(precision=1)

                 zrc.score
name                      
ours              9.553125
ours (w/o norm)  11.582500


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,zrc.score
name,zrc.subset,zrc.speaker_mode,Unnamed: 3_level_1
ours,dev-clean,across,8.9
ours,dev-clean,within,6.2
ours,dev-other,across,14.9
ours,dev-other,within,8.5
ours (w/o norm),dev-clean,across,12.0
ours (w/o norm),dev-clean,within,8.7
ours (w/o norm),dev-other,across,17.0
ours (w/o norm),dev-other,within,10.5


## Tensorboard analysis

In [10]:
# load data
# collate_data is called with "tb" here; as the preview shows, the resulting
# frame has one row per logged (step, epoch) with the validation loss in
# `tb.val_loss`, next to the model and training configuration columns.
# `dft` is the input of every cell in this section.
dft = collate_data("tb")
dft.head(10)

Unnamed: 0,tb.step,tb.epoch,tb.val_loss,name,feat_type,train_part,input_size,latent_type,context_type,version,...,training.best_rq_loss.mask_prob,training.best_rq_loss.mask_width,training.best_rq_loss.codebook_size,training.best_rq_loss.codebook_dim,training.best_rq_loss.offset,training.best_rq_loss.speaker_regex,training.best_rq_loss.prediction_type,training.shuffle,training.max_epochs,training.early_stopping_patience
0,1902,0,0.514025,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
1,3805,1,0.347518,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
2,5708,2,0.275961,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
3,7611,3,0.222729,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
4,9514,4,0.198708,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
5,11417,5,0.2029,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
6,13320,6,0.180398,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
7,15223,7,0.164576,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
8,17126,8,0.161085,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10
9,19029,9,0.145282,bestrq.csa128/version_0101,fbank-80,100,80,ff,csa,101,...,0.01,12.0,8192.0,16.0,0.0,^lbi-([^-]+)-.*$,csa,False,200,10


In [11]:
# regular training: validation loss per epoch for the 200-epoch runs on the
# "100" training partition, one line per context width with a standard-error
# band over the repeat trials

df = filter_data_in(
    filter_data_equal(dft, {
        "conv.norm_type": "none",
        "context_type": "csa",
        "training.loss_type": "cpc",
        "training.cpc_loss.gutted_steps": 0,
        'training.cpc_loss.averaging_penalty': 0,
        "training.cpc_loss.prediction_steps": 12,
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        "training.max_epochs": 200,
        "train_part": "100",
    }),
    {"csa.max_width": WIDTHS},
).copy()

check_data(
    df, "tb.epoch", "tb.val_loss", "tb.step", "csa.max_width"
)
# 5 seeds * 7 context windows
assert len(df['name'].unique()) == 5 * 7

# lowest validation loss reached for each width
print(df.groupby(['csa.max_width'])[['tb.val_loss']].min())

# log-scaled colour scale with one tick per context width
width_scale = (
    so.Continuous(trans="log")
    .tick(at=WIDTHS)
    .label(like=".0f")
)

fig = plt.figure(figsize=[6.8, 4.8])
plot = so.Plot(df, x="tb.epoch", y="tb.val_loss", color="csa.max_width")
plot = plot.limit(y=(1.9, 3.0), x=(0, 200))
plot = plot.add(so.Line(), so.Agg())
plot = plot.add(so.Band(), so.Est(errorbar="se"), group="csa.max_width")
plot = plot.label(x="epoch", y="CPC validation loss", color="context width")
plot = plot.scale(color=width_scale)
plot = plot.on(fig)

plot.plot()

# shift the legend: its anchor box is converted to axes coordinates, nudged,
# and re-applied in the same coordinate system
loss_ax = fig.axes[0]
leg = fig.legends[0]
bb = leg.get_bbox_to_anchor().transformed(loss_ax.transAxes.inverted())
bb.x0 -= .335
bb.y0 += .41
leg.set_bbox_to_anchor(bb, transform=loss_ax.transAxes)

fig.tight_layout()
fig.savefig('../resources/train_loss_vs_width.pdf')
plt.close(fig)

               tb.val_loss
csa.max_width             
2.0               1.948407
4.0               1.882025
8.0               1.931669
16.0              1.954543
32.0              1.921521
64.0              2.017123
128.0             1.978303


In [12]:
# long training: validation loss per epoch for the 500-epoch runs, one line
# per context width
df = filter_data_in(
    filter_data_equal(dft, {
        "conv.norm_type": "none",
        "context_type": "csa",
        "training.loss_type": "cpc",
        "training.cpc_loss.gutted_steps": 0,
        'training.cpc_loss.averaging_penalty': 0,
        "training.cpc_loss.prediction_steps": 12,
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        "training.max_epochs": 500,
    }),
    {"csa.max_width": WIDTHS},
).copy()

check_data(
    df, "tb.epoch", "tb.val_loss", "tb.step", "csa.max_width"
)
# 7 context windows
assert len(df['name'].unique()) == 7

# lowest validation loss reached for each width
print(df.groupby(['csa.max_width'])[['tb.val_loss']].min())

# log-scaled colour scale with one tick per context width
width_scale = (
    so.Continuous(trans="log")
    .tick(at=WIDTHS)
    .label(like=".0f")
)

fig = plt.figure(figsize=[6.8, 4.8])
plot = so.Plot(df, x="tb.epoch", y="tb.val_loss", color="csa.max_width")
plot = plot.limit(y=(1.7, 3.0), x=(0, 500))
plot = plot.add(so.Line(), so.Agg())
plot = plot.label(x="epoch", y="CPC validation loss", color="context width")
plot = plot.scale(color=width_scale)
plot = plot.on(fig)

plot.plot()

# shift the legend: its anchor box is converted to axes coordinates, nudged,
# and re-applied in the same coordinate system
loss_ax = fig.axes[0]
leg = fig.legends[0]
bb = leg.get_bbox_to_anchor().transformed(loss_ax.transAxes.inverted())
bb.x0 -= .335
bb.y0 += .41
leg.set_bbox_to_anchor(bb, transform=loss_ax.transAxes)

fig.tight_layout()
fig.savefig('../resources/train_loss_vs_width_long.pdf')
plt.close(fig)

               tb.val_loss
csa.max_width             
2.0               1.866130
4.0               1.859689
8.0               1.884088
16.0              1.840316
32.0              1.792857
64.0              1.825067
128.0             1.803738


In [13]:
# BEST-RQ training: validation loss per epoch for the 200-epoch runs trained
# with the "best-rq" loss, one line per context width

df = filter_data_in(
    filter_data_equal(dft, {
        "context_type": "csa",
        "training.loss_type": "best-rq",
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        "training.max_epochs": 200,
    }),
    {"csa.max_width": WIDTHS},
).copy()

check_data(
    df, "tb.epoch", "tb.val_loss", "tb.step", "csa.max_width"
)
# 7 context windows
assert len(df['name'].unique()) == 7

# lowest validation loss reached for each width
print(df.groupby(['csa.max_width'])[['tb.val_loss']].min())

# log-scaled colour scale with one tick per context width
width_scale = (
    so.Continuous(trans="log")
    .tick(at=WIDTHS)
    .label(like=".0f")
)

fig = plt.figure(figsize=[6.8, 4.8])
plot = so.Plot(df, x="tb.epoch", y="tb.val_loss", color="csa.max_width")
plot = plot.limit(y=(0.03, 0.2), x=(0, 200))
plot = plot.add(so.Line(), so.Agg())
plot = plot.label(x="epoch", y="BEST-RQ validation loss", color="context width")
plot = plot.scale(color=width_scale)
plot = plot.on(fig)

plot.plot()

# shift the legend: its anchor box is converted to axes coordinates, nudged,
# and re-applied in the same coordinate system
loss_ax = fig.axes[0]
leg = fig.legends[0]
bb = leg.get_bbox_to_anchor().transformed(loss_ax.transAxes.inverted())
bb.x0 -= .335
bb.y0 += .41
leg.set_bbox_to_anchor(bb, transform=loss_ax.transAxes)

fig.tight_layout()
fig.savefig('../resources/train_loss_vs_width_bestrq.pdf')
plt.close(fig)

               tb.val_loss
csa.max_width             
2.0               0.044562
4.0               0.072279
8.0               0.074398
16.0              0.050661
32.0              0.054660
64.0              0.058745
128.0             0.062200


In [14]:
# 960-hr training: validation loss per epoch for the runs on the "960"
# training partition, one line per context width
df = filter_data_in(
    filter_data_equal(dft, {
        "conv.norm_type": "none",
        "context_type": "csa",
        "training.loss_type": "cpc",
        "training.cpc_loss.gutted_steps": 0,
        'training.cpc_loss.averaging_penalty': 0,
        "training.cpc_loss.prediction_steps": 12,
        "csa.dim_feedforward": 1024,
        "csa.num_layers": 1,
        "train_part": "960",
    }),
    {"csa.max_width": WIDTHS},
).copy()

check_data(
    df, "tb.epoch", "tb.val_loss", "tb.step", "csa.max_width"
)
# 2 context windows
assert len(df['name'].unique()) == 2

# integer widths, used as discrete colour levels below
df['csa.max_width'] = df['csa.max_width'].astype('int')

# lowest validation loss reached for each width
print(df.groupby(['csa.max_width'])[['tb.val_loss']].min())

fig = plt.figure(figsize=[6.8, 4.8])
plot = so.Plot(df, x="tb.epoch", y="tb.val_loss", color="csa.max_width")
plot = plot.limit(y=(1.7, 3.0), x=(0, 100))
plot = plot.add(so.Line(), so.Agg())
plot = plot.label(x="epoch", y="CPC validation loss", color="context width")
plot = plot.scale(color=so.Nominal())
plot = plot.on(fig)

plot.plot()

# shift the legend: its anchor box is converted to axes coordinates, nudged,
# and re-applied in the same coordinate system
loss_ax = fig.axes[0]
leg = fig.legends[0]
bb = leg.get_bbox_to_anchor().transformed(loss_ax.transAxes.inverted())
bb.x0 -= .7
bb.y0 += .4
leg.set_bbox_to_anchor(bb, transform=loss_ax.transAxes)

fig.tight_layout()
fig.savefig('../resources/train_loss_vs_width_960.pdf')
plt.close(fig)

               tb.val_loss
csa.max_width             
8                 1.797562
128               1.746895


## ASR Task

In [15]:
# Collect the word error rates stored in the decoding score files, keyed as
# partition -> model -> decoding name, then print them grouped and sorted.
part2model2dname2wer = dict()
wer_pattern = 'cpc.*/version_*/baseline/full_v2000/decoding/**/*.wer.txt'
for score_file in Path('../exp').glob(wer_pattern):
    fields = score_file.read_text().rstrip().split()
    # second field: a path wrapped in one extra character on either side
    pth = Path(fields[1][1:-1])
    part = pth.parent.name
    # "<model dir>/<version dir>", counted from the root end of the path
    model = '/'.join((pth.parents[-3].name, pth.parents[-4].name))
    # file name minus its 9-character suffix
    dname = pth.name[:-9]
    model2dname2wer = part2model2dname2wer.setdefault(part, dict())
    dname2wer = model2dname2wer.setdefault(model, dict())
    # last field is the error rate, kept as the string found in the file
    dname2wer[dname] = fields[-1]

for part in sorted(part2model2dname2wer):
    print(part)
    model2dname2wer = part2model2dname2wer[part]
    for model in sorted(model2dname2wer):
        dname2wer = model2dname2wer[model]
        entries = ''.join(
            f" {dname}={dname2wer[dname]}," for dname in sorted(dname2wer)
        )
        print(f"\t{model}:{entries}")

dev_clean
	cpc.csa128/version_0101: lm_ord4_width32=19.9%, nolm_width32=31.9%,
	cpc.csa16/version_0103: lm_ord4_width32=19.4%, nolm_width32=30.9%,
	cpc.csa2/version_0105: lm_ord4_width32=19.1%, nolm_width32=30.3%,
	cpc.csa32/version_0104: lm_ord4_width32=18.1%, nolm_width32=30.8%,
	cpc.csa4/version_0102: lm_ord4_width32=16.5%, nolm_width32=26.5%,
	cpc.csa64/version_0101: lm_ord4_width32=22.7%, nolm_width32=34.3%,
	cpc.csa8/version_0101: lm_ord4_width32=17.0%, nolm_width32=29.8%,
	cpc.small/version_1: lm_ord2_width32=17.0%, lm_ord3_width32=15.1%, lm_ord4_width32=14.7%, nolm_width32=26.6%,
dev_other
	cpc.csa128/version_0101: lm_ord4_width32=39.6%, nolm_width32=52.0%,
	cpc.csa16/version_0103: lm_ord4_width32=39.9%, nolm_width32=51.4%,
	cpc.csa2/version_0105: lm_ord4_width32=39.4%, nolm_width32=51.5%,
	cpc.csa32/version_0104: lm_ord4_width32=38.4%, nolm_width32=52.1%,
	cpc.csa4/version_0102: lm_ord4_width32=36.6%, nolm_width32=48.0%,
	cpc.csa64/version_0101: lm_ord4_width32=42.9%, nolm_wid