## Preamble

In [1]:
# load packages + declare constants

%load_ext autoreload
%autoreload

import os

import pandas as pd
import numpy as np
import pingouin as pg
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import pingouin as pg

from analysis_utils import *

pio.renderers.default = "vscode"
pio.templates.default = "plotly_white"

# gunzip -c data/local/lm/3-gram.arpa.gz | head -n 3
LM_VOCAB_SIZE = 200_003

FIGS = '../figs'
os.makedirs(FIGS, exist_ok=True)
FIG_TYPE = 'pdf'

COL_SIZE_MM = 80
MID_MARGIN_SIZE_MM = 10

MM_TO_IN = 0.03937008
IN_TO_PX = 96

COL_SIZE_PX = int(COL_SIZE_MM * MM_TO_IN * IN_TO_PX)
MID_MARGIN_SIZE_PX = int(MID_MARGIN_SIZE_MM * MM_TO_IN * IN_TO_PX)

DOUBLE_COL_SIZE_PX = COL_SIZE_PX * 2 + MID_MARGIN_SIZE_PX

FONT_SIZE = 8
FONT_FAMILY = "Times New Roman"
FONT = dict(size=FONT_SIZE, family=FONT_FAMILY)

E_I = "<i>e<sub>i</sub></i>"
E_C = "<i>e<sub>c</sub></i>"
P_I = "<i>p<sub>i</sub></i>"
P_C = "<i>p<sub>c</sub></i>"
H_I = "<i>H<sub>i</sub></i>"
H_C = "<i>H<sub>c</sub></i>"
H_Y = "<i>H<sub>y</sub></i>"

MDL_LATLM_RESLM2RENAME = {
    'wav2vec2-large-960h-lv60_null_null': 'W2V2-L',
    'wav2vec2-base-960h_null_null': 'W2V2-B',
    'tdnn_1d_sp_tgsmall_tgsmall': 'TDNN-3',
    'tdnn_1d_sp_tgsmall_fglarge': 'TDNN-4',
    # 'tdnn_1d_sp_tgsmall_rnnlm_lstm_1a': 'TDNN-R',
    'tri6b_tgsmall_tgsmall': 'GMM-3',
}
MDL_RENAMES = tuple(MDL_LATLM_RESLM2RENAME.values())

PERPLM2RENAME = {
    'tgsmall': '3-gram',
    'fglarge': '4-gram',
    'rnnlm_lstm_1a': 'RNN',
}
PERPLM_RENAMES = tuple(PERPLM2RENAME.values())

PART2RENAME = {
    'PRV': 'CL-P',
    'ROC': 'CL-R',
    'dev-other': 'LS-O',
    'dev-clean': 'LS-C',
}
PART_RENAMES = tuple(PART2RENAME.values())

PERPLM = BINLM = 'rnnlm_lstm_1a'
BIN_QUANT_LOWER = POWER = 0.05
BIN_QUANT_UPPER = 0.95
BIN_PART = 'dev-clean'
BIN_NAMES = ('HP', 'LP', 'ZP')  # order in ascending ent

def format_fig_path(prefix : str, **kwargs) -> str:
    pth = f"{FIGS}/{prefix}"
    for key, vals in sorted(kwargs.items()):
        if isinstance(vals, (str, int, float, bool)):
            vals = (str(vals).lower(),)
        assert isinstance(vals, (set, list, tuple)) and len(vals) and all(isinstance(x, str) for x in vals)
        pth += f'-{key}'
        for val in sorted(vals):
            pth += f"_{val.replace('-', '_')}"
    pth += f'.{FIG_TYPE}'
    return pth


In [2]:
# load tables

print("text_df contents")
text_df = read_text_as_df()
display(text_df.head())

print("perp_df contents")
perp_df = read_perps_as_df()
perp_df = perp_df.merge(text_df[['utt', 'len']], on='utt')
bin_bounds = bin_series(perp_df.loc[(perp_df['perplm'] == BINLM) & (perp_df['part'] == BIN_PART), 'ent'], len(BIN_NAMES), lower_quant=BIN_QUANT_LOWER, upper_quant=BIN_QUANT_UPPER, by_rank=False)[1]
ent_bin = bin_series(perp_df['ent'], bin_bounds, by_rank=False, fmt="{:.01f}")[0]
bin_cats = dict(zip(ent_bin.dtype.categories, BIN_NAMES))
print(bin_cats)
perp_df = perp_df.assign(ent_bin=ent_bin.map(bin_cats))
display(perp_df.head())

print("wer_df contents")
wer_df = read_best_wers_as_df()
display(wer_df.head())

print("uttwer_df contents")
uttwer_df = read_best_uttwers_as_df()
uttwer_df = uttwer_df.merge(text_df[['utt', 'len']], on='utt')
display(uttwer_df.head())

text_df contents


Unnamed: 0,utt,text,part,len
0,lbi-100-121669-0000,TOM THE PIPER'S SON,train-clean-460,4
1,lbi-100-121669-0001,THE PIG WAS EAT AND TOM WAS BEAT AND TOM RAN C...,train-clean-460,15
2,lbi-100-121669-0002,HE NEVER DID ANY WORK EXCEPT TO PLAY THE PIPES...,train-clean-460,36
3,lbi-100-121669-0003,BUT HE WAS SO SLY AND CAUTIOUS THAT NO ONE HAD...,train-clean-460,42
4,lbi-100-121669-0004,AND THEY LIVED ALL ALONE IN A LITTLE HUT AWAY ...,train-clean-460,51


perp_df contents
{'(3.4,4.5]': 'HP', '(4.5,5.6]': 'LP', '(5.6,6.8]': 'ZP'}


Unnamed: 0,utt,perp,perplm,part,ent,len,ent_bin
0,PRV_se0_ag1_f_01_PRV_se0_ag1_f_01_1_002583_002756,188.283,tgmed,PRV,5.237946,11,LP
1,PRV_se0_ag1_f_01_PRV_se0_ag1_f_01_1_002781_002912,2046.096,tgmed,PRV,7.623689,4,
2,PRV_se0_ag1_f_01_PRV_se0_ag1_f_01_1_006175_006438,839.002,tgmed,PRV,6.732213,14,ZP
3,PRV_se0_ag1_f_01_PRV_se0_ag1_f_01_1_006553_006711,399.898,tgmed,PRV,5.99121,8,ZP
4,PRV_se0_ag1_f_01_PRV_se0_ag1_f_01_1_008786_008920,2707.916,tgmed,PRV,7.903935,4,


wer_df contents


Unnamed: 0,wer,ins,del,sub,lmwt,wip,mdl,latlm,reslm,part,snr,acc
0,0.0524,265,298,2287,11,0.0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,30.0,0.9476
1,0.6015,1029,10167,21528,10,0.0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,2.0,0.3985
2,0.2877,740,3749,11163,12,0.0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,8.0,0.7123
3,0.193,589,1905,8006,12,0.0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,11.0,0.807
4,0.8439,441,23839,21628,8,0.0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,-3.0,0.1561


uttwer_df contents


Unnamed: 0,utt,wer,mdl,latlm,reslm,part,snr,acc,len
0,lbi-1272-128104-0000,0.0588,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,30.0,0.9412,17
1,lbi-1272-128104-0001,0.1,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,30.0,0.9,10
2,lbi-1272-128104-0002,0.0312,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,30.0,0.9688,32
3,lbi-1272-128104-0003,0.0417,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,30.0,0.9583,24
4,lbi-1272-128104-0004,0.1618,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,30.0,0.8382,68


## Perplexity

In [3]:
print("proportion bins")
df = perp_df.loc[perp_df['perplm'] == PERPLM]
df = df.groupby('part')['ent_bin'].value_counts(normalize=True, dropna=False).reset_index()
df = df.assign(part=df['part'].map(PART2RENAME)).dropna().pivot(values='proportion', columns='ent_bin', index='part')
df = df.reindex(columns=BIN_NAMES, index=PART_RENAMES)
df['total'] = df.sum(1)
df = (df * 100).round(1)
display(df)
# print(df.to_latex(float_format="{:.01f}".format))

proportion bins


ent_bin,HP,LP,ZP,total
part,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CL-P,23.8,41.9,23.4,89.1
CL-R,18.3,38.7,26.9,84.0
LS-O,39.9,39.5,11.2,90.6
LS-C,37.2,40.1,12.6,89.9


In [4]:
print("entropy/perplexity by partition and LM")
df = agg_mean_by_lens(perp_df, 'len', 'ent', ['part', 'perplm'])
df = df.assign(perplm=df['perplm'].map(PERPLM2RENAME), part=df['part'].map(PART2RENAME)).dropna()
df = df.pivot(values='ent', index='part', columns='perplm')
df = df.reindex(columns=PERPLM_RENAMES, index=PART_RENAMES)
df.loc['mean', :] = df.mean(0)
df['mean'] = df.mean(1)
display(df.round(1))
# print(df.to_latex(float_format="{:.01f}".format))


entropy/perplexity by partition and LM


perplm,3-gram,4-gram,RNN,mean
part,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CL-P,5.5,5.1,5.1,5.2
CL-R,5.7,5.4,5.4,5.5
LS-O,5.6,4.9,4.6,5.1
LS-C,5.7,5.0,4.7,5.1
mean,5.6,5.1,4.9,5.2


In [5]:
print('distribution of per-utt entropy by partition and LM')
df = perp_df.assign(perplm=perp_df['perplm'].map(PERPLM2RENAME), part=perp_df['part'].map(PART2RENAME))
del df['ent_bin']  # don't exclude un-binned terms
df = df.dropna()
fig = px.box(
    df, y='ent', color='perplm', x='part',
    # box=True,
    labels=dict(ent=H_Y, part='Partition', lm="LM", perplm="LM"),
    category_orders=dict(part=PART_RENAMES, perplm=PERPLM_RENAMES),
)
fig.update_traces(marker=dict(size=4), line=dict(width=1))
fig.update_layout(
    legend=dict(orientation="h", yanchor="bottom", y=1.0),
    yaxis=dict(tickangle=270, title_standoff=5),
    margin=dict(l=0, r=10, t=10, b=30),
    font=FONT,
    width=COL_SIZE_PX, height=int(COL_SIZE_PX),
)
# fig.show()
fig.write_image(format_fig_path("violin-ent"))

distribution of per-utt entropy by partition and LM






In [6]:
df = perp_df.assign(perplm=perp_df['perplm'].map(PERPLM2RENAME), part=perp_df['part'].map(PART2RENAME))
del df['ent_bin']
df = df.dropna()
display(pg.normality(df, dv='ent', group='perplm', method='normaltest').round(3))

print("pairwise spearman correlations of entropy across LMs")
df = df.pivot(values='ent', index='utt', columns='perplm')
display(pg.pairwise_corr(df, columns=df.columns, alternative='greater', method='spearman').round(3))


Unnamed: 0_level_0,W,pval,normal
perplm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RNN,2389.619,0.0,False
3-gram,694.098,0.0,False
4-gram,1011.655,0.0,False


pairwise spearman correlations of entropy across LMs


Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,power
0,3-gram,4-gram,spearman,greater,17351,0.889,"[0.89, 1.0]",0.0,1.0
1,3-gram,RNN,spearman,greater,17351,0.783,"[0.78, 1.0]",0.0,1.0
2,4-gram,RNN,spearman,greater,17351,0.859,"[0.86, 1.0]",0.0,1.0


## WER

In [7]:
df = perp_df.loc[perp_df['perplm'] == PERPLM].merge(uttwer_df.loc[np.isinf(uttwer_df['snr'])], on=['utt', 'part', 'len'])

df['mdl'] = (df['mdl'] + '_' + df['latlm'] + '_' + df['reslm']).map(MDL_LATLM_RESLM2RENAME)
df['part'] = df['part'].map(PART2RENAME)

df = agg_mean_by_lens(df, 'len', 'wer',  ['part', 'mdl', 'ent_bin'])
df_all = wer_df.loc[np.isinf(wer_df['snr'])].copy()
df_all['ent_bin'] = 'all'
df_all['mdl'] = (df_all['mdl'] + '_' + df_all['latlm'] + '_' + df_all['reslm']).map(MDL_LATLM_RESLM2RENAME)
df_all['part'] = df_all['part'].map(PART2RENAME)
df = pd.concat([df, df_all])[['part', 'mdl', 'ent_bin', 'wer']]
df = df.dropna()

df = (df.pivot(values='wer', index=['part', 'ent_bin'], columns='mdl') * 100).round(1)
display(df)
# print(df.to_latex(float_format="{:.01f}".format))



Unnamed: 0_level_0,mdl,GMM-3,TDNN-3,TDNN-4,W2V2-B,W2V2-L
part,ent_bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CL-P,HP,73.5,56.2,52.4,50.6,37.8
CL-P,LP,79.2,64.1,62.1,59.2,47.0
CL-P,ZP,83.3,69.0,68.6,66.0,54.3
CL-P,all,78.7,61.9,59.4,58.2,46.2
CL-R,HP,45.8,31.2,26.4,25.2,14.6
CL-R,LP,54.0,38.5,35.2,31.5,20.8
CL-R,ZP,58.1,43.6,41.6,38.2,26.0
CL-R,all,53.9,37.3,33.9,32.8,23.2
LS-C,HP,8.4,3.7,2.4,2.2,1.5
LS-C,LP,11.1,4.9,3.5,3.3,2.2


## Zhang et al

In [8]:
# Zhang et al (2023) "Estimate the noise effect on automatic speech recognition
# accuracy for mandarin by an approach associating articulation index"
# FIXME(sdrobert): the fit is very bad if we use eq. 12

df = wer_df
df = df.assign(mdl=(df['mdl'] + '_' + df['latlm'] + '_' + df['reslm']).map(MDL_LATLM_RESLM2RENAME), part=df['part'].map(PART2RENAME))
df = df.loc[np.isfinite(df['snr'])]
df['acc'] = 1 - df['wer']
df['Acc'] = 100 * df['acc']

snr_min = int(df['snr'].min()) - 1
snr_max = int(df['snr'].max()) + 1
yrange = [0, 100]
ytickvals = list(range(yrange[0], yrange[1] + 1, 25))
xrange = [snr_min, snr_max]
xtickvals = list(range(xrange[0], xrange[1] + 1, 5))
x_interp = np.linspace(snr_min, snr_max, 100)
colours = px.colors.qualitative.Vivid

for part in PART_RENAMES:
    df_ = df.loc[df['part'] == part]

    fig = px.scatter(
        df_, x='snr', y='Acc', color='mdl',
        color_discrete_sequence=colours,
        category_orders=dict(mdl=MDL_RENAMES),
    )
    fig.update_traces(marker=dict(line_width=0.5, size=4))

    for idx, mdl in enumerate(MDL_RENAMES):
        df_mdl = df_.loc[df_['mdl'] == mdl]
        A, B, C = zhang_fit(df_mdl['snr'], df_mdl['acc'], fit_recip=False)
        y_interp = zhang_func(x_interp, A, B, C) * 100
        fig.add_scatter(
            x=x_interp, y=y_interp,
            mode="lines",
            line=dict(color=colours[idx], width=1),
            showlegend=False,
        )

    if part.startswith("LS-"):
        legend = dict(title='Model', yanchor="bottom", y=0.01, xanchor="right", x=0.99)
    else:
        legend = dict(title='Model', yanchor="top", y=0.99, xanchor="left", x=0.01)

    fig.update_layout(
        xaxis=dict(title='SNR (dB)', range=xrange, tickvals=xtickvals, tickformat='d'),
        yaxis=dict(title='Accuracy (%)', range=yrange, tickvals=ytickvals, tickformat='d'),
        legend=legend,
        margin=dict(l=0, r=10, t=10, b=0),
        font=FONT,
    )
    fig.update_xaxes(title_standoff=5)
    fig.update_yaxes(title_standoff=5)
    fig.write_image(format_fig_path("zhang", part=part), width=int(COL_SIZE_PX * 0.75), height=int(COL_SIZE_PX * 0.75), scale=1)












## Boothroyd and Nittrouer

In [9]:
# A. Boothroyd and S. Nittrouer (1988) "Mathematical treatment of context effects in
# phoneme and word recognition"

# idealized curve, following B&N's ZP -> LP, ZP -> HP
k_low = 1.38
k_high = 2.72
k_max = 500
range_ = [0, 100]
num_pts = 100
tickvals = list(range(range_[0], range_[1] + 1, 25))

x = np.linspace(0.0, 1.0, num_pts)

fig = go.Figure()
for k, x0, ax, ay in ((1, .66, 20, 20), (k_low, .44, 30, 30), (k_high, .22, 40, 40), (k_max, 0.02, 25, 25)):
    fig.add_scatter(
        x=100 * x,
        y=100 * (1 - boothroyd_func(1 - x, k)),
        showlegend=False,
        line=dict(color='black')
    )
    fig.add_annotation(
        x=100 * x0, y=100 * (1 - boothroyd_func(1 - x0, k)),
        text=f"<i>k</i> = {k}",
        showarrow=True,
        arrowhead=1,
        ax=ax, ay=ay,
        arrowcolor="black",
    )
fig.update_layout(
    width=COL_SIZE_PX // 1.6, height=COL_SIZE_PX // 1.6,
    margin=dict(l=0, r=0, t=0, b=0),
    font=FONT,
    xaxis=dict(title=f"Accuracy {P_I} (%)", range=range_, tickvals=tickvals, mirror=True, showline=True),
    yaxis=dict(title=f"Accuracy {P_C} (%)", range=range_, tickvals=tickvals, mirror=True, showline=True),
)
fig.update_xaxes(title_standoff=5)
fig.update_yaxes(title_standoff=5)
fig.write_image(format_fig_path('bn'))

In [10]:
# actual fits and plots

print('merging')
df = perp_df.loc[perp_df['perplm'] == PERPLM].merge(uttwer_df, on=['utt', 'part', 'len'])
df = df.assign(mdl=(df['mdl'] + '_' + df['latlm'] + '_' + df['reslm']).map(MDL_LATLM_RESLM2RENAME), part=df['part'].map(PART2RENAME))
df = agg_mean_by_lens(df, 'len', 'wer',  ['snr', 'part', 'mdl', 'ent_bin'])
df['lwer'] = np.log(df['wer'])
df['Wer'] = 100 * df['wer']
df['acc'] = 1 - df['wer']
df['Acc'] = 100 * df['acc']
df = df.loc[np.isfinite(df['snr'])].dropna()

mask = df['ent_bin'] == BIN_NAMES[-1]
df, df_out = df.loc[~mask], df.loc[mask]
df = df.merge(df_out, on=['snr', 'part', 'mdl'], suffixes=('_in', '_out'))
df = df.assign(ent_bin_in=df.ent_bin_in.cat.remove_unused_categories())
df['k'] = df['lwer_in'] / df['lwer_out']

fits = []
print('fitting all')
fit = boothroyd_fit(df)
fit['mdl'], fit['part'] = 'all', 'all'
# go.Figure().add_histogram(x=fit.iloc[0]['bootstrap']).show()

fits.append(fit)

for part in PART_RENAMES:
    print(f'fitting {part}')
    fit = boothroyd_fit(df.loc[df['part'] == part])
    fit['mdl'], fit['part'] = 'all', part
    fits.append(fit)

for mdl in MDL_RENAMES:
    print(f'fitting {mdl}')
    fit = boothroyd_fit(df.loc[df['mdl'] == mdl])
    fit['mdl'], fit['part'] = mdl, 'all'
    fits.append(fit)
    for part in PART_RENAMES:
        print(f'fitting {mdl} x {part}')
        fit = boothroyd_fit(df.loc[(df['mdl'] == mdl) & (df['part'] == part)])
        fit['mdl'], fit['part'] = mdl, part
        fits.append(fit)

fits = pd.concat(fits)
fits['coef+ci'] = fits.apply(lambda row: f"{row['coef']:.2f} [{row['ci_low']:.2f}, {row['ci_high']:.2f}]", axis=1)
fits = fits.pivot(values='coef+ci', columns='name', index=['mdl', 'part'])

display(fits)
# print(fits.swaplevel(0, 1).to_latex())

range_ = [0, 100]
x_interp = np.linspace(*range_, 100)
tickvals = list(range(range_[0], range_[1] + 1, 25))
colours = px.colors.qualitative.Vivid

for mdl in MDL_RENAMES:
    df_ = df.loc[df['mdl'] == mdl]
    fig = px.scatter(
        df_, x='Acc_out', y='Acc_in',
        # symbol='ent_bin_in', color='part',
        symbol='part', color='ent_bin_in',
        color_discrete_sequence=colours,
        symbol_sequence=list(range(100)),
        category_orders=dict(ent_bin_in=BIN_NAMES, part=PART_RENAMES),
    )
    for i, trace in enumerate(fig.data):
        if trace.mode == 'markers':
            name = trace.name.split(', ')
            if name[0] in BIN_NAMES[1:]:
                trace['name'] = ''
                trace['showlegend'] = False
            else:
                trace['name'] = name[1]
    for bin in range(len(BIN_NAMES) - 1):
        fig.add_scatter(
            y=[None], mode='markers',
            # marker=dict(color='black', symbol=bin),
            marker=dict(color=colours[bin], symbol=0),
            legend="legend2",
            name=BIN_NAMES[bin],
        )
        k = float(fits.loc[mdl, 'LS-C'][BIN_NAMES[bin]].split(" ")[0])
        fig.add_scatter(
            x=x_interp,
            y=(1 - boothroyd_func(1 - x_interp / 100, k)) * 100,
            line=dict(color="black", width=1),
            showlegend=False,
    )
    fig.add_scatter(
        x=range_,
        y=range_,
        mode='lines',
        line=dict(color="grey", width=1, dash='dash'),
        showlegend=False,
    )
    fig.update_traces(marker=dict(line_width=0.5, size=4))
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0, pad=0),
        font=FONT,
        legend=dict(
            title_text='Partition',
            yanchor="bottom",
            y=0.01,
            xanchor="right",
            x=0.99,
            bgcolor='rgba(0,0,0,0)',
        ),
        legend2=dict(
            title_text="In-context bin",
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01,
            bgcolor='rgba(0,0,0,0)',
        ),
        xaxis=dict(title=f"Accuracy {P_I} (%)", range=range_, tickvals=tickvals, mirror=True, showline=True),
        yaxis=dict(title=f"Accuracy {P_C} (%)", range=range_, tickvals=tickvals, mirror=True, showline=True),
    )
    fig.update_xaxes(title_standoff=5)
    fig.update_yaxes(title_standoff=5)
    fig.write_image(format_fig_path('acc-ratio', mdl=mdl), width=int(COL_SIZE_PX * 0.75), height=int(COL_SIZE_PX * 0.75), scale=1)
    # fig.show()

    fig = px.scatter(
        df_.loc[df_['part'].isin({'LS-C', 'CL-P'})], x='Wer_out', y='k', 
        symbol_sequence=list(range(100)),
        category_orders=dict(part=('LS-C', 'CL-P'), ent_bin_in=BIN_NAMES),
        color='part', symbol='ent_bin_in',
        color_discrete_sequence=colours,
    )
    for i, trace in enumerate(fig.data):
        if trace.mode == 'markers':
            name = trace.name.split(', ')
            if name[1] in BIN_NAMES[1:]:
                trace['name'] = ''
                trace['showlegend']=False
            else:
                trace['name'] = name[0]
    for bin in range(len(BIN_NAMES) - 1):
        fig.add_scatter(
            y=[None], mode='markers',
            marker=dict(color='black', symbol=bin),
            legend="legend2",
            name=BIN_NAMES[bin],
        )
        k = float(fits.loc[mdl, 'LS-C'][BIN_NAMES[bin]].split(" ")[0])
        fig.add_scatter(
            x=[0, 101],
            y=[k, k],
            mode="lines",
            line=dict(color="black", width=1),
            showlegend=False,
        )
    fig.update_traces(marker=dict(line_width=0.5, size=4))
    fig.update_layout(
        margin=dict(l=0, r=0, t=0, b=0),
        font=FONT,
        legend=dict(
            title_text='Partition',
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01,
            bgcolor='rgba(0,0,0,0)',
        ),
        legend2=dict(
            title_text=f"In-context bin",
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.37,
            bgcolor='rgba(0,0,0,0)',
        ),
        xaxis=dict(title=f"Error rate {E_I} (%)", range=[0, 101], tickvals=tickvals, mirror=True, showline=True),
        yaxis=dict(title=f"Pointwise <i>k</i>", range=[1, 2.5], tickvals=[1, 1.5, 2, 2.5], mirror=True, showline=True),
    )
    fig.update_xaxes(title_standoff=5)
    fig.update_yaxes(title_standoff=5)
    fig.write_image(format_fig_path('point-k', mdl=mdl), width=int(COL_SIZE_PX * 0.75), height=int(COL_SIZE_PX * 0.75), scale=1)
    # fig.show()

merging
fitting all
fitting CL-P
fitting CL-R
fitting LS-O
fitting LS-C
fitting W2V2-L
fitting W2V2-L x CL-P
fitting W2V2-L x CL-R
fitting W2V2-L x LS-O
fitting W2V2-L x LS-C
fitting W2V2-B
fitting W2V2-B x CL-P
fitting W2V2-B x CL-R
fitting W2V2-B x LS-O
fitting W2V2-B x LS-C
fitting TDNN-3
fitting TDNN-3 x CL-P
fitting TDNN-3 x CL-R
fitting TDNN-3 x LS-O
fitting TDNN-3 x LS-C
fitting TDNN-4
fitting TDNN-4 x CL-P
fitting TDNN-4 x CL-R
fitting TDNN-4 x LS-O
fitting TDNN-4 x LS-C
fitting GMM-3
fitting GMM-3 x CL-P
fitting GMM-3 x CL-R
fitting GMM-3 x LS-O
fitting GMM-3 x LS-C


Unnamed: 0_level_0,name,HP,LP
mdl,part,Unnamed: 2_level_1,Unnamed: 3_level_1
GMM-3,CL-P,"1.66 [1.65, 1.68]","1.28 [1.26, 1.29]"
GMM-3,CL-R,"1.50 [1.48, 1.52]","1.19 [1.18, 1.20]"
GMM-3,LS-C,"1.34 [1.33, 1.36]","1.21 [1.20, 1.22]"
GMM-3,LS-O,"1.43 [1.42, 1.44]","1.24 [1.23, 1.25]"
GMM-3,all,"1.43 [1.42, 1.45]","1.22 [1.21, 1.23]"
TDNN-3,CL-P,"1.58 [1.57, 1.59]","1.22 [1.22, 1.23]"
TDNN-3,CL-R,"1.44 [1.42, 1.46]","1.17 [1.17, 1.18]"
TDNN-3,LS-C,"1.31 [1.30, 1.33]","1.19 [1.18, 1.20]"
TDNN-3,LS-O,"1.33 [1.31, 1.34]","1.17 [1.16, 1.19]"
TDNN-3,all,"1.41 [1.39, 1.43]","1.19 [1.18, 1.19]"
























## Klakow and Peters

In [11]:
# Klakow and Peters (2002). "Testing the correlation of word error rate and perplexity"
# "... slope a is smaller for tasks that are acoustically more challenging. Hence on
# those tasks larger reductions in PP are needed to obtain a given reduction in WER." 

print('merging')
df = perp_df.loc[perp_df['perplm'] == PERPLM].merge(uttwer_df, on=['utt', 'part', 'len'])
df = df.assign(mdl=(df['mdl'] + '_' + df['latlm'] + '_' + df['reslm']).map(MDL_LATLM_RESLM2RENAME), part=df['part'].map(PART2RENAME))

df_ent = agg_mean_by_lens(df, 'len', 'ent', ['part', 'ent_bin'])
print('entropy by bin and partion')
display(df_ent.round(3))

df = agg_mean_by_lens(df, 'len', 'wer',  ['snr', 'part', 'mdl', 'ent_bin'])
df['lwer'] = np.log(df['wer'])
df['Wer'] = 100 * df['wer']
df = df.loc[np.isfinite(df['snr'])].dropna()

snrs = df['snr'].unique()
snrs.sort()
snr_mini, snr_midi, snr_maxi = 0, 16, len(snrs) - 1
df = df.loc[(df['snr'] >= snrs[snr_mini]) & (df['snr'] <= snrs[snr_maxi])]

range_ = [0, 100]
tickvals = list(range(range_[0], range_[1] + 1, 25))

fits = []
for part in PART_RENAMES:
    x = df_ent.loc[df_ent['part'] == part, 'ent']
    for mdl in MDL_RENAMES:
        df_ = df.loc[(df['part'] == part) & (df['mdl'] == mdl)]

        fig = px.bar(
            df_,
            x='ent_bin',
            y='Wer',
            color='snr',
            barmode='overlay',
            color_continuous_scale="viridis",
            opacity=1.0,
            category_orders=dict(ent_bin=BIN_NAMES),
            labels=dict(snr="SNR (dB)"),
        )

        fits = []
        for snr_idx in (snr_mini, snr_midi, snr_maxi):
            snr = snrs[snr_idx]
            y = df_.loc[df_['snr'] == snr, "lwer"]
            fit = pg.linear_regression(x, y, True)
            fit['snr'] = snr
            fit['part'] = part
            fit['mdl'] = mdl
            fits.append(fit)
            a = fit.loc[fit['names'] == 'ent', 'coef'].iloc[0]
            b = np.exp(fit.loc[fit['names'] == 'Intercept', 'coef'].iloc[0])
            y_interp = klakow_func(np.exp(x), a, b) * 100
            fig.add_scatter(
                x=BIN_NAMES,
                y=y_interp,
                showlegend=False,
                mode='markers+lines',
                marker=dict(color='black', size=4),
                line=dict(color='black', width=1),
            )

        fig.update_layout(
            margin=dict(l=0, r=0, t=0, b=0, pad=0),
            font=FONT,
            xaxis=dict(title=f"Context bin"),
            yaxis=dict(title=f"Error rate {E_C} (%)", range=range_, tickvals=tickvals, mirror=True, showline=True),
        )
        fig.update_xaxes(title_standoff=5)
        fig.update_yaxes(title_standoff=5)
        fig.write_image(format_fig_path("kp-over-snr", part=part, mdl=mdl),  width=int(COL_SIZE_PX * 0.75), height=int(COL_SIZE_PX * 0.75), scale=1)
        
display(pd.concat(fits).round(3))


merging
entropy by bin and partion


Unnamed: 0,index,part,ent_bin,ent
0,0,CL-P,HP,4.097
1,1,CL-P,LP,5.042
2,2,CL-P,ZP,6.1
3,3,CL-R,HP,4.078
4,4,CL-R,LP,5.088
5,5,CL-R,ZP,6.125
6,6,LS-C,HP,4.054
7,7,LS-C,LP,4.982
8,8,LS-C,ZP,6.025
9,9,LS-O,HP,4.048



invalid value encountered in scalar divide


invalid value encountered in divide


invalid value encountered in scalar divide


invalid value encountered in divide



Unnamed: 0,names,coef,se,T,pval,r2,adj_r2,CI[2.5%],CI[97.5%],snr,part,mdl
0,Intercept,-0.006,0.001,-6.191,0.102,0.657,0.315,-0.018,0.006,-10.0,LS-C,GMM-3
1,ent,0.0,0.0,1.385,0.398,0.657,0.315,-0.002,0.003,-10.0,LS-C,GMM-3
0,Intercept,-0.439,0.009,-51.053,0.012,0.998,0.997,-0.549,-0.33,6.0,LS-C,GMM-3
1,ent,0.042,0.002,24.578,0.026,0.998,0.997,0.02,0.063,6.0,LS-C,GMM-3
0,Intercept,-3.457,0.126,-27.549,0.023,0.993,0.986,-5.052,-1.863,30.0,LS-C,GMM-3
1,ent,0.291,0.025,11.809,0.054,0.993,0.986,-0.022,0.605,30.0,LS-C,GMM-3
