## Preamble

In [54]:
# load packages

%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import pingouin as pg
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

from itertools import product
from scipy.optimize import curve_fit
from analysis_utils import *
from sklearn.metrics import r2_score

pio.renderers.default = "vscode"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# load tables

def _load_and_preview(title, reader):
    """Print ``title``, call ``reader()`` and display the head of the result.

    Returns whatever ``reader`` returned so the caller can bind it to a name.
    """
    print(title)
    frame = reader()
    display(frame.head())
    return frame


# the reader functions come from the star import of analysis_utils
perp_df = _load_and_preview("perp_df contents", read_perps_as_df)
text_df = _load_and_preview("text_df contents", read_text_as_df)
wer_df = _load_and_preview("wer_df contents", read_best_wers_as_df)
uttwer_df = _load_and_preview("uttwer_df contents", read_best_uttwers_as_df)

## Perplexity

In [None]:
print("entropy/perplexity by partition and LM")
# agg_mean_by_lens is defined in analysis_utils; presumably it averages 'ent'
# within each (part, perplm) group using text_df['len'] as weights -- TODO confirm.
df = agg_mean_by_lens(perp_df, text_df['len'], 'ent', ['part', 'perplm'])
# perplexity is the exponential of the entropy
df = df.assign(perp=np.exp(df['ent']))
# one row per partition, one (statistic, LM) column pair per LM
df = df.pivot(index='part', columns='perplm', values=['ent', 'perp'])
display(df.round(2))


In [None]:
print('distribution of per-utt entropy by partition and LM')
# One horizontal violin (with an embedded box plot) per partition, coloured by LM.
# The keys of `labels` have to be names of columns used in the figure; the colour
# column is 'perplm', so that is the key that renames the legend title to "LM".
fig = px.violin(
    perp_df, x='ent', y='part', color='perplm',
    box=True,
    labels=dict(ent='Entropy (nats)', part='Partition', perplm="LM"),
    width=600, height=800,
)
fig.show()

In [None]:
print('Test of normality of entropy given LM')
ent_normality = pg.normality(perp_df, dv='ent', group='perplm', method='normaltest')
display(ent_normality.round(3))

print("pairwise spearman correlations of entropy across LMs")
# wide layout: one row per utterance, one entropy column per LM
df = perp_df.pivot(index='utt', columns='perplm', values='ent')
ent_corr = pg.pairwise_corr(
    df,
    columns=df.columns,
    alternative='greater',
    method='spearman',
)
display(ent_corr.round(3))

print("scatter plot matrix of per-utterance entropy of each LM")
fig = px.scatter_matrix(df, dimensions=df.columns, opacity=0.1)
fig.show()

In [None]:
print("per-utterance perplexity vs. rank by LM")

# rank each utterance's perplexity among the rows that share its LM;
# assign() works on a copy, so perp_df itself is left untouched
df = perp_df.assign(rank=perp_df.groupby(['perplm'])['perp'].rank())

fig = px.scatter(df, x='rank', y='perp', color='perplm', log_y=True)
fig.show()

## WER

In [None]:
part = 'dev-clean'
latlm = reslm = 'tgsmall'
mdl = 'tdnn_1d_sp'
desc = f"({part} partition, {mdl} model, {latlm} lattice LM, and {reslm} rescoring lm)"
# SNR columns shown in the scatter matrix; this list also determines how many
# x/y axes the matrix uses
plot_snrs = [5, 10, 20, 30]

# keep only rows that have an SNR for the selected partition, model and LMs
df = uttwer_df.loc[
    (~uttwer_df['snr'].isnull()) &
    (uttwer_df['latlm'] == latlm) &
    (uttwer_df['reslm'] == reslm) &
    (uttwer_df['part'] == part) &
    (uttwer_df['mdl'] == mdl)
].copy()
# integer SNRs become the integer column labels of the pivot below
df['snr'] = df['snr'].astype('int')

with pd.option_context('display.max_rows', 10):
    print(f"test of normality of per-utterance WERs given SNR {desc}")
    display(pg.normality(df, dv='wer', group='snr', method='normaltest').round(3).sort_index())


    print(f"spearman correlation of WERs across SNRs {desc}")
    # from here on df is wide: one row per utterance, one WER column per SNR
    df = df.pivot(values='wer', index='utt', columns='snr')
    display(pg.pairwise_corr(df, columns=df.columns, alternative='greater', method='spearman').round(3).sort_index())

print(f"scatter plot matrix of per-utterance WERs of select SNRs {desc}")
fig = px.scatter_matrix(df, dimensions=plot_snrs, opacity=0.1)
# Pin every panel to the same range. Only one x/y axis pair per plotted dimension
# is needed, so the loop is bounded by len(plot_snrs) rather than by the number
# of SNR columns in df.
fig.update_layout({f"xaxis{i + 1}": dict(range=[-0.1, 1]) for i in range(len(plot_snrs))})
fig.update_layout({f"yaxis{i + 1}": dict(range=[-0.1, 1]) for i in range(len(plot_snrs))})
fig.show()

In [None]:
# Zhang et al (2023) "Estimate the noise effect on automatic speech recognition
# accuracy for mandarin by an approach associating articulation index"
# FIXME(sdrobert): the fit is very bad if we use eq. 12

latlm = 'tgsmall'
reslm = 'tgsmall'
part = 'dev-clean'
desc = f"({part} partition, {latlm} lattice LM, and {reslm} rescoring lm)"
num_points = 100

df = wer_df.loc[
    (wer_df['latlm'] == latlm) &
    (wer_df['reslm'] == reslm) &
    (wer_df['part'] == part)
].copy()

# Split the rows: those with an SNR are the points to fit, those without one are
# kept (model + accuracy only) in Ainvs.
idx = df['snr'].isnull()
df, Ainvs = df.loc[~idx], df.loc[idx, ['mdl', 'acc']]
# pad the plotted SNR range by one unit on either side
snr_min = df['snr'].min() - 1
snr_max = df['snr'].max() + 1
x_interp = np.linspace(snr_min, snr_max, num_points)

mdls = df['mdl'].unique()
# Every model needs rows both with and without an SNR. Comparing sets (instead of
# an element-wise array comparison) still fails cleanly with an AssertionError
# when the two sides list a different number of models.
assert set(mdls) == set(Ainvs['mdl'].unique()), \
    "models with SNR rows and models with SNR-less rows do not match"
# spacing (in x_interp samples) between per-model annotations on the fitted curves
ratio = num_points // (len(mdls) + 2)

def zhang_func(x : np.ndarray, A : float, B : float, C : float) -> np.ndarray:
    """Evaluate the three-parameter curve ``1 / (exp(-(x + B) / C) + A)``.

    Args:
        x: point(s) at which to evaluate the curve.
        A: constant added to the exponential term in the denominator.
        B: offset added to ``x`` before scaling.
        C: divisor applied to the shifted ``x`` inside the exponential.

    Returns:
        The curve value(s), with the same shape as ``x``.
    """
    decay = np.exp(-(x + B) / C)
    return 1 / (decay + A)


fit = []
fig = go.Figure()
palette = px.colors.qualitative.Plotly
for mdl_idx, mdl in enumerate(mdls):
    # wrap around the palette so more models than colours cannot raise IndexError
    colour = palette[mdl_idx % len(palette)]
    df_ = df.loc[df['mdl'] == mdl]
    # accuracy of this model on its SNR-less row; its reciprocal seeds parameter A
    Ainv = Ainvs.loc[Ainvs['mdl'] == mdl, 'acc'].iloc[0]
    A_init = 1 / Ainv
    # plain float ndarrays keep curve_fit and the numpy reductions below simple
    x = df_['snr'].to_numpy(dtype=np.float64)
    y = df_['acc'].to_numpy(dtype=np.float64)
    (A, B, C), _ = curve_fit(
        zhang_func, x, y,
        p0=(A_init, 0, 1),
        # A >= 1 and C >= 0.01; B is unconstrained
        bounds=([1, -np.inf, 0.01], [np.inf, np.inf, np.inf]),
    )
    # coefficient of determination: 1 - SS_res / SS_tot
    residual = y - zhang_func(x, A, B, C)
    r2 = 1 - np.square(residual).sum() / np.square(y - y.mean()).sum()
    fit.append(dict(mdl=mdl, A=A, B=B, C=C, r2=r2))
    # evaluate the fitted curve with the same function that was fitted
    y_interp = zhang_func(x_interp, A, B, C)
    fig.add_scatter(
        x=x, y=y * 100,
        name=mdl, mode='markers',
        marker=dict(color=colour),
    )
    fig.add_scatter(
        x=x_interp, y=y_interp * 100,
        mode='lines',
        opacity=0.5,
        showlegend=False,
        line=dict(color=colour),
    )
    # stagger the annotations along the curves so they do not sit on top of each other
    anchor = ratio * (mdl_idx + 1)
    fig.add_annotation(
        x=x_interp[anchor], y=y_interp[anchor] * 100,
        text=f"A={A:.02f},B={B:.02f},C={C:.02f}",
        showarrow=True,
        font=dict(color=colour),
    )
print(f"Zhang et al fits by model {desc}")
display(pd.DataFrame.from_records(fit).round(3))

print(f"accuracy (inv. WER) by SNR across models w/ Zhang et al fits {desc}")
fig.update_layout(
    xaxis_title="SNR (dB)",
    yaxis_title="accuracy (%)",
    legend_title="model",
    xaxis_tickformat='d',
    yaxis_tickformat='d',
    xaxis_range=[snr_min, snr_max],
    yaxis_range=[0, 100],
)
fig.show()


## Perplexity vs. WER

In [None]:
# boothroyd's k

mdl = 'tdnn_1d_sp'
latlm = 'tgsmall'
reslm = 'tgsmall'
perplm = 'tgsmall'
binlm = 'tgsmall'
part = 'dev-clean'
binpart = 'dev-other'
num_bins = 5
num_points = 100
add_intercept = False
x_interp = np.linspace(0.01, 100, num_points)
# spacing (in x_interp samples) between per-bin annotations on the fitted curves
ratio = num_points // (num_bins + 2)
print(
    f"mdl {mdl}, part {part} lattice lm {latlm}, rescore lm {reslm} perplexity LM "
    f"{perplm}, bin part {binpart}, bin LM {binlm}"
)

# rows to analyse vs. rows that define the bin boundaries
eval_rows = (perp_df['perplm'] == perplm) & (perp_df['part'] == part)
binning_rows = (perp_df['perplm'] == binlm) & (perp_df['part'] == binpart)

df = perp_df.loc[eval_rows].copy()
# bin_series (analysis_utils) returns a pair; the second item is taken as the
# boundaries here and the first item as the per-row bins on the next call
bounds = bin_series(perp_df.loc[binning_rows, 'perp'], num_bins)[1]
df['perp_bin'] = bin_series(df['perp'], bounds, by_rank=False, fmt="{:.0f}")[0]
bin_cats = df['perp_bin'].dtype.categories

print("mean entropy by bin and ratio (highest/bin)")
df_ent = agg_mean_by_lens(df, text_df['len'], 'ent', 'perp_bin')
# entropy of the last bin category divided by the entropy of every bin
last_bin_ent = df_ent.loc[df_ent['perp_bin'] == bin_cats[num_bins - 1], 'ent'].iloc[0]
df_ent['ratio'] = last_bin_ent / df_ent['ent']
display(df_ent.round(3))

# attach the per-utterance WERs of the selected model / LMs
wer_rows = (
    (uttwer_df['reslm'] == reslm) &
    (uttwer_df['latlm'] == latlm) &
    (uttwer_df['mdl'] == mdl)
)
df = df.merge(uttwer_df.loc[wer_rows], on=['utt', 'part'])

# one row per SNR, one WER column per perplexity bin
df = agg_mean_by_lens(df, text_df['len'], 'wer', ['snr', 'perp_bin'])
df = df.pivot(index='snr', columns='perp_bin', values='wer')

fig_acc, fig_loge = go.Figure(), go.Figure()
# the last bin category is the reference ("out-of-context") series for every bin
x = df[bin_cats[num_bins - 1]]
palette = px.colors.qualitative.Plotly

fits = []
# NOTE: the loop variable is deliberately not called `bin`, which would shadow the
# builtin for every later cell run in the same kernel
for bin_idx in range(num_bins):
    y = df[bin_cats[bin_idx]]
    # log(y) = k * log(x) [+ log(intercept)], i.e. y = intercept * x ** k
    fit : pd.DataFrame = pg.linear_regression(np.log(x), np.log(y), add_intercept=add_intercept)
    iv_name, int_name = f"iv {bin_cats[bin_idx]}", f"int {bin_cats[bin_idx]}"
    fit['names'] = fit['names'].map({bin_cats[num_bins - 1]: iv_name, "Intercept": int_name})
    if add_intercept:
        int_ = np.exp(fit.loc[fit['names'] == int_name, 'coef'].iloc[0])
    else:
        int_ = 1.0
    fits.append(fit)
    k = fit.loc[fit['names'] == iv_name, 'coef'].iloc[0]
    # wrap around the palette so more bins than colours cannot raise IndexError
    colour = palette[bin_idx % len(palette)]
    # fitted curve in accuracy space (percent): error_in = int_ * error_out ** k
    y_interp = 100 * (1 - int_ * (1 - x_interp / 100) ** k)
    interp_name = f"k={k:.02f}" + (f", i={int_:.02f}" if add_intercept else "")
    # stagger the annotations along the curves so they do not overlap
    anchor = ratio * (bin_idx + 1)
    fig_acc.add_scatter(
        x=100 - x * 100, y=100 - y * 100,
        name=bin_cats[bin_idx], mode='markers',
        marker=dict(color=colour),
    )
    fig_acc.add_scatter(
        x=x_interp, y=y_interp,
        name=interp_name,
        showlegend=False, mode='lines', opacity=0.5,
        line=dict(color=colour),
    )
    fig_acc.add_annotation(
        x=x_interp[anchor], y=y_interp[anchor],
        text=interp_name,
        showarrow=True,
        opacity=1,
        font=dict(color=colour),
    )
    fig_loge.add_scatter(
        x=100 * x, y=100 * y,
        name=bin_cats[bin_idx], mode='markers',
        marker=dict(color=colour),
    )
    fig_loge.add_scatter(
        x=100 - x_interp, y=100 - y_interp,
        showlegend=False, mode='lines', opacity=0.5,
        name=interp_name,
        line=dict(color=colour),
    )
    # annotation coordinates on a log axis are given as log10 of the data value
    fig_loge.add_annotation(
        x=np.log10(100 - x_interp[anchor]), y=np.log10(100 - y_interp[anchor]),
        text=interp_name,
        showarrow=True,
        opacity=1,
        font=dict(color=colour),
    )
print("Boothroyd & Nittrouer model fits")
display(pd.concat(fits))

print("in-context vs. out-of-context accuracy and B & N fits")
fig_acc.update_layout(
    xaxis_title="out-of-context accuracy (%)",
    yaxis_title="in-context accuracy (%)",
    legend_title="in-context perp",
    xaxis_tickformat='d',
    yaxis_tickformat='d',
    xaxis_range=[0, 100],
    yaxis_range=[0, 100],
    width=800, height=400,
)
fig_acc.show()
print("in-context vs. out-of-context error rates and B & N fits")
fig_loge.update_layout(
    xaxis_title="out-of-context error rate (%)",
    yaxis_title="in-context error rate (%)",
    legend_title="in-context perp",
    width=800, height=400,
)
# Lower axis limit: one percentage point below the smallest error rate of the
# first bin. If that is not positive (or is NaN) its log10 is unusable as an axis
# bound, so fall back to half the smallest error rate (floored at 0.01%).
min_err_pct = df[bin_cats[0]].min() * 100
lower_pct = min_err_pct - 1
if not lower_pct > 0:
    lower_pct = max(min_err_pct / 2, 1e-2)
lims = np.log10(lower_pct), np.log10(100)
fig_loge.update_xaxes(type='log', range=lims)
fig_loge.update_yaxes(type='log', range=lims)
fig_loge.show()


In [None]:
# wer by perp

mdl = 'tdnn_1d_sp'
latlm = reslm = perplm = 'tgsmall'
num_points = 100
part = 'dev-clean'
print(
    f"mdl {mdl}, partition {part}, lattice LM {latlm}, rescore LM {reslm}, "
    f"perplexity LM {perplm}"
)

# per-utterance perplexities of the chosen LM/partition joined with the
# per-utterance WERs of the chosen model and LMs
df = perp_df.loc[(perp_df['perplm'] == perplm) & (perp_df['part'] == part)]
df = df.merge(uttwer_df.loc[
    (uttwer_df['reslm'] == reslm) &
    (uttwer_df['latlm'] == latlm) &
    (uttwer_df['mdl'] == mdl)
], on=['utt', 'part'])
df = df.loc[df['snr'].isnull()]  # without noise
# clip both axes to the 5%-95% quantile range of the data
ymin, ymax = df['wer'].quantile(0.05), df['wer'].quantile(0.95)
xmin, xmax = df['perp'].quantile(0.05), df['perp'].quantile(0.95)
perp_interp = np.linspace(xmin, xmax, num_points)

print("per-utterance WER by perplexity")
fig = px.scatter(df, x='perp', y='wer')
# a log axis takes its range as log10 of the data values
fig.update_xaxes(type='log', range=[np.log10(xmin), np.log10(xmax)])
fig.update_yaxes(range=[ymin, ymax])
fig.show()

In [None]:
# Klakow and Peters (2002). "Testing the correlation of word error rate and perplexity"
# "... slope a is smaller for tasks that are acoustically more challenging. Hence on
# those tasks larger reductions in PP are needed to obtain a given reduction in WER."

# selection used by the rest of this cell
mdl = 'tdnn_1d_sp'
latlm = 'tgsmall'
reslm = 'tgsmall'
perplm = 'tgsmall'
binlm = 'tgsmall'
part = 'dev-clean'
binpart = 'dev-clean'
num_bins = 5
num_points = 100
print(
    f"mdl {mdl}, part {part} lattice lm {latlm}, rescore lm {reslm} "
    f"perplexity LM {perplm}, bin part {binpart}, bin LM {binlm}"
)

def klakow_func(perp : np.ndarray, a : float, b: float) -> np.ndarray:
    """Evaluate the power law ``b * perp ** a``.

    Args:
        perp: value(s) raised to the power ``a``.
        a: exponent.
        b: multiplicative coefficient.

    Returns:
        ``b * perp ** a``, with the same shape as ``perp``.
    """
    powered = perp ** a
    return b * powered

df = perp_df.loc[(perp_df['perplm'] == perplm) & (perp_df['part'] == part)].copy()
# bin boundaries come from (binlm, binpart); the rows analysed are then binned
# against those fixed boundaries
bounds = bin_series(perp_df.loc[(perp_df['perplm'] == binlm) & (perp_df['part'] == binpart), 'perp'], num_bins)[1]
bins = bin_series(df['perp'], bounds, by_rank=False, fmt="{:.0f}")[0]
df['perp_bin'] = bins
# regressor: aggregated entropy per perplexity bin
x = agg_mean_by_lens(df, text_df['len'], 'ent', 'perp_bin')['ent']

df = df.merge(uttwer_df.loc[
    (uttwer_df['reslm'] == reslm) &
    (uttwer_df['latlm'] == latlm) &
    (uttwer_df['mdl'] == mdl)
], on=['utt', 'part'])
df = agg_mean_by_lens(df, text_df['len'], 'wer', ['snr', 'perp_bin', 'mdl'])
df = df.reset_index()

snrs = df['snr'].unique()
snrs.sort()
fits = []
curve_params_list = []
for snr in snrs:
    snr_mask = df['snr'] == snr
    # NOTE(review): x and y are paired by position, which presumably relies on
    # the rows of each SNR being ordered by perp_bin like x -- confirm against
    # agg_mean_by_lens.
    y = np.log(df.loc[snr_mask, "wer"])
    # log(WER) = a * ent + log(b)  <=>  WER = b * perp ** a with perp = exp(ent)
    fit : pd.DataFrame = pg.linear_regression(x, y)
    curve_params_list.append({
        "snr": snr,
        "a": fit.loc[fit['names'] == 'ent', 'coef'].iloc[0],
        "b": np.exp(fit.loc[fit['names'] == 'Intercept', 'coef'].iloc[0]),
    })
    iv_name, int_name = f"iv {int(snr)}", f"int {int(snr)}"
    fit['names'] = fit['names'].map({'ent': iv_name, "Intercept": int_name})
    fits.append(fit)
print("regression fits for Klakow and Peters models")
display(pd.concat(fits))

# positions in the sorted `snrs` array of the lowest, middle and highest SNR whose
# fits are drawn; this needs len(snrs) > 16 for the indexing below to succeed
snr_mini, snr_midi, snr_maxi = 10, 16, len(snrs) - 10
# .copy() so the in-place scaling below modifies an independent frame instead of
# a slice of the previous df (avoids SettingWithCopyWarning)
df = df.loc[(df['snr'] >= snrs[snr_mini]) & (df['snr'] <= snrs[snr_maxi])].copy()
df['wer'] *= 100

print("WER by (PP, SNR) with select K & P fits")
fig = px.bar(df, x='perp_bin', y='wer', color='snr', barmode='overlay', color_continuous_scale="viridis", opacity=1.0)
for dict_ in (curve_params_list[snr_mini], curve_params_list[snr_midi], curve_params_list[snr_maxi]):
    # predicted WER (percent) at the aggregated perplexity of each bin
    y = klakow_func(np.exp(x), dict_['a'], dict_['b']) * 100
    interp_name = f"a={dict_['a']:.03f}, b={dict_['b']:.03f} WER ∈ [{y.min():.02f},{y.max():.02f}]"
    fig.add_scatter(
        x=bins.dtype.categories,
        y=y,
        showlegend=False,
        name=interp_name,
        mode='markers+lines',
        marker=dict(color='red'), line=dict(color='red'))
    fig.add_annotation(
        x=bins.dtype.categories[0], y=y.iloc[0],
        text=interp_name,
        showarrow=True,
        opacity=1,
        font=dict(color="black"),
        bgcolor='white',
    )
fig.update_layout(
    yaxis_range=[0, 100]
)
fig.show()

print("K & P model parameters by SNR")
df = pd.DataFrame.from_records(curve_params_list)
# long format: one row per (snr, parameter) pair
df = pd.melt(df, ['snr'], ['a', 'b'], var_name='param', value_name='val')
fig = px.scatter(df, x='snr', y='val', color='param')
fig.update_layout(yaxis_range=[0, 1])
fig.show()

In [88]:
# boothroyd prediction
#
# Two passes over every (mdl, part, perplm, latlm, reslm) configuration:
#   1. train: bin that configuration's utterances by perplexity and, for every
#      pair of bins (in_bin < out_bin), fit log(WER_in) = k * log(WER_out)
#      across SNRs (no intercept).
#   2. test: reuse each training configuration's bin boundaries and k on every
#      *other* configuration and score the prediction with r2_score.
num_bins = 3
mdls = ('tdnn_1d_sp',)
parts = ('dev-clean', 'dev-other')
perplms = ('tgsmall', 'fglarge')
latlms = ('tgsmall',)
reslms = ('tgsmall',)
snrs = uttwer_df.loc[~uttwer_df['snr'].isnull(), 'snr'].unique()
snrs.sort()

train_fits = dict()    # (*train_key, in_bin, out_bin) -> (k, r2)
train_bounds = dict()  # train_key -> bin boundaries returned by bin_series
test_fits = dict()     # (*train_key, in_bin, out_bin, *test_key) -> r2 on test data
for train in (True, False):
    for train_key in product(mdls, parts, perplms, latlms, reslms):
        if train:
            keys = [train_key]
        else:
            keys = product(mdls, parts, perplms, latlms, reslms)
            bounds = train_bounds[train_key]
        for key in keys:
            mdl, part, perplm, latlm, reslm = key
            df = perp_df.loc[(perp_df['perplm'] == perplm) & (perp_df['part'] == part)].copy()
            if train:
                bins, train_bounds[train_key] = bin_series(df['perp'], num_bins)
            else:
                # a configuration is never tested on itself
                if key == train_key:
                    continue
                bins = bin_series(df['perp'], bounds, by_rank=False)[0]
            bin_cats = bins.dtype.categories
            df['perp_bin'] = bins
            # the partition is already fixed by the perp_df filter above, so the
            # merge keys on 'utt' alone
            df = df[['utt', 'perp_bin']].merge(
                uttwer_df.loc[
                    (uttwer_df['reslm'] == reslm) &
                    (uttwer_df['latlm'] == latlm) &
                    (uttwer_df['mdl'] == mdl) &
                    (~uttwer_df['snr'].isnull())
                , ['utt', 'snr', 'wer']], on='utt'
            )
            df = agg_mean_by_lens(df, text_df['len'], 'wer', ['snr', 'perp_bin'])
            df['lwer'] = np.log(df['wer'])
            df = df.drop('wer', axis=1)
            for in_bin in range(num_bins - 1):
                df_in = df.loc[df['perp_bin'] == bin_cats[in_bin], ['snr', 'lwer']]
                for out_bin in range(in_bin + 1, num_bins):
                    df_out = df.loc[df['perp_bin'] == bin_cats[out_bin], ['snr', 'lwer']]
                    # pair the two bins' log-WERs SNR by SNR
                    df_in_out = df_in.merge(df_out, on='snr', suffixes=('_in', '_out'))
                    if train:
                        fit = pg.linear_regression(df_in_out['lwer_out'], df_in_out['lwer_in'], add_intercept=False)
                        k, r2 = fit['coef'].iloc[0], fit['r2'].iloc[0]
                        train_fits[(*train_key, in_bin, out_bin)] = (k, r2)
                    else:
                        # only the slope is needed here; the training r2 is
                        # joined back in after the loops
                        k = train_fits[(*train_key, in_bin, out_bin)][0]
                        y_true = df_in_out['lwer_in'].to_numpy()
                        y_pred = k * df_in_out['lwer_out'].to_numpy()
                        # FIXME(sdrobert): adjust?
                        r2_test = r2_score(y_true, y_pred)
                        test_fits[(*train_key, in_bin, out_bin, *key)] = r2_test

# turn the tuple-keyed dicts into frames with named MultiIndex levels
train_idx_names = ("train_mdl", "train_part", "train_perplm", "train_latlm", "train_reslm", "in_bin", "out_bin")
test_idx_names = train_idx_names + tuple(x.replace("train", "test") for x in train_idx_names[:-2])
train_fits = pd.DataFrame.from_dict(train_fits, orient='index', columns=['k', 'train_r2'])
train_fits.index = pd.MultiIndex.from_tuples(train_fits.index, names=train_idx_names)
test_fits = pd.DataFrame.from_dict(test_fits, orient='index', columns=['test_r2'])
test_fits.index = pd.MultiIndex.from_tuples(test_fits.index, names=test_idx_names)
# test r2 alongside the k / train r2 of the configuration it was trained on
fits = test_fits.join(train_fits, on=train_idx_names)
# NOTE(review): the cell displays test_fits; the joined `fits` frame is built but
# not shown -- confirm which one is meant to be the output.
test_fits

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,test_r2
train_mdl,train_part,train_perplm,train_latlm,train_reslm,in_bin,out_bin,test_mdl,test_part,test_perplm,test_latlm,test_reslm,Unnamed: 12_level_1
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0,1,tdnn_1d_sp,dev-clean,fglarge,tgsmall,tgsmall,0.998796
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0,2,tdnn_1d_sp,dev-clean,fglarge,tgsmall,tgsmall,0.998211
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,1,2,tdnn_1d_sp,dev-clean,fglarge,tgsmall,tgsmall,0.995952
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0,1,tdnn_1d_sp,dev-other,tgsmall,tgsmall,tgsmall,0.999531
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0,2,tdnn_1d_sp,dev-other,tgsmall,tgsmall,tgsmall,0.995282
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,1,2,tdnn_1d_sp,dev-other,tgsmall,tgsmall,tgsmall,0.99652
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0,1,tdnn_1d_sp,dev-other,fglarge,tgsmall,tgsmall,0.998692
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0,2,tdnn_1d_sp,dev-other,fglarge,tgsmall,tgsmall,0.99788
tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,1,2,tdnn_1d_sp,dev-other,fglarge,tgsmall,tgsmall,0.997981
tdnn_1d_sp,dev-clean,fglarge,tgsmall,tgsmall,0,1,tdnn_1d_sp,dev-clean,tgsmall,tgsmall,tgsmall,0.998554
