## Preamble

In [1]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
import pingouin as pg
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go


from plotly.subplots import make_subplots
from scipy.special import logit, expit
from scipy.optimize import curve_fit
from analysis_utils import *

# Select the "vscode" renderer as the default for every Plotly figure shown below.
pio.renderers.default = "vscode"

## Perplexity

In [2]:
# Language models whose perplexities are loaded, and the data partitions
# they are loaded for.
lms = ['tgsmall', 'tgmed', 'fglarge', 'rnnlm_lstm_1a']
parts = ['dev-clean', 'test-clean', 'dev-other', 'test-other']

# One perplexity table per (lm, part) pair. Partitions are stacked under a
# 'part' index level, then the LMs are stacked under an 'lm' index level.
perp_dfs = []
for lm in lms:
    perp_dfs_by_lm = []
    for part in parts:
        perp_dfs_by_lm.append(
            read_kaldi_table_as_df(f'ark:../exp/{lm}_perp_{part.replace("-", "_")}/perp', 'b', val='perp')
        )
    perp_dfs.append(pd.concat(perp_dfs_by_lm, keys=parts, names=['part']))

# Re-key by utterance and add the entropy (natural log of the perplexity).
perp_df = (
    pd.concat(perp_dfs, keys=lms, names=['lm'])
    .reset_index()
    .set_index('utt')
    .assign(ent=lambda table: np.log(table['perp']))
)

# 'len' is `len` applied to each entry of the partition's `text` table.
len_dfs = []
for part in parts:
    len_dfs.append(
        read_kaldi_table_as_df(f'ark:../data/{part.replace("-", "_")}/text', "tv", val='len', apply=len)
    )
len_df = pd.concat(len_dfs)

# Join the lengths on the utterance index, then free the intermediates.
lm_df = perp_df.join(len_df).reset_index()
del perp_df, perp_dfs, len_dfs, len_df
lm_df.head(5)

Unnamed: 0,utt,lm,part,perp,ent,len
0,lbi-1089-134686-0000,tgsmall,test-clean,1017.947998,6.925544,28
1,lbi-1089-134686-0000,tgmed,test-clean,802.177979,6.687331,28
2,lbi-1089-134686-0000,fglarge,test-clean,971.969971,6.879325,28
3,lbi-1089-134686-0000,rnnlm_lstm_1a,test-clean,477.664001,6.168908,28
4,lbi-1089-134686-0001,tgsmall,test-clean,1465.718994,7.290101,8


In [3]:
# Length-weighted average entropy per (partition, LM).
df = lm_df.copy(deep=True)

# lm_df holds one row per (utterance, LM), so the total length of a partition
# is summed over the rows of a single LM to count each utterance once.
first_lm_rows = df.loc[df['lm'] == lms[0], ['part', 'len']]
len_df = first_lm_rows.groupby('part').sum()

# Undo the per-utterance length normalisation, sum within each
# (partition, LM) cell, then divide by the partition's total length.
df['unnorm_ent'] = df['ent'] * df['len']
df = df.pivot_table(values='unnorm_ent', index='part', columns='lm', aggfunc='sum')
df = df.div(len_df['len'], axis='index')
df

lm,fglarge,rnnlm_lstm_1a,tgmed,tgsmall
part,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dev-clean,5.021511,4.694738,5.518527,5.728285
dev-other,4.947523,4.643167,5.428913,5.633667
test-clean,5.064469,4.737677,5.551883,5.755167
test-other,4.98001,4.684319,5.463378,5.66365


In [4]:
# Distribution of per-utterance entropy for each partition, split by LM.
violin_labels = dict(ent='Entropy (nats)', part='Partition', lm="LM")
fig = px.violin(
    lm_df,
    x='ent',
    y='part',
    color='lm',
    labels=violin_labels,
    box=True,
    width=600,
    height=800,
)
fig.show()

# Disabled alternative: one seaborn violin panel per LM.
# g = sns.FacetGrid(lm_df, col='lm', col_wrap=2)
# g.map_dataframe(sns.violinplot, x='ent', y='part')

In [5]:
# Per-LM normality test of the utterance entropies. Every LM comes out
# highly non-normal, so the correlations below use Spearman's rank correlation.
pg.normality(data=lm_df, dv='ent', group='lm', method='normaltest')

Unnamed: 0_level_0,W,pval,normal
lm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tgsmall,526.438305,4.845901e-115,False
tgmed,678.526111,4.5701149999999996e-148,False
fglarge,687.482364,5.189211e-150,False
rnnlm_lstm_1a,2680.772214,0.0,False


In [6]:
# Wide layout: one row per utterance, one entropy column per LM, so that
# the LMs can be plotted against each other pairwise.
df = lm_df.pivot(index='utt', columns='lm', values='ent')
fig = px.scatter_matrix(
    df,
    dimensions=lms,
    opacity=0.1,
)
fig.show()

In [7]:
# One-sided ('greater') Spearman correlation between every pair of LM entropy columns.
pg.pairwise_corr(data=df, columns=lms, method='spearman', alternative='greater')

Unnamed: 0,X,Y,method,alternative,n,r,CI95%,p-unc,power
0,tgsmall,tgmed,spearman,greater,11126,0.964522,"[0.96, 1.0]",0.0,1.0
1,tgsmall,fglarge,spearman,greater,11126,0.885817,"[0.88, 1.0]",0.0,1.0
2,tgsmall,rnnlm_lstm_1a,spearman,greater,11126,0.82759,"[0.82, 1.0]",0.0,1.0
3,tgmed,fglarge,spearman,greater,11126,0.918527,"[0.92, 1.0]",0.0,1.0
4,tgmed,rnnlm_lstm_1a,spearman,greater,11126,0.84892,"[0.84, 1.0]",0.0,1.0
5,fglarge,rnnlm_lstm_1a,spearman,greater,11126,0.87073,"[0.87, 1.0]",0.0,1.0


## Boothroyd's k

In [131]:
# Configuration whose WER rows are analysed in the cells below.
mdl = 'tdnn_1d_sp'
latlm = 'tgsmall'
reslm = 'tgsmall'
wer_df = read_best_wers_as_df()
# Take an explicit copy of the selected rows: the 'acc' column is added right
# after, and assigning onto a slice of the original frame can trigger pandas'
# SettingWithCopyWarning.
wer_df = wer_df.loc[(wer_df['mdl'] == mdl) & (wer_df['latlm'] == latlm) & (wer_df['reslm'] == reslm)].copy()
# Accuracy is the complement of the word error rate (both are fractions here).
wer_df['acc'] = 1 - wer_df['wer']
wer_df.head(10)

Unnamed: 0,mdl,latlm,reslm,part,snr,perp_idx,perp_tot,path,wer,ins,del,sub,lmwt,wip,acc
0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,28.0,2,4,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0502,78,61,630,10,0.0,0.9498
1,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,27.0,2,3,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0532,111,86,900,10,0.0,0.9468
2,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,-3.0,4,5,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.855,108,4794,5277,7,0.0,0.145
3,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,17.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0924,357,627,4044,12,0.0,0.9076
4,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,3.0,2,5,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.543,205,1869,4306,10,0.0,0.457
5,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,18.0,2,5,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0769,74,99,731,11,0.0,0.9231
6,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,0.0,4,5,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.7346,196,3338,5211,10,0.0,0.2654
7,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,17.0,2,2,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.109,196,344,2279,11,0.5,0.891
8,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,17.0,3,5,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0862,64,131,885,11,0.5,0.9138
9,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,26.0,1,2,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.045,99,158,1026,12,0.0,0.955


In [132]:
# Number of perplexity bins to analyse and resolution of the fitted curves.
perp_tot = 5
num_points = 100

# Rows of the chosen binning, tagged with a constant 'bin' label.
df = wer_df.assign(bin='avg.')
df = df[df['perp_tot'].eq(perp_tot)]

# Rows without an SNR value are set aside in `Ainvs`; the fitting loop uses
# their accuracy to initialise the A parameter. `df` keeps the rows with an SNR.
idx = df['snr'].isnull()
Ainvs = df.loc[idx]
df = df.loc[~idx]

# Evaluation grid for the curves: the observed SNR range padded by 1 on each side.
snr_min, snr_max = df['snr'].min() - 1, df['snr'].max() + 1
x_interp = np.linspace(snr_min, snr_max, num=num_points)
ratio = num_points // (perp_tot + 2)

def zhang_func(x : np.ndarray, A : float, B : float, C : float) -> np.ndarray:
    """Evaluate the sigmoid-shaped curve ``1 / (exp(-(x + B) / C) + A)``.

    Functional form after Zhang et al (2023) "Estimate the noise effect on
    automatic speech recognition accuracy for mandarin by an approach
    associating articulation index".

    Parameters
    ----------
    x : values at which to evaluate the curve
    A : additive term in the denominator; for C > 0 the curve tends to
        ``1 / A`` as ``x`` grows
    B : horizontal shift added to ``x``
    C : scale dividing the shifted ``x``

    Returns
    -------
    The curve evaluated element-wise at ``x``.
    """
    # FIXME(sdrobert): the fit is very bad if we use eq. 12
    exponent = -(x + B) / C
    return 1 / (np.exp(exponent) + A)


# Fit the accuracy-vs-SNR curve separately for every perplexity bin and draw
# the observations (markers) together with the fitted curve (line).
fig = go.Figure()
for perp_idx in range(1, perp_tot + 1):
    colour = px.colors.qualitative.Plotly[perp_idx - 1]
    df_ = df.loc[df['perp_idx'] == perp_idx]
    # Accuracy of this bin's row without an SNR value. For C > 0 the curve
    # tends to 1 / A as the SNR grows, so its reciprocal seeds A.
    Ainv = Ainvs.loc[Ainvs['perp_idx'] == perp_idx, 'acc'].iloc[0]
    A_init = 1 / Ainv
    N = len(df_)
    print(f'bin={perp_idx:02d} (N={N})================')
    # Plain float ndarrays for scipy/numpy rather than pandas extension arrays.
    x = df_['snr'].to_numpy(dtype=np.float64)
    y = df_['acc'].to_numpy(dtype=np.float64)
    (A, B, C), _ = curve_fit(
        zhang_func, x, y,
        p0=(A_init, 0, 1),
        # A >= 1 keeps the asymptotic accuracy 1 / A at or below 1;
        # C >= 0.01 keeps the scale strictly positive.
        bounds=([1, -np.inf, 0.01], [np.inf, np.inf, np.inf]),
    )
    # Coefficient of determination of the fit on the observed points.
    num = np.square(y - zhang_func(x, A, B, C)).sum()
    denom = np.square(y - y.mean()).sum()
    R_square = 1 - num / denom
    print(f"A={A:.02f},B={B:.02f},C={C:.02f},R^2={R_square:.04f}")
    print("=============================")
    # Reuse the fitted model instead of repeating its formula.
    y_interp = zhang_func(x_interp, A, B, C)
    fig.add_scatter(
        x=x, y=y * 100,
        name=str(perp_idx), mode='markers',
        marker=dict(color=colour),
    )
    fig.add_scatter(
        x=x_interp, y=y_interp * 100,
        mode='lines',
        opacity=0.5,
        showlegend=False,
        line=dict(color=colour),
    )
    # fig.add_annotation(
    #     x=x_interp[ratio * perp_idx], y=y_interp[ratio * perp_idx] * 100,
    #     text=f"A={A:.02f},B={B:.02f},C={C:.02f}",
    #     showarrow=True,
    #     font=dict(color=colour),
    # )

fig.update_layout(
    xaxis_title="SNR (dB)",
    yaxis_title="acc. (%)",
    legend_title="bin",
    xaxis_tickformat='d',
    yaxis_tickformat='d',
    xaxis_range=[snr_min, snr_max],
    yaxis_range=[0, 100],
)
fig.show()


A=1.04,B=-2.64,C=3.83,R^2=0.9990
A=1.06,B=-3.60,C=3.97,R^2=0.9992
A=1.06,B=-3.47,C=4.00,R^2=0.9991
A=1.07,B=-4.11,C=4.04,R^2=0.9994
A=1.10,B=-4.70,C=4.01,R^2=0.9990


In [168]:
perp_tots = [2, 3, 4, 5]
num_points = 100
x_interp = np.linspace(0.01, 100, num_points)
ratio = num_points // (len(perp_tots) + 2)

# For every binning in `perp_tots`, keep only the first bin (perp_idx == 1,
# labelled 'in') and the last bin (perp_idx == perp_tot, labelled 'out').
# Done with vectorised masks rather than by repeatedly re-slicing and writing
# into `df` inside a loop; rows of binnings not listed in `perp_tots` are left
# untouched with the default 'in' label.
df = wer_df.loc[(wer_df['perp_tot'] > 1)].copy()
selected = df['perp_tot'].isin(perp_tots)
in_idx = df['perp_idx'] == 1
out_idx = df['perp_idx'] == df['perp_tot']
df['context'] = np.where(selected & out_idx, 'out', 'in')
df = df.loc[~selected | in_idx | out_idx]

# One row per (perp_tot, snr) with the 'in' and 'out' accuracies side by side.
df = pd.pivot_table(df, values='acc', index=['perp_tot', 'snr'], columns='context').reset_index()

fig_acc, fig_loge = go.Figure(), go.Figure()

# lines of equality
fig_acc.add_scatter(
    x=[0, 100], y=[0, 100],
    mode='lines', line_color='grey', line_dash='dash',
    showlegend=False,
)
# fig_loge uses log axes, which cannot display 0, so the diagonal starts at a
# small positive value instead.
fig_loge.add_scatter(
    x=[0.01, 100], y=[0.01, 100],
    mode='lines', line_color='grey', line_dash='dash',
    showlegend=False,
)

for i, perp_tot in enumerate(perp_tots):
    df_ = df.loc[df['perp_tot'] == perp_tot]
    # Regress log(in-context error) on log(out-of-context error) without an
    # intercept; the slope k satisfies (1 - in) = (1 - out) ** k.
    x = np.log(1 - df_['out'])
    y = np.log(1 - df_['in'])
    N = len(df_)
    print(f'perp_tot={perp_tot:02d} (N={N})================')
    reg = pg.linear_regression(x, y, add_intercept=False)
    print(reg)
    k = reg['coef'].iloc[0]
    print("==================================")
    colour = px.colors.qualitative.Plotly[i]
    # Fitted in-context accuracy (%) as a function of out-of-context accuracy (%).
    y_interp = 100 * (1 - (1 - x_interp / 100) ** k)
    fig_acc.add_scatter(
        x=df_['out'] * 100, y=df_['in'] * 100,
        name=str(perp_tot), mode='markers',
        marker=dict(color=colour),
    )
    fig_acc.add_scatter(
        x=x_interp, y=y_interp,
        showlegend=False, mode='lines', opacity=0.5,
        line=dict(color=colour),
    )
    fig_acc.add_annotation(
        x=x_interp[ratio * (i + 1)], y=y_interp[ratio * (i + 1)],
        text=f"k={k:.02f}",
        showarrow=True,
        opacity=1,
        font=dict(color=colour),
    )
    fig_loge.add_scatter(
        x=100 * (1 - df_['out']), y=100 * (1 - df_['in']),
        name=str(perp_tot), mode='markers',
        marker=dict(color=colour),
    )
    fig_loge.add_scatter(
        x=100 - x_interp, y=100 - y_interp,
        showlegend=False, mode='lines', opacity=0.5,
        line=dict(color=colour),
    )
    # Annotation coordinates on log-typed axes are given as log10 of the value.
    fig_loge.add_annotation(
        x=np.log10(100 - x_interp[ratio * (i + 1)]), y=np.log10(100 - y_interp[ratio * (i + 1)]),
        text=f"k={k:.02f}",
        showarrow=True,
        opacity=1,
        font=dict(color=colour),
    )

fig_acc.update_layout(
    xaxis_title="out-of-context accuracy (%)",
    yaxis_title="in-context accuracy (%)",
    legend_title="bins",
    xaxis_tickformat='d',
    yaxis_tickformat='d',
    xaxis_range=[0, 100],
    yaxis_range=[0, 100],
    width=800, height=400,
)
fig_acc.show()
fig_loge.update_layout(
    xaxis_title="out-of-context error rate (%)",
    yaxis_title="in-context error rate (%)",
    legend_title="bins",
    width=800, height=400,
)
# Axis limits (log10 units) from the best out-of-context accuracy over ALL
# plotted binnings, not just the loop's last `df_`, with one percentage point
# of margin below the smallest out-of-context error rate.
max_out_acc = df.loc[df['perp_tot'].isin(perp_tots), 'out'].max()
lims = np.log10(99 - 100 * max_out_acc), np.log10(100)
fig_loge.update_xaxes(type='log', range=lims)
fig_loge.update_yaxes(type='log', range=lims)
fig_loge.show()


  names      coef        se           T          pval        r2    adj_r2  \
0   out  1.157563  0.001357  852.817981  8.029261e-87  0.999945  0.999944   

   CI[2.5%]  CI[97.5%]  
0   1.15482   1.160307  
  names      coef        se           T          pval        r2    adj_r2  \
0   out  1.248037  0.001894  658.940805  2.425262e-82  0.999908  0.999906   

   CI[2.5%]  CI[97.5%]  
0  1.244209   1.251865  
  names     coef        se           T          pval       r2    adj_r2  \
0   out  1.32986  0.002661  499.796132  1.535757e-77  0.99984  0.999836   

   CI[2.5%]  CI[97.5%]  
0  1.324483   1.335238  
  names      coef        se           T          pval        r2    adj_r2  \
0   out  1.407337  0.003126  450.154021  1.007574e-75  0.999803  0.999798   

   CI[2.5%]  CI[97.5%]  
0  1.401018   1.413655  


In [166]:
perp_tot = 5
num_points = 100
x_interp = np.linspace(0.01, 100, num_points)
ratio = num_points // (perp_tot + 2)

df = wer_df.loc[wer_df['perp_tot'] == perp_tot].copy()

# One row per SNR with one accuracy column per bin index (1 .. perp_tot).
df = pd.pivot_table(df, values='acc', index=['snr'], columns='perp_idx').reset_index()

fig_acc, fig_loge = go.Figure(), go.Figure()

# lines of equality
fig_acc.add_scatter(
    x=[0, 100], y=[0, 100],
    mode='lines', line_color='grey', line_dash='dash',
    showlegend=False,
)
# fig_loge uses log axes, which cannot display 0, so the diagonal starts at a
# small positive value instead.
fig_loge.add_scatter(
    x=[0.01, 100], y=[0.01, 100],
    mode='lines', line_color='grey', line_dash='dash',
    showlegend=False,
)

# The last bin (perp_idx == perp_tot) is the reference every other bin is
# regressed against.
x = np.log(1 - df[perp_tot])

for i, perp_idx in enumerate(range(perp_tot - 1, 0, -1)):
    # Intercept-free regression of log error rates; the slope k satisfies
    # (1 - acc[perp_idx]) = (1 - acc[perp_tot]) ** k.
    y = np.log(1 - df[perp_idx])
    print(f'perp_tot={perp_tot:02d}================')
    reg = pg.linear_regression(x, y, add_intercept=False)
    print(reg)
    k = reg['coef'].iloc[0]
    print("==================================")
    colour = px.colors.qualitative.Plotly[perp_idx]
    # Fitted accuracy (%) of this bin as a function of the reference accuracy (%).
    y_interp = 100 * (1 - (1 - x_interp / 100) ** k)
    fig_acc.add_scatter(
        x=df[perp_tot] * 100, y=df[perp_idx] * 100,
        name=str(perp_idx), mode='markers',
        marker=dict(color=colour),
    )
    fig_acc.add_scatter(
        x=x_interp, y=y_interp,
        showlegend=False, mode='lines', opacity=0.5,
        line=dict(color=colour),
    )
    fig_acc.add_annotation(
        x=x_interp[ratio * (i + 1)], y=y_interp[ratio * (i + 1)],
        text=f"k={k:.02f}",
        showarrow=True,
        opacity=1,
        font=dict(color=colour),
    )
    fig_loge.add_scatter(
        x=100 * (1 - df[perp_tot]), y=100 * (1 - df[perp_idx]),
        name=str(perp_idx), mode='markers',
        marker=dict(color=colour),
    )
    fig_loge.add_scatter(
        x=100 - x_interp, y=100 - y_interp,
        showlegend=False, mode='lines', opacity=0.5,
        line=dict(color=colour),
    )
    # Both axes of fig_loge are log-typed, and Plotly expects annotation
    # coordinates on such axes as log10 of the data value; raw values would
    # place the labels far away from their curves.
    fig_loge.add_annotation(
        x=np.log10(100 - x_interp[ratio * (i + 1)]), y=np.log10(100 - y_interp[ratio * (i + 1)]),
        text=f"k={k:.02f}",
        showarrow=True,
        opacity=1,
        font=dict(color=colour),
    )

fig_acc.update_layout(
    xaxis_title="out-of-context accuracy (%)",
    yaxis_title="in-context accuracy (%)",
    legend_title="idx",
    xaxis_tickformat='d',
    yaxis_tickformat='d',
    xaxis_range=[0, 100],
    yaxis_range=[0, 100],
    width=800, height=400,
)
fig_acc.show()
fig_loge.update_layout(
    xaxis_title="out-of-context error rate (%)",
    yaxis_title="in-context error rate (%)",
    legend_title="idx",
    width=800, height=400,
)
# Axis limits in log10 units: one percentage point below the smallest
# reference error rate, up to 100 %.
lims = np.log10(99 - 100 * df[perp_tot].max()), np.log10(100)
fig_loge.update_xaxes(type='log', range=lims)
fig_loge.update_yaxes(type='log', range=lims)
fig_loge.show()


   names      coef        se          T          pval        r2    adj_r2  \
0      5  1.132644  0.001399  809.39583  6.492914e-86  0.999939  0.999937   

   CI[2.5%]  CI[97.5%]  
0  1.129815   1.135472  
   names      coef        se           T          pval        r2    adj_r2  \
0      5  1.235297  0.002525  489.155314  3.631679e-77  0.999833  0.999829   

   CI[2.5%]  CI[97.5%]  
0  1.230193   1.240401  
   names      coef        se          T          pval        r2    adj_r2  \
0      5  1.233052  0.001587  776.86654  3.349479e-85  0.999934  0.999932   

   CI[2.5%]  CI[97.5%]  
0  1.229844    1.23626  
   names      coef        se           T          pval        r2    adj_r2  \
0      5  1.407337  0.003126  450.154021  1.007574e-75  0.999803  0.999798   

   CI[2.5%]  CI[97.5%]  
0  1.401018   1.413655  


In [139]:
# Per-utterance WERs for the single-bin (perp_tot == 1) decodes, joined with
# each utterance's rnnlm_lstm_1a perplexity, entropy and length.
uttwer_df = read_best_uttwers_as_df()

# Decode-level columns that clash with per-utterance ones get a '_tot' suffix.
single_bin_rows = wer_df.loc[wer_df['perp_tot'] == 1]
uttwer_df = single_bin_rows.merge(uttwer_df, on='path', suffixes=('_tot', ''))

rnnlm_rows = lm_df.loc[lm_df['lm'] == 'rnnlm_lstm_1a']
uttwer_df = uttwer_df.merge(rnnlm_rows, on=['part', 'utt'])
uttwer_df.head(10)

Unnamed: 0,mdl,latlm,reslm,part,snr,perp_idx,perp_tot,path,wer_tot,ins,...,sub,lmwt,wip,acc,utt,wer,lm,perp,ent,len
0,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,17.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0924,357,...,4044,12,0.0,0.9076,lbi-1272-128104-0000,0.0,rnnlm_lstm_1a,87.063599,4.466639,17
1,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,-8.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.9715,23,...,7067,7,0.0,0.0285,lbi-1272-128104-0000,100.0,rnnlm_lstm_1a,87.063599,4.466639,17
2,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,20.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0727,341,...,3213,11,0.0,0.9273,lbi-1272-128104-0000,5.88,rnnlm_lstm_1a,87.063599,4.466639,17
3,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,3.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.5443,940,...,18880,11,0.0,0.4557,lbi-1272-128104-0000,58.82,rnnlm_lstm_1a,87.063599,4.466639,17
4,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,-4.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.8788,325,...,20275,7,0.0,0.1212,lbi-1272-128104-0000,100.0,rnnlm_lstm_1a,87.063599,4.466639,17
5,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,16.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.1029,423,...,4527,11,0.0,0.8971,lbi-1272-128104-0000,17.65,rnnlm_lstm_1a,87.063599,4.466639,17
6,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,-9.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.9833,13,...,4671,7,0.0,0.0167,lbi-1272-128104-0000,100.0,rnnlm_lstm_1a,87.063599,4.466639,17
7,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,21.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.0687,330,...,3034,11,0.0,0.9313,lbi-1272-128104-0000,5.88,rnnlm_lstm_1a,87.063599,4.466639,17
8,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,2.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.6015,1029,...,21528,10,0.0,0.3985,lbi-1272-128104-0000,76.47,rnnlm_lstm_1a,87.063599,4.466639,17
9,tdnn_1d_sp,tgsmall,tgsmall,dev-clean,-5.0,1,1,../exp/chain_cleaned/tdnn_1d_sp/decode_tgsmall...,0.9095,208,...,17099,7,0.0,0.0905,lbi-1272-128104-0000,100.0,rnnlm_lstm_1a,87.063599,4.466639,17


In [140]:
# Utterance-level error rate against entropy, restricted to rows without an SNR value.
no_snr = uttwer_df['snr'].isna()
df = uttwer_df[no_snr].copy()

# Evenly spaced grid over the observed entropy range.
ent_lo, ent_hi = df['ent'].min(), df['ent'].max()
x_interp = np.linspace(ent_lo, ent_hi, 100)

fig = px.scatter(df, x='ent', y='wer')
fig.update_layout(
    xaxis_title='entropy (nats)',
    yaxis_title='error rate (%)',
    yaxis_range=[0, 30],
)
fig.show()