In [None]:
!pip install matplotlib seaborn pandas

In [None]:
import pandas as pd

# Subset names; they match the `_earswham_<subset>.csv` suffixes of the result
# files read further down.
# NOTE(review): this list is not referenced anywhere in the visible cells —
# confirm it is still needed.
groups = ['clean', 'gibberish', 'noisy']

In [None]:
from collections import OrderedDict

def merge_dfs(metric, metric_column=None):
    """Join the clean / noisy / gibberish scores of one metric into a wide CSV.

    Reads ``results/{metric}_earswham_{clean,noisy,gibberish}.csv``, keeps the
    ``filename`` column and the score column of each file, renames the score
    column to ``{metric_column}_{subset}`` and inner-joins the three tables on
    ``filename``. The result is written to ``results/{stem}_earswham.csv``,
    where ``stem`` is ``metric_column`` without a trailing ``_log_ppl``.

    Args:
        metric: Prefix of the per-subset result files.
        metric_column: Name of the score column inside those files. Defaults
            to ``metric``.

    Raises:
        AssertionError: If the merged table does not have exactly as many rows
            as the clean table. Nothing is written in that case.
    """
    metric_column = metric if metric_column is None else metric_column

    frames = {}
    for subset in ('clean', 'noisy', 'gibberish'):
        subset_column = f'{metric_column}_{subset}'
        frames[subset] = pd.read_csv(f'results/{metric}_earswham_{subset}.csv').rename(
            columns={metric_column: subset_column}
        )[['filename', subset_column]]

    # Keep only the part of each noisy filename before the first underscore and
    # re-append '.wav' so the noisy rows share join keys with the other subsets.
    # NOTE(review): presumably noisy names look like `<id>_<noise>.wav` — confirm.
    df_noisy = frames['noisy']
    df_noisy['filename'] = df_noisy['filename'].str.split('_').str[0] + '.wav'

    df_metric = pd.merge(
        pd.merge(frames['clean'], df_noisy, on='filename'),
        frames['gibberish'],
        on='filename',
    )

    # Validate before persisting so a lossy or duplicating join never reaches disk.
    assert frames['clean'].shape[0] == df_metric.shape[0]

    # Remove the exact '_log_ppl' suffix. str.rstrip('_log_ppl') would strip any
    # trailing run of the characters '_', 'l', 'o', 'g', 'p' instead
    # (e.g. 'vocal_log_ppl' -> 'voca').
    suffix = '_log_ppl'
    stem = metric_column[:-len(suffix)] if metric_column.endswith(suffix) else metric_column
    df_metric.to_csv(f'results/{stem}_earswham.csv', index=False)

def concat_dfs(metric, metric_column=None):
    """Stack the gibberish / clean / noisy scores of one metric into a long CSV.

    Reads ``results/{metric}_earswham_{gibberish,clean,noisy}.csv``, keeps the
    ``filename`` and ``metric_column`` columns of each file, tags every row
    with a ``subset`` label (``'Gibberish'``, ``'Clean'`` or ``'Noisy'``) and
    concatenates the three tables in that order. The result is written to
    ``results/{stem}_earswham.csv``, where ``stem`` is ``metric_column``
    without a trailing ``_log_ppl``.

    Args:
        metric: Prefix of the per-subset result files.
        metric_column: Name of the score column inside those files. Defaults
            to ``metric``.
    """
    metric_column = metric if metric_column is None else metric_column

    parts = []
    for subset, label in (('gibberish', 'Gibberish'), ('clean', 'Clean'), ('noisy', 'Noisy')):
        part = pd.read_csv(f'results/{metric}_earswham_{subset}.csv')[['filename', metric_column]]
        if subset == 'noisy':
            # Keep only the part of the name before the first underscore and
            # re-append '.wav'.
            # NOTE(review): presumably this maps noisy names back to the
            # clean file name — confirm.
            part['filename'] = part['filename'].str.split('_').str[0] + '.wav'
        part['subset'] = label
        parts.append(part)

    df_metric = pd.concat(parts, axis=0)

    # Remove the exact '_log_ppl' suffix. str.rstrip('_log_ppl') would strip any
    # trailing run of the characters '_', 'l', 'o', 'g', 'p' instead
    # (e.g. 'vocal_log_ppl' -> 'voca').
    suffix = '_log_ppl'
    stem = metric_column[:-len(suffix)] if metric_column.endswith(suffix) else metric_column
    df_metric.to_csv(f'results/{stem}_earswham.csv', index=False)

# Score column (key) -> prefix of the per-subset result files that contain it
# (value), in processing order. `distillmos` and `utmosv2` are both read from
# the `dnnmos` files.
metrics_dict = OrderedDict({
    'twist_log_ppl': 'twist',
    'speechlmscore': 'speechlmscore',
    'parakeet_gpt2_log_ppl': 'parakeet_gpt2',
    'quartznet_gpt2_log_ppl': 'quartznet_gpt2',
    'distillmos': 'dnnmos',
    'utmosv2': 'dnnmos',
})

# Write one long-format CSV per score column.
for metric_column, metric in metrics_dict.items():
    concat_dfs(metric=metric, metric_column=metric_column)


In [None]:
import math
import matplotlib.pyplot as plt
import seaborn as sns

# Caption shown under each panel (set as the x-axis label), keyed by score column.
metrics_names = {
    'speechlmscore': 'SpeechLMScore (6) ↓',
    'twist_log_ppl': 'TWIST log-perplexity ↓',
    'parakeet_gpt2_log_ppl': 'Parakeet + GPT2 log-perplexity ↓',
    'quartznet_gpt2_log_ppl': 'QuartzNet + GPT2 log-perplexity ↓',
    'utmosv2': 'UTMOSv2 ↑',
    'distillmos': 'DistillMOS ↑',
}

# Fixed y-axis range per score column; a column missing from this table keeps
# matplotlib's automatic range.
y_limits = {
    'parakeet_gpt2_log_ppl': (0, 10),
    'quartznet_gpt2_log_ppl': (0, 10),
    'distillmos': (0, 5),
    'utmosv2': (0, 5),
    'twist_log_ppl': (0, 3.5),
    'speechlmscore': (0, 3.5),
}

ncols = 2
nrows = math.ceil(len(metrics_dict) / ncols)
# squeeze=False keeps `axes` a 2-D array for every grid size, so ravel() is
# always valid (a 1x1 grid would otherwise return a bare Axes object).
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(8, 10), squeeze=False)
flat_axes = axes.ravel()

log_ppl_suffix = '_log_ppl'
for i, metric_column in enumerate(metrics_dict.keys()):
    ax = flat_axes[i]

    # Remove the exact '_log_ppl' suffix to rebuild the CSV name. str.rstrip
    # would strip any trailing run of the characters '_', 'l', 'o', 'g', 'p'.
    if metric_column.endswith(log_ppl_suffix):
        metric = metric_column[:-len(log_ppl_suffix)]
    else:
        metric = metric_column
    df = pd.read_csv(f'results/{metric}_earswham.csv')

    sns.boxplot(data=df, y=metric_column, hue="subset", ax=ax)
    ax.set(xlabel=metrics_names[metric_column])
    if metric_column in y_limits:
        ax.set_ylim(*y_limits[metric_column])

    # Only the last panel of the first row (index ncols - 1) keeps its legend,
    # and that legend is shown without a title.
    if i != ncols - 1:
        ax.get_legend().set_visible(False)
    else:
        ax.get_legend().set_title(None)
    ax.set(ylabel=None)

# Hide grid cells left empty when the number of metrics is not a multiple of ncols.
for ax in flat_axes[len(metrics_dict):]:
    ax.set_visible(False)

fig.tight_layout()
fig.savefig('boxplot.png', bbox_inches='tight', pad_inches=0.1)