## Results Analysis

In [None]:
import pandas as pd
import numpy as np

# LLM experiment results (merged file).
llm_exp_result_path = 'llm_exp/full_exp_merged.json'
df_llm = pd.read_json(llm_exp_result_path)

# TTS experiment results, one file per engine (Kokoro and Piper).
tts_exp_kokoro_result_path = 'tts_exp/tts_exp_l4_kokoro/full_results.json'
tts_exp_piper_result_path = 'tts_exp/tts_exp_l4_piper/full_results.json'
df_kokoro = pd.read_json(tts_exp_kokoro_result_path)
df_piper = pd.read_json(tts_exp_piper_result_path)

# Both TTS frames must expose the same columns in the same order.
# Index.equals returns a plain bool even when the column counts differ, whereas
# an element-wise `==` raises ValueError on a length mismatch before the assert
# gets a chance to report anything useful.
assert df_kokoro.columns.equals(df_piper.columns), (
    "Kokoro and Piper result files have different columns: "
    f"{list(df_kokoro.columns)} vs {list(df_piper.columns)}"
)

# Disabled sign flip of the TTS inference times, kept for reference.
# df_piper['inference_time'] = -df_piper['inference_time']
# df_kokoro['inference_time'] = -df_kokoro['inference_time']

In [None]:
# Pull the 'duration_s' entry out of every row's nested audio_metrics record.
kokoro_durations = [metrics['duration_s'] for metrics in df_kokoro['audio_metrics']]
piper_durations = [metrics['duration_s'] for metrics in df_piper['audio_metrics']]

# Expose the durations as a regular column on each TTS frame.
df_kokoro['audio_duration'] = kokoro_durations
df_piper['audio_duration'] = piper_durations

# TTS columns that get an engine suffix and are copied into the LLM frame later.
tts_metrics = ['wer', 'inference_time', 'audio_duration']

In [None]:
# Tag the shared TTS metric columns with their engine name so both engines can
# sit side by side in one frame; every other column keeps its name.
df_kokoro = df_kokoro.rename(columns={metric: f"{metric}_kokoro" for metric in tts_metrics})
df_piper = df_piper.rename(columns={metric: f"{metric}_piper" for metric in tts_metrics})

In [None]:
# Display the column names of the LLM results frame.
df_llm.columns

In [None]:
# Engine-suffixed metric column names, in the same order as tts_metrics.
tts_metrics_kokoro = [f"{metric}_kokoro" for metric in tts_metrics]
tts_metrics_piper = [f"{metric}_piper" for metric in tts_metrics]

# Copy the TTS metrics next to the LLM results.
# NOTE(review): this relies on the three result files listing the same runs
# under the same row index - confirm against the experiment scripts.
df_llm[tts_metrics_kokoro] = df_kokoro[tts_metrics_kokoro]
df_llm[tts_metrics_piper] = df_piper[tts_metrics_piper]

In [None]:
# End-to-end time per run and engine: LLM inference time plus that engine's TTS inference time.
for engine in ('piper', 'kokoro'):
    df_llm[f'inference_time_pipeline_{engine}'] = (
        df_llm['inference_seconds'] + df_llm[f'inference_time_{engine}']
    )

In [None]:
# Short alias used by the analysis below; this is the same object as df_llm, not a copy.
df = df_llm

In [None]:
import pandas as pd
from itertools import product

# Experiment grid: Cartesian product of the distinct few_shot_nr and max_tokens values
few_shot_nrs = np.unique(df['few_shot_nr'])
max_tokens = np.unique(df['max_tokens'])
configurations = list(product(few_shot_nrs, max_tokens))

# Columns summarised for every configuration
features = [
    # LLM generation
    'prompt_token_count', 'output_token_count', 'inference_seconds', 'total_tokens',
    # judge scores
    'relevance_score_llama_g5', 'coherence_score_llama_g5', 'compliance_score_llama_g5',
    'relevance_score_llama_g10', 'coherence_score_llama_g10', 'compliance_score_llama_g10',
    'relevance_score_mistral_g5', 'coherence_score_mistral_g5', 'compliance_score_mistral_g5',
    'relevance_score_mistral_g10', 'coherence_score_mistral_g10', 'compliance_score_mistral_g10',
    # Piper
    'wer_piper', 'inference_time_piper', 'audio_duration_piper',
    'inference_time_pipeline_piper',
    # Kokoro
    'wer_kokoro', 'inference_time_kokoro', 'audio_duration_kokoro',
    'inference_time_pipeline_kokoro',
]

# results : {(few_shot_nr, max_tokens): df with index=(mean, std) and columns=features}
results = {}
for few_shot_nr, token_budget in configurations:
    in_configuration = (df['few_shot_nr'] == few_shot_nr) & (df['max_tokens'] == token_budget)
    # collapse the rows of this configuration into their mean and std
    results[(few_shot_nr, token_budget)] = df.loc[in_configuration, features].agg(['mean', 'std'])

# Long format: one record per (configuration, feature) with its mean and std
plot_data = [
    {
        'few_shot_nr': few_shot_nr,
        'max_tokens': token_budget,
        'feature': feature,
        'mean': stats.loc['mean', feature],
        'std': stats.loc['std', feature],
    }
    for (few_shot_nr, token_budget), stats in results.items()
    for feature in features
]

df_plot = pd.DataFrame(plot_data)

In [None]:
# Display the distinct feature names available in df_plot.
np.unique(df_plot.feature)

## Plots

In [None]:
import matplotlib.pyplot as plt

def plot_metric_vs_few_shot(
    df_plot,
    metric,
    x='few_shot_nr',
    style='max_tokens',
    ax=None,
    y_range=None,
    err=None,
    title_addon="(means on 10 topics)"
):
    """Plot the mean of one metric against ``x``, one line per ``style`` value.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Long-format table with the columns ``feature``, ``mean`` and ``std``
        plus the columns named by ``x`` and ``style``.
    metric : str
        Value of the ``feature`` column to plot.
    x : str
        Column used for the horizontal axis.
    style : str
        Column whose distinct values each get their own line.
    ax : matplotlib axes, optional
        Axes to draw on; a new 10x6 figure is created when omitted.
    y_range : sequence, optional
        Passed to ``ax.set_ylim`` when given.
    err : {None, 'band', 'bar'}
        ``'band'`` shades mean +/- std, ``'bar'`` draws error bars of one std;
        any other value draws the lines only.
    title_addon : str
        Text appended to the generated title.

    Returns
    -------
    The axes that were drawn on.
    """
    # Keep only the rows of the requested metric
    df_metric = df_plot[df_plot['feature'] == metric]

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 6))

    for val in sorted(df_metric[style].unique()):
        df_val = df_metric[df_metric[style] == val]

        (line,) = ax.plot(
            df_val[x],
            df_val['mean'],
            marker='o',
            label=f"{style}={val}"
        )
        # Reuse the line's colour for its error display. errorbar draws from the
        # same colour cycle as plot, so without an explicit colour the bars come
        # out in the *next* cycle colour and the following line skips one.
        series_color = line.get_color()

        if err == 'band':
            ax.fill_between(
                df_val[x],
                df_val['mean'] - df_val['std'],
                df_val['mean'] + df_val['std'],
                alpha=0.2,
                facecolor=series_color
            )

        elif err == 'bar':
            ax.errorbar(
                df_val[x],
                df_val['mean'],
                yerr=df_val['std'],
                fmt='none',  # bars only; the line itself is drawn above
                capsize=5,
                alpha=0.8,
                color=series_color
            )

    if y_range is not None:
        ax.set_ylim(y_range)

    ax.set_title(
        f"{metric.replace('_', ' ').title()} vs "
        f"{x.replace('_', ' ').title()} {title_addon}"
    )
    ax.set_xlabel(x.replace('_', ' ').title())
    ax.set_ylabel(metric.replace('_', ' ').title())
    ax.legend(title=style)
    ax.grid(True)

    return ax


In [None]:
# Piper audio duration against max_tokens, one line per few_shot_nr.
ax = plot_metric_vs_few_shot(
    df_plot,
    metric='audio_duration_piper',
    x='max_tokens',
    style='few_shot_nr',
)

# Dashed reference line: seconds needed for target_words at wpm words per minute.
wpm = 160
target_words = 200
target_duration_s = target_words/wpm * 60
ax.plot([0, 1024], [target_duration_s] * 2, linestyle='--', color='black')

In [None]:
# Mean inference_seconds per few_shot_nr, one line per max_tokens, with a mean +/- std band.
plot_metric_vs_few_shot(df_plot, 'inference_seconds', err='band')

In [None]:
# Side-by-side pipeline inference time: Kokoro on the left, Piper on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
kokoro_ax, piper_ax = axes
plot_metric_vs_few_shot(df_plot, metric='inference_time_pipeline_kokoro', ax=kokoro_ax, err='band', title_addon="")
plot_metric_vs_few_shot(df_plot, metric='inference_time_pipeline_piper', ax=piper_ax, err='band', title_addon="")

In [None]:
# Mean wer_kokoro per few_shot_nr, one line per max_tokens (no error display).
plot_metric_vs_few_shot(df_plot, 'wer_kokoro')

## LLM judge scores

In [None]:
import seaborn as sns
# Feature-name suffix identifying one judge configuration; the alternatives are listed below.
judge_identifier = '_mistral_g5'
# judge_identifier = '_mistral_g10'
# judge_identifier = '_llama_g5'
# judge_identifier = '_llama_g10'

# Score range as [low, high]. NOTE(review): not read by any active code in the visible cells.
y_range = [1, 6]
# Colormap name read by row_heat_map.
cmap='viridis'

def row_heat_map(judge_identifier, y_range, row_idx):
    """Draw one row of judge-score heat maps (relevance, coherence, compliance).

    Reads the module-level ``df_plot`` (long-format means), ``axes`` (grid of
    axes indexed as ``axes[row, col]``) and ``cmap``.

    Parameters
    ----------
    judge_identifier : str
        Suffix of the score feature names, e.g. ``'_mistral_g5'``.
    y_range : sequence of two numbers
        Lower and upper limit of the colour scale (``vmin``, ``vmax``).
    row_idx : int
        Row of ``axes`` to draw into; columns 0-2 receive the three scores.

    Raises
    ------
    ValueError
        If ``df_plot`` holds no rows for one of the requested score features.
    """
    score_kinds = ('relevance', 'coherence', 'compliance')

    for col_idx, kind in enumerate(score_kinds):
        feature = f'{kind}_score{judge_identifier}'
        df_score = df_plot[df_plot['feature'] == feature]
        if df_score.empty:
            raise ValueError(f"no rows in df_plot for feature {feature!r}")

        # Rows = few_shot_nr, columns = max_tokens. The pivot derives the grid
        # shape and tick labels from the data instead of assuming a 4 x 5 grid.
        grid = df_score.pivot(index='few_shot_nr', columns='max_tokens', values='mean')

        panel = axes[row_idx, col_idx]
        sns.heatmap(
            grid.to_numpy(),
            ax=panel,
            vmin=y_range[0],
            vmax=y_range[1],
            cmap=cmap,
            yticklabels=grid.index.tolist(),
            xticklabels=grid.columns.tolist(),
        )

        panel.set_title(f'{kind} score')
        panel.set_ylabel('few shot nr')
        panel.set_xlabel('max token len')

In [None]:
# 4 judge configurations (rows) x 3 score types (columns).
fig, axes = plt.subplots(4, 3, figsize=(15,20))
# Earlier line-plot variant, kept for reference.
# plot_metric_vs_few_shot(df_plot, f'relevance_score{judge_identifier}', y_range=y_range, ax=axes[0,0], title_addon="")
# plot_metric_vs_few_shot(df_plot, f'coherence_score{judge_identifier}', y_range=y_range, ax=axes[0,1], title_addon="")
# plot_metric_vs_few_shot(df_plot, f'compliance_score{judge_identifier}', y_range=y_range, ax=axes[0,2], title_addon="")
# Colour-scale limits for the g5 and g10 judge variants respectively.
gran_5_range = [3,5]
gran_10_range = [7,10]
# Row 0 shows the Mistral g5 judge. It previously plotted "_mistral_g10" on the
# g5 colour scale, duplicating row 2 and leaving the g5 scores unplotted.
row_heat_map(judge_identifier="_mistral_g5", y_range=gran_5_range, row_idx=0)
row_heat_map(judge_identifier="_llama_g5", y_range=gran_5_range, row_idx=1)
row_heat_map(judge_identifier="_mistral_g10", y_range=gran_10_range, row_idx=2)
row_heat_map(judge_identifier="_llama_g10", y_range=gran_10_range, row_idx=3)



None