## Results Analysis

In [None]:
import pandas as pd
import numpy as np

# Locations of the merged LLM experiment results and of the per-engine TTS results.
llm_exp_result_path = 'llm_exp/full_exp_merged.json'
tts_exp_kokoro_result_path = 'tts_exp/tts_exp_kokoro/full_results.json'
tts_exp_piper_result_path = 'tts_exp/tts_exp_piper/full_results.json'

df_llm = pd.read_json(llm_exp_result_path)
df_kokoro, df_piper = map(
    pd.read_json,
    (tts_exp_kokoro_result_path, tts_exp_piper_result_path),
)

# Both TTS runs must expose the same columns, in the same order.
assert np.all(df_kokoro.columns == df_piper.columns)

# Flip the sign of the stored inference times.
# NOTE(review): presumably the raw files store the timing with the opposite sign — confirm.
for tts_frame in (df_piper, df_kokoro):
    tts_frame['inference_time'] = -tts_frame['inference_time']

In [None]:
# Pull the clip length out of each row's 'audio_metrics' record and expose it
# as a plain 'durations' column on both TTS frames.
kokoro_durations = [metrics['duration_s'] for metrics in df_kokoro['audio_metrics'].to_list()]
piper_durations = [metrics['duration_s'] for metrics in df_piper['audio_metrics'].to_list()]
df_kokoro['durations'] = kokoro_durations
df_piper['durations'] = piper_durations

# TTS columns that are carried over into the merged analysis frame.
tts_metrics = ['wer', 'inference_time', 'durations']

In [None]:
# Tag every TTS metric column with the engine that produced it; all other
# columns keep their names.
kokoro_names = {metric: metric + "_kokoro" for metric in tts_metrics}
piper_names = {metric: metric + "_piper" for metric in tts_metrics}
df_kokoro = df_kokoro.rename(columns=kokoro_names)
df_piper = df_piper.rename(columns=piper_names)

In [None]:
# Engine-suffixed metric names, matching the columns produced by the rename above.
tts_metrics_kokoro = [name + '_kokoro' for name in tts_metrics]
tts_metrics_piper = [name + '_piper' for name in tts_metrics]

# Copy the TTS metrics onto the LLM results frame.
# NOTE(review): this relies on df_llm and the TTS frames describing the same
# rows under the same index — confirm against how the result files are written.
df_llm[tts_metrics_kokoro] = df_kokoro[tts_metrics_kokoro]
df_llm[tts_metrics_piper] = df_piper[tts_metrics_piper]

In [None]:
# Short name used by the analysis cells for the LLM results frame (which by now
# also carries the TTS metric columns). This is an alias of df_llm, not a copy.
df = df_llm

In [None]:
import pandas as pd
from itertools import product

# Experiment grid: every (few_shot_nr, max_tokens) pair built from the values
# present in the data.
few_shot_nrs = np.unique(df['few_shot_nr'])
max_tokens = np.unique(df['max_tokens'])
configurations = list(product(few_shot_nrs, max_tokens))

########################################################################
# Individual experiments
# features = ['total_time_seconds', 'prompt_token_count',
#             'output_token_count', 'total_tokens',
#             'relevance_score', 'coherence_score', 'compliance_score']
########################################################################

########################################################################
# Merged experiments
features = [
    'total_time_seconds', 'prompt_token_count', 'output_token_count', 'total_tokens',
    'relevance_score_llama_g5', 'coherence_score_llama_g5', 'compliance_score_llama_g5',
    'relevance_score_llama_g10', 'coherence_score_llama_g10', 'compliance_score_llama_g10',
    'relevance_score_mistral_g5', 'coherence_score_mistral_g5', 'compliance_score_mistral_g5',
    'relevance_score_mistral_g10', 'coherence_score_mistral_g10', 'compliance_score_mistral_g10',
    'wer_piper', 'inference_time_piper', 'durations_piper',
    'wer_kokoro', 'inference_time_kokoro', 'durations_kokoro',
]
########################################################################

# results : {(few_shot_nr, max_tokens): df with index=(mean, std) and columns=features}
results = {}
for conf in configurations:
    shots, token_cap = conf
    in_conf = (df['few_shot_nr'] == shots) & (df['max_tokens'] == token_cap)
    # Collapse all rows of this configuration into one mean row and one std row.
    results[conf] = df.loc[in_conf, features].agg(['mean', 'std'])

# Long-format table for plotting: one row per (configuration, feature).
plot_data = [
    {
        'few_shot_nr': conf[0],
        'max_tokens': conf[1],
        'feature': feature,
        'mean': stats.loc['mean', feature],
        'std': stats.loc['std', feature],
    }
    for conf, stats in results.items()
    for feature in features
]

df_plot = pd.DataFrame(plot_data)

## Plots

In [None]:
# Suffix appended to the score feature names ('relevance_score', 'coherence_score',
# 'compliance_score') by the score plots in this section, i.e. it selects which
# judge model / granularity is shown. Leave exactly one line uncommented.
# exp_identifier = '_mistral_g5'
# exp_identifier = '_mistral_g10'
# exp_identifier = '_llama_g5'
exp_identifier = '_llama_g10'

In [None]:
import matplotlib.pyplot as plt

def plot_metric_vs_few_shot(df_plot, metric, x='few_shot_nr', style='max_tokens', y_range=None, err=None):
    """
    Plot the mean of one metric against `x`, drawing one line per value of `style`.

    Parameters:
    - df_plot: DataFrame with columns ['few_shot_nr', 'max_tokens', 'feature', 'mean', 'std']
    - metric: str, value of the 'feature' column to plot (e.g., 'total_time_seconds')
    - x: str, column to use for x-axis (default: 'few_shot_nr')
    - style: str, column whose distinct values each get their own line (default: 'max_tokens')
    - y_range: optional [low, high] list or tuple fixing the y-axis limits;
      None (or an empty sequence) keeps matplotlib's automatic limits
    - err: how to show the spread around the mean:
        None   -> means only
        'band' -> shaded area from mean - std to mean + std
        'bar'  -> error bars of length std
      Any other value is ignored (means only).

    Raises:
    - ValueError: if `metric` does not occur in df_plot['feature'].
    """
    # Keep only the rows of the requested metric
    df_metric = df_plot[df_plot['feature'] == metric]
    if df_metric.empty:
        # Fail loudly instead of silently showing an empty figure for a misspelled metric.
        raise ValueError(
            f"metric {metric!r} not found in df_plot['feature']; "
            f"available: {list(df_plot['feature'].unique())}"
        )

    plt.figure(figsize=(10, 6))

    for val in sorted(df_metric[style].unique()):
        df_val = df_metric[df_metric[style] == val]

        plt.plot(
            df_val[x],
            df_val['mean'],
            marker='o',
            label=f"{style}={val}"
        )

        if err == 'band':
            plt.fill_between(
                df_val[x],
                df_val['mean'] - df_val['std'],
                df_val['mean'] + df_val['std'],
                alpha=0.2
            )
        elif err == 'bar':
            # Error bars only; the connecting line is the one drawn by plt.plot above
            plt.errorbar(
                df_val[x],
                df_val['mean'],
                yerr=df_val['std'],
                fmt='none',      # no additional line
                capsize=5,       # length of the bar caps
                ecolor='black',
                alpha=0.8
            )

    # The limits do not depend on the line being drawn, so apply them once.
    # Tuples are accepted as well as lists.
    if y_range and isinstance(y_range, (list, tuple)):
        plt.gca().set_ylim(y_range)

    metric_label = metric.replace('_', ' ').title()
    x_label = x.replace('_', ' ').title()
    # The "10 topics" in the title is hard-coded, not derived from df_plot.
    plt.title(f"{metric_label} vs {x_label} (means on 10 topics)")
    plt.xlabel(x_label)
    plt.ylabel(metric_label)
    plt.legend(title=style)
    plt.grid(True)
    plt.show()


In [None]:
# Generation time, with a shaded mean ± std band for each max_tokens setting.
plot_metric_vs_few_shot(df_plot, metric='total_time_seconds', err='band')

In [None]:
# Word error rate of the Kokoro audio, with a shaded mean ± std band.
plot_metric_vs_few_shot(df_plot, metric='wer_kokoro', err='band')

In [None]:
# Word error rate of the Piper audio (means only, no spread shown).
plot_metric_vs_few_shot(df_plot, metric='wer_piper')

In [None]:
# Relevance score for the selected judge / granularity, y-axis fixed to [2, 6].
plot_metric_vs_few_shot(
    df_plot,
    metric=f'relevance_score{exp_identifier}',
    y_range=[2, 6],
)

In [None]:
# Coherence score for the selected judge / granularity, y-axis fixed to [1, 6].
plot_metric_vs_few_shot(
    df_plot,
    metric=f'coherence_score{exp_identifier}',
    y_range=[1, 6],
)

In [None]:
# Compliance score for the selected judge / granularity, y-axis fixed to [1, 6].
plot_metric_vs_few_shot(
    df_plot,
    metric=f'compliance_score{exp_identifier}',
    y_range=[1, 6],
)

In [None]:
import seaborn as sns
def _mean_score_grid(plot_frame, feature):
    """Return (rows, grid) for one feature of the long-format plot table.

    `rows` are the entries of `plot_frame` whose 'feature' equals `feature`,
    re-indexed from 0. `grid` holds their 'mean' values reshaped to one row per
    distinct few_shot_nr. The reshape relies on the rows being ordered with
    few_shot_nr varying slowest, which is how df_plot is built in this notebook.

    Raises ValueError if the feature is not present.
    """
    rows = plot_frame[plot_frame['feature'] == feature].reset_index()
    if rows.empty:
        raise ValueError(f"feature {feature!r} not found in the plot table")
    # Derive the number of heatmap rows from the data instead of hard-coding it.
    n_few_shot = rows['few_shot_nr'].nunique()
    return rows, rows['mean'].to_numpy().reshape(n_few_shot, -1)


# One (table, grid) pair per quality score of the selected judge / granularity.
df_rs, hm_rs = _mean_score_grid(df_plot, f'relevance_score{exp_identifier}')
df_cs, hm_cs = _mean_score_grid(df_plot, f'coherence_score{exp_identifier}')
df_comp_s, hm_comp_s = _mean_score_grid(df_plot, f'compliance_score{exp_identifier}')


In [None]:
import matplotlib.pyplot as plt

# Colour scale shared by the three panels.
# NOTE(review): gran = 10 looks tied to the '_g10' score granularity — confirm
# whether it should change when a '_g5' exp_identifier is selected.
gran = 10          # upper end of the colour scale (vmax)
score_floor = 3    # lower end of the colour scale (vmin); not named `min` so the builtin stays usable
cmap = 'viridis'

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (panel title, grid of mean scores, table the grid was built from)
panels = [
    ('relevance score', hm_rs, df_rs),
    ('coherence score', hm_cs, df_cs),
    ('compliance score', hm_comp_s, df_comp_s),
]

for ax, (title, grid, rows) in zip(axes, panels):
    # Tick labels come from the distinct values in order of first appearance,
    # which matches the row/column order of the reshaped grid. This gives one
    # label per row/column; the former `.loc[:5]` slice is label-inclusive and
    # produced one x label more than the grid has columns.
    sns.heatmap(
        grid, ax=ax, vmin=score_floor, vmax=gran, cmap=cmap,
        yticklabels=list(rows['few_shot_nr'].unique()),
        xticklabels=list(rows['max_tokens'].unique()),
    )
    ax.set_title(title)
    ax.set_ylabel('few shot nr')
    ax.set_xlabel('max token len')

## Few shot experiment

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Score files for the judge model under inspection; swap the commented pair in
# to look at the other judge.
# few_shot_examples_path = 'llama_10_few_shot_examples_with_scores.json'
# few_shot_examples_w_err_path = 'llama_10_few_shot_examples_w_err_scores.json'

few_shot_examples_path = 'mistral_10_few_shot_examples_with_scores.json'
few_shot_examples_w_err_path = 'mistral_10_few_shot_examples_w_err_scores.json'

# Note: this rebinds `df` (and the hm_* grids) used by the results-analysis cells.
df = pd.read_json(few_shot_examples_path)
df_err = pd.read_json(few_shot_examples_w_err_path)

score_columns = ['relevance_score', 'coherence_score', 'compliance_score']

# Lay the per-example scores out on a 2-row grid, one grid per score.
hm_rs, hm_cs, hm_comp_s = (
    df[column].to_numpy().reshape(2, -1) for column in score_columns
)
hm_rs_err, hm_cs_err, hm_comp_s_err = (
    df_err[column].to_numpy().reshape(2, -1) for column in score_columns
)

gran = 10          # upper end of the colour scale (vmax)
score_floor = 1    # lower end of the colour scale (vmin); not named `min` so the builtin stays usable
cmap = 'viridis'

# Top row: scores of the few-shot examples as given.
# Bottom row: scores of the variants with errors (generated by GPT5.1),
# to show that the model can judge.
# NOTE(review): the earlier comment placed the error examples in the first row,
# but the `_w_err_` file has always been drawn in the second row — confirm.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

figure_rows = [
    ('few shot examples', (hm_rs, hm_cs, hm_comp_s)),
    ('few shot examples with errors', (hm_rs_err, hm_cs_err, hm_comp_s_err)),
]
for row_axes, (row_label, grids) in zip(axes, figure_rows):
    for ax, grid in zip(row_axes, grids):
        sns.heatmap(grid, ax=ax, vmin=score_floor, vmax=gran, cmap=cmap)
    # Label each row so the figure can be read without the code.
    row_axes[0].set_ylabel(row_label)

for ax, title in zip(axes[0], ('relevance score', 'coherence score', 'compliance score')):
    ax.set_title(title)

## Merge experiment results (Data processing)

### Merge experiment scores for max_token_len: [256, 512, 1024]

In [None]:
import pandas as pd
import numpy as np

# Base results without judge scores.
df_no_scores = pd.read_json('full_exp_no_scores.json')

# One scored copy per judge model (llama8b, mistral) and granularity (5, 10).
df_llama_gran5, df_llama_gran10, df_mistral_gran5, df_mistral_gran10 = map(
    pd.read_json,
    (
        'full_exp_llama8b_gran5.json',
        'full_exp_llama8b_gran10.json',
        'full_exp_mistral_gran5.json',
        'full_exp_mistral_gran10.json',
    ),
)

In [None]:
# The four scored experiments hold the quality scores obtained with each judge
# model tested and each of the two granularities tested (5, 10); they must all
# expose the same columns.
assert np.all(df_llama_gran5.columns == df_llama_gran10.columns)
assert np.all(df_mistral_gran5.columns == df_llama_gran10.columns)
assert np.all(df_mistral_gran5.columns == df_mistral_gran10.columns)

# Bring every frame into the same row order, sorted on the merge key.
key = ["few_shot_nr", "max_tokens"]

dfs = [df_no_scores, df_llama_gran5, df_llama_gran10, df_mistral_gran5, df_mistral_gran10]
dfs = [frame.sort_values(key).reset_index(drop=True) for frame in dfs]

# The scores are copied over by row position below, so every scored frame must
# list exactly the same key values, row for row, as the base frame.
# NOTE(review): rows sharing one key are still paired purely by their position
# within that key — confirm the source files list them in the same order.
for frame in dfs[1:]:
    assert np.array_equal(frame[key].to_numpy(), dfs[0][key].to_numpy()), \
        "scored frame does not line up with df_no_scores on (few_shot_nr, max_tokens)"

df_no_scores, df_llama_gran5, df_llama_gran10, df_mistral_gran5, df_mistral_gran10 = dfs

# Tag the score columns with the judge model / granularity they come from
df_llama_gran5 = df_llama_gran5.rename(columns=lambda c: c + "_llama_g5" if "score" in c else c)
df_llama_gran10 = df_llama_gran10.rename(columns=lambda c: c + "_llama_g10" if "score" in c else c)
df_mistral_gran5 = df_mistral_gran5.rename(columns=lambda c: c + "_mistral_g5" if "score" in c else c)
df_mistral_gran10 = df_mistral_gran10.rename(columns=lambda c: c + "_mistral_g10" if "score" in c else c)

# Copy the tagged score columns onto the base frame by row position
for scored, suffix in (
    (df_llama_gran5, "_llama_g5"),
    (df_llama_gran10, "_llama_g10"),
    (df_mistral_gran5, "_mistral_g5"),
    (df_mistral_gran10, "_mistral_g10"),
):
    tagged = scored.filter(regex=f"{suffix}$")
    df_no_scores[tagged.columns] = tagged.values

### Merge experiment scores for max_token_len: [64, 128]

In [None]:
import pandas as pd
import numpy as np

# Base results (without judge scores) of the [64, 128] max_tokens runs.
df_no_scores_64_128 = pd.read_json('full_exp_no_scores_64_128.json')

# Matching scored copies, one per judge model (llama8b, mistral) and granularity (5, 10).
df_llama_gran5, df_llama_gran10, df_mistral_gran5, df_mistral_gran10 = map(
    pd.read_json,
    (
        'full_exp_llama8b_gran5_64_128.json',
        'full_exp_llama8b_gran10_64_128.json',
        'full_exp_mistral_gran5_64_128.json',
        'full_exp_mistral_gran10_64_128.json',
    ),
)

In [None]:
# The four scored experiments hold the quality scores obtained with each judge
# model tested and each of the two granularities tested (5, 10); they must all
# expose the same columns.
assert np.all(df_llama_gran5.columns == df_llama_gran10.columns)
assert np.all(df_mistral_gran5.columns == df_llama_gran10.columns)
assert np.all(df_mistral_gran5.columns == df_mistral_gran10.columns)

# Bring every frame into the same row order, sorted on the merge key.
key = ["few_shot_nr", "max_tokens"]

dfs = [df_no_scores_64_128, df_llama_gran5, df_llama_gran10, df_mistral_gran5, df_mistral_gran10]
dfs = [frame.sort_values(key).reset_index(drop=True) for frame in dfs]

# The scores are copied over by row position below, so every scored frame must
# list exactly the same key values, row for row, as the base frame.
# NOTE(review): rows sharing one key are still paired purely by their position
# within that key — confirm the source files list them in the same order.
for frame in dfs[1:]:
    assert np.array_equal(frame[key].to_numpy(), dfs[0][key].to_numpy()), \
        "scored frame does not line up with df_no_scores_64_128 on (few_shot_nr, max_tokens)"

df_no_scores_64_128, df_llama_gran5, df_llama_gran10, df_mistral_gran5, df_mistral_gran10 = dfs

# Tag the score columns with the judge model / granularity they come from
df_llama_gran5 = df_llama_gran5.rename(columns=lambda c: c + "_llama_g5" if "score" in c else c)
df_llama_gran10 = df_llama_gran10.rename(columns=lambda c: c + "_llama_g10" if "score" in c else c)
df_mistral_gran5 = df_mistral_gran5.rename(columns=lambda c: c + "_mistral_g5" if "score" in c else c)
df_mistral_gran10 = df_mistral_gran10.rename(columns=lambda c: c + "_mistral_g10" if "score" in c else c)

# Copy the tagged score columns onto the base frame by row position
for scored, suffix in (
    (df_llama_gran5, "_llama_g5"),
    (df_llama_gran10, "_llama_g10"),
    (df_mistral_gran5, "_mistral_g5"),
    (df_mistral_gran10, "_mistral_g10"),
):
    tagged = scored.filter(regex=f"{suffix}$")
    df_no_scores_64_128[tagged.columns] = tagged.values

### Concatenate experiment scores for max_token_len [64, 128] and [256, 512, 1024]

In [None]:
# Manual sanity checks on the values covered by each half (uncomment to inspect):
# np.unique(df_no_scores['few_shot_nr'])
# np.unique(df_no_scores_64_128['few_shot_nr'])
# np.unique(df_no_scores['max_tokens'])
# np.unique(df_no_scores_64_128['max_tokens'])
# Stack the [64, 128] frame on top of the [256, 512, 1024] frame and renumber
# the rows from 0.
final = pd.concat([df_no_scores_64_128, df_no_scores], ignore_index=True)

In [None]:
# Write the merged table as a list of row records, indented by 2 spaces.
# NOTE(review): with a path argument `to_json` is expected to return None, so
# `result` likely carries no data — confirm before relying on it.
# NOTE(review): the analysis section reads 'llm_exp/full_exp_merged.json';
# confirm the file written here ends up at that location.
result = final.to_json('full_exp_merged.json',orient='records', indent=2)