In [None]:
import pandas as pd
import numpy as np

# Select which experiment run to analyse. Result files follow the pattern
#   full_exp_<MODEL>_gran<GRAN>_64_128.json  and  full_exp_<MODEL>_gran<GRAN>.json
# Combinations referenced so far: MODEL in {'llama8b', 'mistral'}, GRAN in {5, 10}.
MODEL = 'mistral'
GRAN = 5

exp_result_path_part1 = f'full_exp_{MODEL}_gran{GRAN}_64_128.json'
exp_result_path_part2 = f'full_exp_{MODEL}_gran{GRAN}.json'

# Each run is stored in two files; stack them into one frame with a fresh index.
df1 = pd.read_json(exp_result_path_part1)
df2 = pd.read_json(exp_result_path_part2)
df = pd.concat([df1, df2], axis=0, ignore_index=True)

# `topics` is the sorted array of distinct topics; `topic_id` is each row's
# position in that array. return_inverse yields the positions in one pass
# instead of scanning `topics` once per row.
topics, topic_ids = np.unique(df['topic'].to_numpy(), return_inverse=True)
df['topic_id'] = topic_ids
df.columns

In [None]:
import pandas as pd
from itertools import product

# The experiment varying configurations
# Axes of the experiment grid: every observed few-shot count crossed with
# every observed token budget.
few_shot_nrs = np.unique(df['few_shot_nr'])
max_tokens = np.unique(df['max_tokens'])
configurations = list(product(few_shot_nrs, max_tokens))

# Columns summarised for each configuration.
features = ['total_time_seconds', 'prompt_token_count',
            'output_token_count', 'total_tokens',
            'relevance_score', 'coherence_score', 'compliance_score']

# results: {(few_shot_nr, max_tokens): DataFrame indexed by ('mean', 'std')
#           with one column per feature}
results = {}
for conf in configurations:
    n_shots, token_budget = conf
    in_conf = (df['few_shot_nr'] == n_shots) & (df['max_tokens'] == token_budget)
    results[conf] = df[in_conf][features].agg(['mean', 'std'])

# Long format: one row per (configuration, feature) carrying its mean and std.
plot_data = [
    {
        'few_shot_nr': conf[0],
        'max_tokens': conf[1],
        'feature': feature,
        'mean': stats.loc['mean', feature],
        'std': stats.loc['std', feature],
    }
    for conf, stats in results.items()
    for feature in features
]

df_plot = pd.DataFrame(plot_data)

In [None]:
import matplotlib.pyplot as plt

def plot_metric_vs_few_shot(df_plot, metric, x='few_shot_nr', style='max_tokens', y_range=None, err=None):
    """
    Plot the mean of one metric against `x`, drawing one line per distinct `style` value.

    Parameters:
    - df_plot: DataFrame with columns ['few_shot_nr', 'max_tokens', 'feature', 'mean', 'std']
    - metric: str, value of the 'feature' column to plot (e.g., 'total_time_seconds', 'relevance_score')
    - x: str, column to use for x-axis (default: 'few_shot_nr')
    - style: str, column to differentiate lines (default: 'max_tokens')
    - y_range: optional non-empty list or tuple passed to `set_ylim` (e.g. [7, 11]);
      None or an empty sequence keeps matplotlib's automatic limits
    - err: how to show mean +/- std: 'band' for a shaded region, 'bar' for error bars;
      any other value (default None) draws the means only

    The figure is shown with `plt.show()`; nothing is returned.
    """
    # Keep only the rows of the requested metric.
    df_metric = df_plot[df_plot['feature'] == metric]

    _, ax = plt.subplots(figsize=(10, 6))

    for val in sorted(df_metric[style].unique()):
        # Sort by x so the line and its error band run left to right
        # regardless of the row order of df_plot.
        df_val = df_metric[df_metric[style] == val].sort_values(x)

        (line,) = ax.plot(
            df_val[x],
            df_val['mean'],
            marker='o',
            label=f"{style}={val}"
        )

        if err == 'band':
            ax.fill_between(
                df_val[x],
                df_val['mean'] - df_val['std'],
                df_val['mean'] + df_val['std'],
                alpha=0.2,
                color=line.get_color()  # tie the band to its own line
            )
        elif err == 'bar':
            # Error bars only; the means are already drawn by ax.plot above.
            ax.errorbar(
                df_val[x],
                df_val['mean'],
                yerr=df_val['std'],
                fmt='none',      # no additional line, uses the existing plotted line
                capsize=5,       # length of the bar caps
                ecolor='black',  # optional: remove for default color
                alpha=0.8
            )

    # The y-limits do not depend on the line being drawn, so apply them once.
    # Tuples are accepted as well as lists.
    if isinstance(y_range, (list, tuple)) and y_range:
        ax.set_ylim(y_range)

    ax.set_title(f"{metric.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} (means on 10 topics)")
    ax.set_xlabel(x.replace('_', ' ').title())
    ax.set_ylabel(metric.replace('_', ' ').title())
    ax.legend(title=style)
    ax.grid(True)
    plt.show()


In [None]:
# Generation time per configuration, with a shaded mean +/- std band.
plot_metric_vs_few_shot(df_plot=df_plot, metric='total_time_seconds', err='band')

In [None]:
# Mean relevance score per configuration.
# NOTE(review): y_range=[7, 11] looks tuned for a 10-point scale; confirm it
# still frames the data when the gran5 result files are the ones loaded.
plot_metric_vs_few_shot(df_plot=df_plot, metric='relevance_score', y_range=[7, 11])

In [None]:
# Mean coherence score per configuration.
# NOTE(review): y_range=[8, 11] looks tuned for a 10-point scale; confirm it
# still frames the data when the gran5 result files are the ones loaded.
plot_metric_vs_few_shot(df_plot=df_plot, metric='coherence_score', y_range=[8, 11])

In [None]:
# Mean compliance score per configuration.
# NOTE(review): y_range=[7, 11] looks tuned for a 10-point scale; confirm it
# still frames the data when the gran5 result files are the ones loaded.
plot_metric_vs_few_shot(df_plot=df_plot, metric='compliance_score', y_range=[7, 11])

In [None]:
import seaborn as sns
def _score_grid(df_plot, feature):
    """
    Return (rows, grid) for one feature of df_plot.

    rows is the slice of df_plot for `feature` with a fresh 0..n-1 index
    (the old index is kept as an 'index' column by reset_index()).
    grid holds the 'mean' values reshaped to one row per distinct
    few_shot_nr. This relies on df_plot listing configurations grouped by
    few_shot_nr and then by max_tokens, which is how the aggregation cell
    builds it. The row count is derived from the data rather than fixed at 4,
    so a different number of few-shot settings still reshapes correctly.
    """
    rows = df_plot[df_plot['feature'] == feature].reset_index()
    n_few_shot = rows['few_shot_nr'].nunique()
    grid = rows['mean'].to_numpy().reshape(n_few_shot, -1)
    return rows, grid


df_rs, hm_rs = _score_grid(df_plot, 'relevance_score')
df_cs, hm_cs = _score_grid(df_plot, 'coherence_score')
df_comp_s, hm_comp_s = _score_grid(df_plot, 'compliance_score')


In [None]:
import matplotlib.pyplot as plt

# Colour scale bounds shared by the three heatmaps.
gran = 5   # upper bound of the colour scale (vmax)
vmin = 3   # lower bound; named vmin so the builtin min() is not shadowed
cmap = 'viridis'

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (panel title, per-feature rows, grid of means) for each score.
panels = [
    ('relevance score', df_rs, hm_rs),
    ('coherence score', df_cs, hm_cs),
    ('compliance score', df_comp_s, hm_comp_s),
]

for ax, (title, df_feat, grid) in zip(axes, panels):
    # Tick labels are the distinct values in order of appearance, which matches
    # the row-major layout of the grid. This avoids the fixed-stride slices
    # (.loc[0::5], .loc[:5]) that only lined up for exactly five token budgets
    # and, because .loc is end-inclusive, produced one x label too many.
    sns.heatmap(grid, ax=ax, vmin=vmin, vmax=gran, cmap=cmap,
                yticklabels=df_feat['few_shot_nr'].drop_duplicates().tolist(),
                xticklabels=df_feat['max_tokens'].drop_duplicates().tolist())
    ax.set_title(title)
    ax.set_ylabel('few shot nr')
    ax.set_xlabel('max token len')

# Keep the cell from echoing the last expression.
None

In [None]:
# Summary statistics (pandas describe()) of the scored few-shot examples.
# NOTE(review): this rebinds `df`, which earlier cells use for the experiment
# results; re-running those cells after this one will see the summary instead.
few_shot_examples_path = 'few_shot_examples_with_scores.json'

few_shot_examples = pd.read_json(few_shot_examples_path)
df = few_shot_examples.describe()

In [None]:
# Display whatever `df` is currently bound to; when cells run in order this is
# the describe() summary produced by the preceding cell.
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# few_shot_examples_path = 'llama_10_few_shot_examples_with_scores.json'
# few_shot_examples_w_err_path = 'llama_10_few_shot_examples_w_err_scores.json'

few_shot_examples_path = 'mistral_10_few_shot_examples_with_scores.json'
few_shot_examples_w_err_path = 'mistral_10_few_shot_examples_w_err_scores.json'

df = pd.read_json(few_shot_examples_path)
df_err = pd.read_json(few_shot_examples_w_err_path)

score_columns = ['relevance_score', 'coherence_score', 'compliance_score']

# Each score column is laid out as a 2-row grid for display.
# NOTE(review): the 2-row layout is hard-coded; confirm it suits the number of examples.
hm_rs, hm_cs, hm_comp_s = (
    df[col].to_numpy().reshape(2, -1) for col in score_columns
)
hm_rs_err, hm_cs_err, hm_comp_s_err = (
    df_err[col].to_numpy().reshape(2, -1) for col in score_columns
)

# Colour scale bounds shared by all six heatmaps.
gran = 10  # upper bound of the colour scale (vmax)
vmin = 1   # lower bound; named vmin so the builtin min() is not shadowed
cmap = 'viridis'

# Top row: scores of the few-shot examples loaded from few_shot_examples_path.
# Bottom row: scores of the few-shot examples with errors generated by GPT5.1
# (few_shot_examples_w_err_path). Comparing the rows is meant to show that the
# model can judge.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

grids = [
    [hm_rs, hm_cs, hm_comp_s],
    [hm_rs_err, hm_cs_err, hm_comp_s_err],
]
for ax_row, grid_row in zip(axes, grids):
    for ax, grid in zip(ax_row, grid_row):
        sns.heatmap(grid, ax=ax, vmin=vmin, vmax=gran, cmap=cmap)

axes[0,0].set_title('relevance score')
axes[0,1].set_title('coherence score')
axes[0,2].set_title('compliance score')

# Label the rows so the figure can be read without the code.
axes[0,0].set_ylabel('few shot examples')
axes[1,0].set_ylabel('few shot examples with errors')

# Keep the cell from echoing the last expression.
None