In [None]:
import pandas as pd
import numpy as np

# Path to the experiment results file, resolved relative to the notebook's working directory.
exp_result_path = 'full_exp_llama8b_gran5.json'
df = pd.read_json(exp_result_path)

# np.unique returns the sorted distinct topics and, with return_inverse=True,
# each row's index into that sorted array. This gives every row its topic id
# in one call instead of scanning `topics` once per row.
topics, topic_ids = np.unique(df['topic'], return_inverse=True)
df['topic_id'] = topic_ids
df.columns

In [None]:
import pandas as pd
from itertools import product

# The experiment's varying axes: every distinct value seen in the data
few_shot_nrs = np.unique(df['few_shot_nr'])
max_tokens = np.unique(df['max_tokens'])

features = ['total_time_seconds', 'prompt_token_count',
            'output_token_count', 'total_tokens',
            'relevance_score', 'coherence_score', 'compliance_score']

# results : {(few_shot_nr, max_tokens): df with index=(mean, std) and columns=features}
# One groupby pass replaces re-filtering the whole frame per configuration.
# It only yields (few_shot_nr, max_tokens) pairs that actually occur, so a
# combination that was never run no longer produces an empty slice with
# all-NaN statistics. Group keys come out sorted, which matches the order of
# the Cartesian product of the two sorted axes.
results = {
    conf: rows[features].agg(['mean', 'std'])
    for conf, rows in df.groupby(['few_shot_nr', 'max_tokens'])
}

# Configurations that are present in the data, in the same order as `results`
configurations = list(results)

# Flatten the results dict to long format: one row per (configuration, feature)
plot_data = [
    {
        'few_shot_nr': conf[0],
        'max_tokens': conf[1],
        'feature': feature,
        'mean': stats.loc['mean', feature],
        'std': stats.loc['std', feature]
    }
    for conf, stats in results.items()
    for feature in features
]

df_plot = pd.DataFrame(plot_data)
df_plot

In [None]:
import matplotlib.pyplot as plt

def plot_metric_vs_few_shot(df_plot, metric, x='few_shot_nr', style='max_tokens'):
    """
    Plots a metric (mean with a ± std band) against `x`, one line per distinct `style` value.

    Parameters:
    - df_plot: long-format DataFrame with columns
      ['few_shot_nr', 'max_tokens', 'feature', 'mean', 'std']
    - metric: str, the feature/metric to plot (e.g., 'total_time_seconds', 'relevance_score')
    - x: str, column to use for x-axis (default: 'few_shot_nr')
    - style: str, column to differentiate lines (default: 'max_tokens')

    Raises:
    - ValueError: if `metric` does not occur in df_plot['feature'].
    """
    # Filter the metric
    df_metric = df_plot[df_plot['feature'] == metric]
    if df_metric.empty:
        # Fail loudly instead of rendering an empty, misleading figure
        available = sorted(df_plot['feature'].unique())
        raise ValueError(
            f"Unknown metric {metric!r}; available metrics: {available}"
        )

    fig, ax = plt.subplots(figsize=(10, 6))

    for val in sorted(df_metric[style].unique()):
        # Sort along the x-axis so the line and its band are drawn left to
        # right even when df_plot itself is not ordered by `x`.
        df_val = df_metric[df_metric[style] == val].sort_values(x)

        (line,) = ax.plot(
            df_val[x],
            df_val['mean'],
            marker='o',
            label=f"{style}={val}"
        )

        # Tie the band colour to its line explicitly rather than relying on
        # the line and fill colour cycles staying in step.
        ax.fill_between(
            df_val[x],
            df_val['mean'] - df_val['std'],
            df_val['mean'] + df_val['std'],
            alpha=0.2,
            color=line.get_color()
        )

    ax.set_title(f"{metric.replace('_', ' ').title()} vs {x.replace('_', ' ').title()} (with ±STD)")
    ax.set_xlabel(x.replace('_', ' ').title())
    ax.set_ylabel(metric.replace('_', ' ').title())
    ax.legend(title=style)
    ax.grid(True)
    plt.show()


In [None]:
# Mean ± std of total_time_seconds vs few_shot_nr, one line per max_tokens (function defaults)
plot_metric_vs_few_shot(df_plot, 'total_time_seconds')

In [None]:
# Mean ± std of relevance_score vs few_shot_nr, one line per max_tokens (function defaults)
plot_metric_vs_few_shot(df_plot, 'relevance_score')

In [None]:
# Mean ± std of coherence_score vs few_shot_nr, one line per max_tokens (function defaults)
plot_metric_vs_few_shot(df_plot, 'coherence_score')

In [None]:
# Mean ± std of compliance_score vs few_shot_nr, one line per max_tokens (function defaults)
plot_metric_vs_few_shot(df_plot, 'compliance_score')