# Analyzing how coherence scales with model size

## Importing libraries and loading data

In [1]:
from src.analyzer import Analyzer
from src.visualizer import VisualisationConfig, visualize
from src.metrics import pairwise_bce_of_group, nbce_of_group, cbc_of_group, scs_of_group, rbc_of_group
import altair as alt
import numpy as np
import pandas as pd

# Switch Altair to the "vegafusion" data transformer.
alt.data_transformers.enable("vegafusion")

# Every CSV listed here is read and stacked into a single dataframe.
logprob_data_paths = ["data/logprobs.csv"]

# Stack all loaded frames and renumber the rows 0..n-1 in one step.
logprobs = pd.concat(
    [pd.read_csv(csv_path) for csv_path in logprob_data_paths],
    ignore_index=True,
)

# Wrap the combined data in the project's Analyzer and display it.
data_analyzer = Analyzer(logprobs)
data_analyzer.df

In [2]:
# Display the column names of the loaded data.
data_analyzer.df.columns

## Renaming columns, models and adding model family and size columns

In [None]:
# Display the distinct values of the raw "model_name" and "model_kwargs" columns.
np.unique(data_analyzer.df["model_name"]), np.unique(data_analyzer.df["model_kwargs"])

In [4]:
# evals from open-llm-leaderboard
# Reads the leaderboard table from a parquet file addressed with an "hf://" URL.
# NOTE(review): presumably this needs network access and a Hugging Face
# filesystem backend for pandas/fsspec — confirm in the target environment.
evals_df = pd.read_parquet(
    "hf://datasets/open-llm-leaderboard/contents/data/train-00000-of-00001.parquet"
)
# Display the available leaderboard columns.
evals_df.columns

In [5]:
# Give the raw columns their display names.
data_analyzer.rename({"model_name": "Language Model", "model_kwargs": "Training Steps"})

# One leaderboard row per model, indexed by its full name. When a model
# appears more than once, the first row is the one that is kept.
leaderboard = evals_df.drop_duplicates(subset=["fullname"]).set_index("fullname")

params_lookup = leaderboard["#Params (B)"].to_dict()

# Add the '#Params (B)' column to data_analyzer.df
data_analyzer.add_column(
    column_name="#Params (B)",
    column_spec=lambda df: df["Language Model"]
    .map(params_lookup)
    .astype(float),  # Ensure it's float for numerical operations
)

# Leaderboard scores to attach to every row (each listed exactly once, so no
# column is added twice).
selected_evals = [
    "IFEval",
    "BBH",
    "MATH Lvl 5",
    "GPQA",
    "MUSR",
    "MMLU-PRO",
    "Average ⬆️",
]

# {full model name: {eval name: score}}, built in a single pass over the
# de-duplicated leaderboard instead of re-filtering it per model and per eval.
eval_lookup = leaderboard[selected_evals].to_dict(orient="index")

# Add one column per eval using direct dictionary lookups. `eval_name` is
# bound as a default argument so each lambda keeps its own eval; models that
# are missing from the leaderboard get NaN.
for eval_name in selected_evals:
    data_analyzer.add_column(
        column_name=eval_name,
        column_spec=lambda df, eval_name=eval_name: df["Language Model"].map(
            lambda model: eval_lookup.get(model, {}).get(eval_name, np.nan)
        ),
    )

# Model family = text between the first "/" and the following "-" of the
# model name (the segment after the first "/", cut at its first "-").
data_analyzer.add_column(
    column_name="Model Family",
    column_spec=(
        {"Language Model": lambda model_name: model_name.split("/")[1].split("-")[0]}
    ),
)


def format_model_kwargs(kwarg_str):
    """Turn a model-kwargs string into a short training-step label.

    Looks for a ``"revision": "step<N>"`` entry and returns ``N`` in
    thousands, rounded down (e.g. ``step33000`` -> ``"33k"``).

    Args:
        kwarg_str: The raw kwargs value. Anything that is not a string
            (for example NaN or None from an empty cell) is accepted.

    Returns:
        The label ``"<N // 1000>k"``, or ``"unknown"`` when the value is not
        a string, has no step revision, or the step is not a plain number.
    """
    marker = '"revision": "step'
    if not isinstance(kwarg_str, str) or marker not in kwarg_str:
        return "unknown"

    # Take the text right after the marker up to the closing quote, so an
    # unrelated earlier occurrence of "step" cannot be picked up instead.
    step = kwarg_str.split(marker, 1)[1].split('"', 1)[0]
    if not step.isdigit():
        return "unknown"

    # Convert to k format (e.g., 33000 -> 33k)
    return f"{int(step) // 1000}k"
    
    

# Keep only rows whose "Model Family" is one of the listed families.
data_analyzer.filter({"Model Family": ["Llama", "gpt2", "pythia", "Qwen2.5", "Falcon3"]})

# Apply display names: mapped family labels, the model name reduced to the
# segment after its first "/", and training-step labels produced by
# format_model_kwargs.
# NOTE(review): "Falcon3" passes the filter above but has no entry in the
# family mapping below — confirm whether it is meant to keep its raw label.
data_analyzer.rename(
    {
        "Model Family": {
            "Llama": "Llama 3",
            "gpt2": "GPT 2",
            "pythia": "Pythia",
            "Qwen2.5": "Qwen 2.5",
        },
        "Language Model": lambda model_name: model_name.split("/")[1],
        "Training Steps": format_model_kwargs,
    }
)

# Display the resulting dataframe.
data_analyzer.df

## Calculating All Coherence Metrics

We calculate all coherence metrics systematically using the same grouping strategy. We always group by evidence_text since different class types have different class and evidence elicitation prefixes.

`calculate_metric` returns a new `Analyzer` object rather than modifying the dataframe in place, which lets us chain operations.

In [7]:
# One entry per coherence metric: short key ("name"), the group-level metric
# function ("func"), extra keyword arguments for it ("kwargs"), and the name
# of the result column ("display_name").
metrics_config = [
    {
        "name": "BCE",
        "func": pairwise_bce_of_group,
        "kwargs": {"square": True},
        "display_name": "BCE (Pairwise MSE)",
    },
    {"name": "NBCE", "func": nbce_of_group, "kwargs": {}, "display_name": "NBCE"},
    {"name": "CBC", "func": cbc_of_group, "kwargs": {}, "display_name": "CBC"},
    {"name": "SCS", "func": scs_of_group, "kwargs": {}, "display_name": "SCS"},
    {"name": "RBC", "func": rbc_of_group, "kwargs": {}, "display_name": "RBC"},
]

# Columns that define one group for every metric.
group_by_cols = [
    "evidence_text",
    "class_category",
    "Language Model",
    "Training Steps",
    "conversation_history",
]

# Sort specification handed to Analyzer.sort for every metric table.
sort_config = {
    "#Params (B)": lambda x: x,
    "Training Steps": ["33k", "66k", "99k", "143k", "unknown"],
}

# Column names of the three log-probabilities, shared by all metrics.
logprob_columns = {
    "log_prior_col": "prior_logprob",
    "log_likelihood_col": "likelihood_logprob",
    "log_posterior_col": "posterior_logprob",
}

# Compute every metric with the same grouping, then sort and store it under
# the metric's short key.
analyzers = {}
for metric_config in metrics_config:
    analyzer = data_analyzer.calculate_metric(
        metric_name=metric_config["display_name"],
        metric_func=metric_config["func"],
        group_by_cols=group_by_cols,
        inherit_identical_values=True,
        **logprob_columns,
        **metric_config["kwargs"],
    )
    analyzer.sort(sort_config)
    analyzers[metric_config["name"]] = analyzer

print("Calculated metrics:")
for name, analyzer in analyzers.items():
    print(f"{name}: {analyzer.df.shape}")

# Older cells refer to the BCE table by this name.
bce_mse_analyzer = analyzers["BCE"]
bce_mse_analyzer.df.columns

## Calculate Mean Values for All Metrics


In [8]:
# Average each metric per language model. The result for every metric is
# stored under the metric's short key, in a column named "Mean <display name>".
mean_analyzers = {}

for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]
    source_analyzer = analyzers[name]

    mean_analyzer = source_analyzer.calculate_metric(
        metric_name=f"Mean {display_name}",
        metric_col=display_name,
        metric_func="mean",
        group_by_cols=["Language Model"],
        inherit_identical_values=True,
    )
    mean_analyzers[name] = mean_analyzer

print("Mean analyzers created:")
for name, analyzer in mean_analyzers.items():
    print(f"{name}: {analyzer.df.shape}")

# Older cells refer to the mean BCE table by this name.
mean_bce_mse_analyzer = mean_analyzers["BCE"]


In [30]:
# Display the NBCE metric table.
analyzers['NBCE'].df

## Systematic Visualization of All Metrics

Create consistent visualizations for all metrics using the visualizer.


In [9]:
# Create one line chart per metric using the visualizer: mean metric value per
# language model, colored by training step and faceted by model family.
for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]
    analyzer = analyzers[name]

    # Altair shorthand for aggregates of this metric's column. The median
    # needs the column name too: a bare "median()" has no field to aggregate.
    mean_field = f"mean({display_name}):Q"
    median_field = f"median({display_name}):Q"

    config = VisualisationConfig(
        plot_fn=alt.Chart.mark_line,
        fig_title=f"{name} by Model",
        x_category="Language Model:N",
        y_category=mean_field,
        color_category="Training Steps:N",
        facet_category="Model Family:N",
        facet_columns=10,
        tooltip_fields=[
            alt.Tooltip("Language Model:N", title="Model"),
            alt.Tooltip("Model Family:N", title="Family"),
            alt.Tooltip(mean_field, title=f"Mean {name}", format=".3f"),
            alt.Tooltip(median_field, title="Median", format=".3f"),
            alt.Tooltip("count():Q", title="Count", format="d"),
        ],
        titles={
            mean_field: f"Mean {name}",
        },
        chart_properties={"resolve": {"scale": {"x": "independent", "y": "shared"}}},
        legend_config={"columns": 5, "orient": "bottom"},
    )

    # Blank line before each header (a real newline, not the text "\n").
    print(f"\n=== {name} ===")
    chart = visualize(analyzer.df, config=config)
    chart.show()


## Combine the mean values of all metrics into one dataframe


In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Combine all mean metrics into a single long-format dataframe for comparison:
# one row per (model, metric) with the metric's mean in "Value".
combined_data = []

# Column holding the mean value in each metric's mean table.
metric_cols = {
    "BCE": "Mean BCE (Pairwise MSE)",
    "NBCE": "Mean NBCE",
    "CBC": "Mean CBC",
    "SCS": "Mean SCS",
    "RBC": "Mean RBC",
}

# Create a combined dataset
for metric_name, analyzer in mean_analyzers.items():
    df_copy = analyzer.df.copy()
    df_copy["Metric"] = metric_name
    df_copy["Value"] = df_copy[metric_cols[metric_name]]
    combined_data.append(df_copy[["Language Model", "Model Family", "#Params (B)", "Metric", "Value"]])

combined_df = pd.concat(combined_data, ignore_index=True)

# Drop rows that have no metric value (this does not filter on training steps).
combined_df = combined_df.dropna(subset=["Value"])

print("Combined dataframe shape:", combined_df.shape)
# Real newlines here; "\\n" would print a literal backslash followed by "n".
print("\nMetrics included:", combined_df["Metric"].unique())
print("\nModels included:", combined_df["Language Model"].unique())

# Display sample of the combined data
combined_df.head(10)


## Define a visualization config and visualize the data

src.visualizer provides a very simple interface for quickly visualizing lots of data by describing which column to map to which plotting element.

In [14]:
# Line chart of mean BCE (pairwise MSE) per language model, colored by
# training step and faceted by model family.
config = VisualisationConfig(
    plot_fn=alt.Chart.mark_line,
    fig_title="BCE (Pairwise MSE method) by Model",
    x_category="Language Model:N",
    y_category="mean(BCE (Pairwise MSE)):Q",
    color_category="Training Steps:N",
    facet_category="Model Family:N",
    facet_columns=10,
    tooltip_fields=[
        alt.Tooltip("Language Model:N", title="Model"),
        alt.Tooltip("Model Family:N", title="Family"),
        alt.Tooltip("mean(BCE (Pairwise MSE)):Q", title="Mean BCE", format=".3f"),
        # Aggregates other than count need a field; the previous bare
        # "median()" and "mean()" tooltips named none. The bare mean is
        # dropped because the mean of this column is already shown above.
        alt.Tooltip("median(BCE (Pairwise MSE)):Q", title="Median", format=".3f"),
        alt.Tooltip("count():Q", title="Count", format="d"),
    ],
    titles={
        "mean(BCE (Pairwise MSE)):Q": "Mean BCE (Pairwise MSE method)",
        "model_name": "Language Model",
    },
    chart_properties={"resolve": {"scale": {"x": "independent", "y": "shared"}}},
    legend_config={"columns": 5, "orient": "bottom"},
)

chart_mse = visualize(bce_mse_analyzer.df, config=config)

chart_mse.show()

## Plotting All Metrics by Model Size

Filter all metrics to keep only fully trained models (remove intermediate Pythia checkpoints).


In [15]:
# Filter all analyzers to keep only fully trained models: the final "143k"
# checkpoint, plus "unknown" for models without a step revision.
fully_trained_steps = ["unknown", "143k"]

for name, analyzer in analyzers.items():
    analyzer.filter({"Training Steps": fully_trained_steps})

# Recompute the per-model means from the filtered tables. The earlier means
# were averaged per language model before this filter, so filtering the mean
# tables afterwards could not remove intermediate checkpoints from them.
# NOTE(review): this replaces the objects stored in `mean_analyzers`; confirm
# no other cell keeps a reference to the old ones.
for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]
    mean_analyzers[name] = analyzers[name].calculate_metric(
        metric_func="mean",
        group_by_cols=["Language Model"],
        metric_col=display_name,
        metric_name=f"Mean {display_name}",
        inherit_identical_values=True,
    )

print("Training steps after filtering:")
for name, analyzer in analyzers.items():
    print(f"{name}: {analyzer.df['Training Steps'].unique()}")

# For backwards compatibility
bce_mse_analyzer = analyzers["BCE"]
mean_bce_mse_analyzer = mean_analyzers["BCE"]

In [16]:
# Create one scatter chart per metric: mean metric value against the GPQA
# score, colored by model family.
for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]
    analyzer = analyzers[name]

    # Altair shorthand for aggregates of this metric's column. The median
    # needs the column name too: a bare "median()" has no field to aggregate.
    mean_field = f"mean({display_name}):Q"
    median_field = f"median({display_name}):Q"

    config = VisualisationConfig(
        plot_fn=alt.Chart.mark_point,
        fig_title=f"{name} vs GPQA Performance",
        x_category="GPQA:Q",
        y_category=mean_field,
        color_category="Model Family:N",
        tooltip_fields=[
            alt.Tooltip("Language Model:N", title="Model"),
            alt.Tooltip("Model Family:N", title="Family"),
            alt.Tooltip(mean_field, title=f"Mean {name}", format=".3f"),
            alt.Tooltip("GPQA:Q", title="GPQA", format=".3f"),
            alt.Tooltip(median_field, title="Median", format=".3f"),
            alt.Tooltip("count():Q", title="Count", format="d"),
        ],
        titles={
            mean_field: f"Mean {name}",
        },
        # Do not force the y axis to start at zero.
        scale={
            mean_field: {"zero": False},
        },
        chart_properties={
            "resolve": {"scale": {"x": "independent", "y": "shared"}},
        },
        legend_config={"columns": 5, "orient": "bottom"},
    )

    # Blank line before each header (a real newline, not the text "\n").
    print(f"\n=== {name} vs GPQA ===")
    chart = visualize(analyzer.df, config=config)
    chart.show()

## Category Analysis: Distribution Across Groups

The first cell below plots each metric against model size and benchmark scores; the second shows how different class categories affect coherence for each metric across model families.

In [19]:
# Create systematic visualization and analysis plots for ALL metrics
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Function to fit log-linear model
def log_linear_fit(data, x_col, y_col, model_family=None):
    """Fit ``y = slope * log10(x) + intercept`` by least squares.

    Rows where x or y is missing/non-finite, or where x is not positive
    (log10 undefined), are left out of the fit.

    Args:
        data: Table holding the columns ``x_col`` and ``y_col`` (and
            "Model Family" when ``model_family`` is given).
        x_col: Name of the column that is log10-transformed.
        y_col: Name of the response column.
        model_family: If given (and truthy), only rows whose "Model Family"
            equals this value are used.

    Returns:
        A tuple ``(x_line, y_line, r_squared, slope)``. ``x_line`` holds 100
        points on the original x scale, evenly spaced in log10 between the
        smallest and largest usable x; ``y_line`` is the fitted value at each.
        With fewer than two usable rows, or when all usable x are equal,
        no line can be fitted: both arrays are empty and ``r_squared`` and
        ``slope`` are NaN.
    """
    if model_family:
        data = data[data["Model Family"] == model_family]

    x_raw = np.asarray(data[x_col], dtype=float)
    y_raw = np.asarray(data[y_col], dtype=float)

    # Keep only rows the regression can actually use.
    usable = np.isfinite(x_raw) & np.isfinite(y_raw) & (x_raw > 0)
    x = np.log10(x_raw[usable])  # Log transform x values
    y = y_raw[usable]

    if x.size < 2 or x.min() == x.max():
        return np.array([]), np.array([]), np.nan, np.nan

    # Fit linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    # Calculate line
    x_range = np.linspace(x.min(), x.max(), 100)
    y_range = slope * x_range + intercept

    # Transform x back to original scale for plotting
    x_range_original = 10**x_range

    return x_range_original, y_range, r_value**2, slope

# Create comprehensive plots for all metrics: one 2x2 figure per metric.
print("=== SYSTEMATIC PLOTTING FOR ALL METRICS ===")

# 1. Model size scaling plots with lines of best fit
for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]
    mean_analyzer = mean_analyzers[name]
    mean_df = mean_analyzer.df
    mean_col = f"Mean {display_name}"
    families = np.unique(mean_df["Model Family"])

    plt.figure(figsize=(12, 8))
    plt.suptitle(f"{name} Scaling Analysis", fontsize=16)

    # Plot 1: All families together with lines of best fit
    plt.subplot(2, 2, 1)
    for model_family in families:
        family_data = mean_df[mean_df["Model Family"] == model_family]
        plt.scatter(
            family_data["#Params (B)"],
            family_data[mean_col],
            label=model_family,
            alpha=0.7,
            s=80,
        )

        # Add fitted line. Rows without a size or a metric value cannot take
        # part in the fit, so count only complete rows.
        fit_data = family_data.dropna(subset=["#Params (B)", mean_col])
        if len(fit_data) > 1:
            x_line, y_line, r2, slope = log_linear_fit(fit_data, "#Params (B)", mean_col)
            plt.plot(
                x_line,
                y_line,
                "--",
                alpha=0.7,
                label=f"{model_family}: R² = {r2:.3f}",
            )

    plt.xscale("log")
    # The x values come from the "#Params (B)" column, i.e. billions.
    plt.xlabel("Model Size (billions of parameters)")
    plt.ylabel(f"Mean {name}")
    plt.title(f"{name} vs Model Size")
    plt.legend(fontsize=9, loc="best")
    plt.grid(True, alpha=0.3)

    # Plot 2: GPQA correlation
    plt.subplot(2, 2, 2)
    mask = mean_df["GPQA"].notna() & mean_df[mean_col].notna()
    if mask.sum() > 1:
        x = mean_df.loc[mask, "GPQA"].astype(float)
        y_data = mean_df.loc[mask, mean_col].astype(float)

        # Scatter plot with model families
        for model_family in families:
            family_mask = mask & (mean_df["Model Family"] == model_family)
            if family_mask.any():
                plt.scatter(
                    mean_df.loc[family_mask, "GPQA"],
                    mean_df.loc[family_mask, mean_col],
                    label=model_family,
                    alpha=0.7,
                    s=60,
                )

        # Add line of best fit
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y_data)
        line_x = np.linspace(x.min(), x.max(), 100)
        line_y = intercept + slope * line_x
        plt.plot(line_x, line_y, "r-", linewidth=2, alpha=0.8)

        # Add a 95% band around the line. The leading "1 +" term makes this
        # the band for a new observation (prediction band), which is wider
        # than the confidence band of the fitted mean.
        if len(x) > 2:
            mean_x = np.mean(x)
            n = len(x)
            df_res = n - 2
            mse = np.sum((y_data - (intercept + slope * x)) ** 2) / df_res
            se = np.sqrt(mse * (1 + 1 / n + (line_x - mean_x) ** 2 / np.sum((x - mean_x) ** 2)))
            t_val = stats.t.ppf(0.975, df_res)
            ci = t_val * se
            plt.fill_between(line_x, line_y - ci, line_y + ci, color="r", alpha=0.2)

        plt.text(0.05, 0.95, f"R² = {r_value**2:.3f}", transform=plt.gca().transAxes,
                 verticalalignment="top", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8))

    plt.xlabel("GPQA Score")
    plt.ylabel(f"Mean {name}")
    plt.title(f"{name} vs GPQA Performance")
    # Only draw a legend when something was plotted; otherwise matplotlib
    # warns that there are no labelled artists.
    if plt.gca().get_legend_handles_labels()[0]:
        plt.legend(fontsize=8)
    plt.grid(True, alpha=0.3)

    # Plot 3: Distribution across model families (box plot)
    plt.subplot(2, 2, 3)
    family_data_list = []
    family_labels = []
    for family in families:
        # Missing values would make the box statistics NaN, so drop them.
        family_values = mean_df.loc[mean_df["Model Family"] == family, mean_col].dropna()
        if len(family_values) > 0:
            family_data_list.append(family_values)
            family_labels.append(family)

    if family_data_list:
        plt.boxplot(family_data_list)
        # Boxes sit at positions 1..N. Labelling them through xticks avoids
        # boxplot's `labels` argument, which newer matplotlib deprecates.
        plt.xticks(range(1, len(family_labels) + 1), family_labels, rotation=45)
    plt.ylabel(f"Mean {name}")
    plt.title(f"{name} Distribution by Model Family")
    plt.grid(True, alpha=0.3)

    # Plot 4: All evaluation metrics correlation
    plt.subplot(2, 2, 4)
    eval_metrics = ["IFEval", "BBH", "MATH Lvl 5", "MUSR", "MMLU-PRO", "Average ⬆️"]
    correlations = []
    eval_names = []

    for eval_metric in eval_metrics:
        mask = mean_df[eval_metric].notna() & mean_df[mean_col].notna()
        if mask.sum() > 1:
            corr, _ = stats.pearsonr(
                mean_df.loc[mask, eval_metric].astype(float),
                mean_df.loc[mask, mean_col].astype(float),
            )
            correlations.append(corr)
            eval_names.append(eval_metric)

    if correlations:
        colors = ['red' if c < 0 else 'green' for c in correlations]
        bars = plt.bar(range(len(correlations)), correlations, color=colors, alpha=0.7)
        plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        plt.ylabel(f"Correlation with {name}")
        plt.title(f"{name} vs Evaluation Metrics")
        plt.xticks(range(len(eval_names)), eval_names, rotation=45, ha='right')
        plt.grid(True, alpha=0.3)

        # Add correlation values on bars: above positive bars, below the rest.
        for bar, corr in zip(bars, correlations):
            label_y = bar.get_height() + 0.01 if corr > 0 else bar.get_height() - 0.03
            plt.text(bar.get_x() + bar.get_width() / 2, label_y,
                     f'{corr:.3f}', ha='center', va='bottom' if corr > 0 else 'top', fontsize=8)

    plt.tight_layout()
    plt.show()
    # Real newline here; "\\n" would print a literal backslash followed by "n".
    print(f"\nCompleted comprehensive analysis for {name}")
    print("-" * 50)

In [20]:
# Create category analysis for all metrics systematically
import seaborn as sns

def normalize_category(cat):
    """Turn a raw category id such as "some_category" into "Some Category"."""
    return cat.replace('_', ' ').title()


# For every metric: average it per (model, class category) and draw a grouped
# bar chart for the largest model of each family.
for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]
    analyzer = analyzers[name]

    # Real newline here; "\\n" would print a literal backslash followed by "n".
    print(f"\n=== {name} Category Analysis ===")

    # Calculate mean by category for each metric
    mean_category_analyzer = analyzer.calculate_metric(
        metric_func="mean",
        group_by_cols=["Language Model", "class_category"],
        metric_col=display_name,
        metric_name=f"Mean {display_name}",
        inherit_identical_values=True,
    )

    # Calculate counts for sample size information
    counts = (
        analyzer.df
        .groupby(['Language Model', 'class_category'])
        .size()
        .reset_index(name='n')
    )

    # Merge counts into the mean df
    df = mean_category_analyzer.df.copy()
    df = df.merge(counts, on=['Language Model', 'class_category'], how='left')

    # Tick label: readable category name with the sample size on a second line.
    df['category_with_n'] = df['class_category'].apply(normalize_category) + '\n(n=' + df['n'].astype(str) + ')'

    # Select the largest model per family for cleaner visualization. Models
    # with no known size are left out first: idxmax cannot rank them.
    sized_models = mean_analyzers[name].df.dropna(subset=['#Params (B)'])
    largest_models_per_family = sized_models.groupby('Model Family')['#Params (B)'].idxmax()
    largest_model_names = sized_models.loc[largest_models_per_family, 'Language Model'].values
    df_filtered = df[df['Language Model'].isin(largest_model_names)]

    # Create bar plot
    plt.figure(figsize=(14, 8))
    sns.barplot(data=df_filtered, x='category_with_n', y=f'Mean {display_name}', hue='Language Model')
    plt.xlabel('Class Category')
    plt.xticks(rotation=90)
    plt.ylabel(f'{name}')
    plt.legend(title='Model (Largest per Family)', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(f'{name} Distribution Across Class Categories\n(Largest model per family)')
    plt.tight_layout()
    plt.show()

## Pythia Training Steps Analysis: All Metrics

Analyze how all coherence metrics change during training for Pythia models, instead of just BCE.

In [21]:
# All metrics across training steps (pythia models only)
import copy

# Work on an independent copy. `filter` is called for its effect on the
# analyzer itself (its return value is never used in this notebook), so
# filtering an alias would also strip every other family from `data_analyzer`.
pythia_data_analyzer = copy.deepcopy(data_analyzer)

pythia_data_analyzer.filter({"Model Family": ["Pythia"]})

print("Pythia models and training steps available:")
print(pythia_data_analyzer.df[["Training Steps", "Language Model"]].drop_duplicates().sort_values(["Language Model", "Training Steps"]))

In [22]:
# Compute every coherence metric on the Pythia-only data, then average each
# one per (model, training step).
pythia_metrics = {}

# Result column of each metric, keyed by the metric's short name.
metric_col_names = {
    "BCE": "BCE (Pairwise MSE)",
    "NBCE": "NBCE",
    "CBC": "CBC",
    "SCS": "SCS",
    "RBC": "RBC",
}

# Columns that define one group for every metric.
pythia_group_cols = [
    "evidence_text",
    "class_category",
    "Language Model",
    "Training Steps",
    "conversation_history",
]

# Sort specification handed to Analyzer.sort for every mean table.
pythia_sort_config = {
    "#Params (B)": lambda x: x,
    "Training Steps": ["33k", "66k", "99k", "143k", "unknown"],
}

for metric_config in metrics_config:
    name = metric_config["name"]
    display_name = metric_config["display_name"]

    pythia_metrics[name] = pythia_data_analyzer.calculate_metric(
        metric_name=display_name,
        metric_func=metric_config["func"],
        group_by_cols=pythia_group_cols,
        log_prior_col="prior_logprob",
        log_likelihood_col="likelihood_logprob",
        log_posterior_col="posterior_logprob",
        inherit_identical_values=True,
        **metric_config["kwargs"],
    )

# Average each metric per model and checkpoint, then apply the sort order.
pythia_mean_metrics = {}

for metric_name, analyzer in pythia_metrics.items():
    value_col = metric_col_names[metric_name]
    mean_table = analyzer.calculate_metric(
        metric_func="mean",
        group_by_cols=["Language Model", "Training Steps"],
        metric_col=value_col,
        metric_name=f"Mean {value_col}",
        inherit_identical_values=True,
    )
    mean_table.sort(pythia_sort_config)
    pythia_mean_metrics[metric_name] = mean_table

print("Pythia metrics calculated:")
for name, analyzer in pythia_mean_metrics.items():
    print(f"{name}: {analyzer.df.shape}")

# Older cells refer to the mean Pythia BCE table by this name.
mean_pythia_bce_mse_analyzer = pythia_mean_metrics["BCE"]

In [23]:
# Create comprehensive comparison plots for all metrics across training steps:
# one subplot per metric, one line per Pythia model.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('All Coherence Metrics vs Training Steps (Pythia Models)', fontsize=16)

axes = axes.flatten()
pythia_metrics_to_plot = ["BCE", "NBCE", "CBC", "SCS", "RBC"]

for i, metric in enumerate(pythia_metrics_to_plot):
    ax = axes[i]

    # Get the correct column name for the y-axis
    y_col = f"Mean {metric_col_names[metric]}"

    sns.lineplot(
        data=pythia_mean_metrics[metric].df,
        x='Training Steps',
        y=y_col,
        hue='Language Model',
        ax=ax,
        marker='o'
    )

    ax.set_xlabel('Training Steps')
    ax.set_ylabel(f'{metric}')
    ax.set_title(f'{metric} vs Training Steps')
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)

# Remove every subplot that has no metric (derived from the list length, so
# it stays correct if the list of metrics changes).
for unused_ax in axes[len(pythia_metrics_to_plot):]:
    unused_ax.remove()

plt.tight_layout()
plt.show()

# Real newlines below; "\\n" would print a literal backslash followed by "n".
print("\n=== Training Step Analysis Summary ===")
print("Changes from 33k to 143k training steps:")
print("-" * 50)

# Checkpoints in training order; the summary compares the earliest and the
# latest one available for each model.
steps = ["33k", "66k", "99k", "143k"]

for metric in pythia_metrics_to_plot:
    print(f"\n{metric}:")
    pythia_data = pythia_mean_metrics[metric].df
    y_col = f"Mean {metric_col_names[metric]}"

    # Calculate improvement/change from first to last checkpoint
    for model in pythia_data["Language Model"].unique():
        model_data = pythia_data[pythia_data["Language Model"] == model]
        if len(model_data) < 2:
            continue

        recorded_steps = set(model_data["Training Steps"].values)
        available_steps = [s for s in steps if s in recorded_steps]
        if len(available_steps) < 2:
            continue

        first_step = available_steps[0]
        last_step = available_steps[-1]

        first_val = model_data[model_data["Training Steps"] == first_step][y_col].iloc[0]
        last_val = model_data[model_data["Training Steps"] == last_step][y_col].iloc[0]

        change = last_val - first_val
        # Relative change is undefined for a zero starting value; report 0.
        percent_change = (change / first_val * 100) if first_val != 0 else 0

        direction = "↑" if change > 0 else "↓" if change < 0 else "→"
        print(f"  {model}: {first_val:.4f} → {last_val:.4f} ({direction} {abs(percent_change):.1f}%)")