# Compare Two WandB Runs Side by Side

This notebook fetches and visualizes data from two experiment runs for easy comparison.

In [None]:
# Import Required Libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, Any, Optional, List
import json
from collections import defaultdict

# Import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from IPython.display import display, HTML, Markdown

# Import wandb
import wandb
from wandb.apis.public import Run

# Plot styling and pandas display configuration
sns.set_style('whitegrid')

# Lift every pandas display limit so wide/long comparison tables render untruncated.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

print("Libraries imported successfully!")

In [None]:
# Experiment selection: which runs to compare and where they live on WandB.
# Edit RUN_IDS to compare other runs; the first entry is treated as the baseline ("Run 1").
ENTITY = 'mllab-ts-universit-di-trieste'
PROJECT = 'CounterFactualDPG'
RUN_IDS = ['poxmea6n', 'buqt4b6u']

# Echo the selection so the rendered notebook records what was compared.
print(f"Comparing {len(RUN_IDS)} runs:")
for position, run_id in enumerate(RUN_IDS, start=1):
    print(f"  {position}. {run_id}")
print(f"\nEntity: {ENTITY}")
print(f"Project: {PROJECT}")

In [None]:
# Load Run Data from WandB
# Public-API client shared by fetch_run_data() for every run lookup in this notebook.
# NOTE(review): presumably relies on WandB credentials already configured in the
# environment (e.g. a prior `wandb login`) — confirm before running on a fresh machine.
api = wandb.Api()

def fetch_run_data(run_id: str) -> Optional[Dict[str, Any]]:
    """Fetch comprehensive information from a WandB run.

    Looks the run up as ``ENTITY/PROJECT/run_id`` through the module-level
    ``api`` client and flattens it into plain Python containers.

    Args:
        run_id: ID of the run inside ``ENTITY/PROJECT``.

    Returns:
        A dict with the keys ``meta``, ``config``, ``summary``, ``history``,
        ``history_keys``, ``system_metrics``, ``files`` and ``artifacts``,
        or ``None`` when the run itself cannot be fetched (the error is
        printed, not raised). History, files and artifacts are best-effort:
        if one of them fails, a warning is printed and that entry stays empty.
        ``system_metrics`` is never populated here and is always ``{}``.
    """
    run_path = f"{ENTITY}/{PROJECT}/{run_id}"
    print(f"Fetching run: {run_path}")
    
    try:
        run = api.run(run_path)
        
        # Collect all run information
        run_data = {
            'meta': {
                'id': run.id,
                'name': run.name,
                'display_name': run.display_name,
                'state': run.state,
                'url': run.url,
                'path': run.path,
                'entity': run.entity,
                'project': run.project,
                'created_at': run.created_at,
                'updated_at': getattr(run, 'updated_at', None),
                'notes': run.notes,
                'tags': list(run.tags) if run.tags else [],
                'group': run.group,
                'job_type': run.job_type,
            },
            'config': dict(run.config),
            'summary': {},
            'history': [],
            'history_keys': [],
            'system_metrics': {},
            'files': [],
            'artifacts': [],
        }
        
        # Get summary metrics (keys starting with '_' are skipped)
        for key, value in run.summary.items():
            if key.startswith('_'):
                continue
            if isinstance(value, bool):
                # Keep flags as booleans: float() would silently turn them into 1.0 / 0.0.
                run_data['summary'][key] = value
                continue
            try:
                run_data['summary'][key] = float(value)
            except (ValueError, TypeError):
                # Not number-like (text, nested structure, ...): keep the raw value.
                run_data['summary'][key] = value
        
        # Get history (time-series data)
        # NOTE(review): per the WandB public API docs, run.history() returns a *sampled*
        # history; use run.scan_history() if every logged step is required — confirm.
        try:
            history = run.history(pandas=False)
            if history:
                run_data['history'] = list(history)
                # Extract unique keys from history
                all_keys = set()
                for row in run_data['history']:
                    all_keys.update(row.keys())
                run_data['history_keys'] = sorted(all_keys)
        except Exception as e:
            print(f"  Warning: Could not fetch history: {e}")
        
        # Get files
        try:
            files = run.files()
            run_data['files'] = [
                {
                    'name': f.name,
                    'size': f.size,
                    'mimetype': getattr(f, 'mimetype', None),
                    'url': f.url,
                }
                for f in files
            ]
        except Exception as e:
            print(f"  Warning: Could not fetch files: {e}")
        
        # Get artifacts
        try:
            artifacts = run.logged_artifacts()
            run_data['artifacts'] = [
                {
                    'name': a.name,
                    'type': a.type,
                    'version': a.version,
                    'size': a.size,
                }
                for a in artifacts
            ]
        except Exception as e:
            print(f"  Warning: Could not fetch artifacts: {e}")
        
        print(f"  ‚úì Successfully fetched {len(run_data['history'])} history steps")
        return run_data
        
    except Exception as e:
        # Deliberately broad: any failure to reach the run is reported and signalled via None.
        print(f"  ‚úó Error fetching run: {e}")
        return None

# Fetch data for all runs (fetch_run_data returns None for a run it could not load)
runs_data = {}
for run_id in RUN_IDS:
    runs_data[run_id] = fetch_run_data(run_id)

print(f"\n‚úì Loaded data for {len([r for r in runs_data.values() if r])}/{len(RUN_IDS)} runs")

# Every later cell indexes runs_data[run_id][...] directly, so a missing run would only
# surface there as an opaque "'NoneType' object is not subscriptable". Stop here instead,
# with a message that says which run failed and what to fix.
failed_run_ids = [run_id for run_id, run_data in runs_data.items() if not run_data]
if failed_run_ids:
    raise RuntimeError(
        "Could not load run(s): " + ", ".join(failed_run_ids)
        + ". Check RUN_IDS / ENTITY / PROJECT and your WandB credentials, then re-run this cell."
    )

In [None]:
# Extract Metrics and Parameters for Comparison
def format_value(value, max_length=50):
    """Render a value as a short display string.

    ``None`` becomes ``"N/A"``; lists and dicts are summarised as
    ``typename(length)``; anything else is converted with ``str`` and, when
    longer than ``max_length`` characters, cut so that the text plus a
    trailing ``"..."`` is ``max_length`` characters long.
    """
    if value is None:
        return "N/A"

    if isinstance(value, (list, dict)):
        return "{}({})".format(type(value).__name__, len(value))

    text = str(value)
    if len(text) <= max_length:
        return text
    return text[:max_length - 3] + "..."

# Build one long comparison table: a 'Metric' column plus one value column per run.
run_columns = [f"Run {i+1} ({run_id})" for i, run_id in enumerate(RUN_IDS)]

comparison_data = {'Metric': []}
for column in run_columns:
    comparison_data[column] = []


def _append_comparison_rows(section, keys):
    """Append one '<section>.<key>' row per key, reading runs_data[run_id][section] for each run."""
    for key in keys:
        comparison_data['Metric'].append(f"{section}.{key}")
        for column, run_id in zip(run_columns, RUN_IDS):
            raw_value = runs_data[run_id][section].get(key)
            comparison_data[column].append(format_value(raw_value))


# Metadata rows: a fixed selection of fields
meta_fields = ['name', 'state', 'created_at', 'tags', 'group', 'job_type']
_append_comparison_rows('meta', meta_fields)

# Summary rows: only keys that every run reports
all_summary_keys = [set(runs_data[run_id]['summary']) for run_id in RUN_IDS]
common_summary_keys = sorted(set.intersection(*all_summary_keys))
_append_comparison_rows('summary', common_summary_keys)

# Configuration rows: only keys that every run defines
all_config_keys = [set(runs_data[run_id]['config']) for run_id in RUN_IDS]
common_config_keys = sorted(set.intersection(*all_config_keys))
_append_comparison_rows('config', common_config_keys)

df_comparison = pd.DataFrame(comparison_data)
print("‚úì Extracted metrics and parameters for comparison")
display(df_comparison.head(20))

In [None]:
# Create Side-by-Side Comparison Tables
# One column label per run, shared by the three tables below.
run_labels = [f"Run {i+1} ({run_id})" for i, run_id in enumerate(RUN_IDS)]

# 1. Metadata Comparison (transposed so that fields are rows and runs are columns)
meta_data = [runs_data[run_id]['meta'] for run_id in RUN_IDS]
df_meta = pd.DataFrame(meta_data, index=run_labels).T

# 2. Summary Metrics Comparison
df_summary = pd.DataFrame({
    label: runs_data[run_id]['summary']
    for label, run_id in zip(run_labels, RUN_IDS)
})

# 3. Configuration Comparison
df_config = pd.DataFrame({
    label: runs_data[run_id]['config']
    for label, run_id in zip(run_labels, RUN_IDS)
})

# Display side by side. The configuration table was previously built but never rendered;
# it now gets its own panel (first 20 rows, like the summary panel).
# The HTML object is the cell's last expression, so Jupyter renders it as the output.
HTML("""
<div style="display: flex; gap: 20px; overflow-x: auto;">
    <div style="flex: 1;">
        <h3 style="color: #2e86de; margin-bottom: 10px;">üìã Run Metadata</h3>
        {meta_html}
    </div>
    <div style="flex: 1;">
        <h3 style="color: #10ac84; margin-bottom: 10px;">üìä Summary Metrics</h3>
        {summary_html}
    </div>
    <div style="flex: 1;">
        <h3 style="color: #9b59b6; margin-bottom: 10px;">&#9881; Configuration</h3>
        {config_html}
    </div>
</div>
""".format(
    meta_html=df_meta.to_html(classes="data-table"),
    summary_html=df_summary.head(20).to_html(classes="data-table"),
    config_html=df_config.head(20).to_html(classes="data-table")
))

In [None]:
# Visualize Training History - Side by Side

MAX_HISTORY_PLOTS = 8  # cap on the number of subplots so the grid stays readable

# Find common history keys
all_history_keys = [set(runs_data[run_id]['history_keys']) for run_id in RUN_IDS]
common_metrics = sorted(set.intersection(*all_history_keys))

print(f"Common metrics found in history: {len(common_metrics)}")
print(f"Metrics: {common_metrics}")


def _is_numeric_metric(key):
    """Return True when every logged, non-None value of `key` in every run is an int or float."""
    return all(
        row[key] is None or isinstance(row[key], (int, float))
        for run_id in RUN_IDS
        for row in runs_data[run_id]['history']
        if key in row
    )


# Filter to numeric metrics. Keys starting with '_' (e.g. '_step', which is used as the
# x axis below) are bookkeeping rather than results; they are skipped here just as the
# summary handling skips them, so they do not use up the limited number of subplots.
numeric_metrics = [
    key for key in common_metrics
    if not key.startswith('_') and _is_numeric_metric(key)
]

print(f"\nNumeric metrics to plot: {numeric_metrics}")

# Create subplots for each metric
if numeric_metrics:
    plotted_metrics = numeric_metrics[:MAX_HISTORY_PLOTS]
    if len(plotted_metrics) < len(numeric_metrics):
        print(f"Plotting the first {len(plotted_metrics)} of {len(numeric_metrics)} numeric metrics.")
    
    n_metrics = len(plotted_metrics)
    n_cols = 2
    n_rows = (n_metrics + n_cols - 1) // n_cols
    
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=plotted_metrics
    )
    
    colors = ['#2e86de', '#10ac84', '#9b59b6', '#f39c12', '#e74c3c', '#1abc9c', '#34495e']
    runs_in_legend = set()  # run indices that already own a legend entry
    
    for idx, metric in enumerate(plotted_metrics):
        row = (idx // n_cols) + 1
        col = (idx % n_cols) + 1
        
        for i, run_id in enumerate(RUN_IDS):
            steps = []
            values = []
            for row_data in runs_data[run_id]['history']:
                if metric in row_data:
                    # Fall back to the running sample index when a row carries no '_step'.
                    steps.append(row_data.get('_step', len(values)))
                    values.append(row_data[metric])
            
            if not steps:
                continue
            
            fig.add_trace(
                go.Scatter(
                    x=steps,
                    y=values,
                    mode='lines+markers',
                    name=f"Run {i+1} ({run_id})",
                    # Same group on every subplot, so one legend click toggles the run everywhere.
                    legendgroup=f"group_{i}",
                    # One legend entry per run, attached to the first trace that has data.
                    showlegend=i not in runs_in_legend,
                    # Cycle the palette so more runs than colors cannot raise IndexError.
                    line=dict(color=colors[i % len(colors)], width=2),
                    marker=dict(size=4)
                ),
                row=row, col=col
            )
            runs_in_legend.add(i)
    
    fig.update_layout(
        height=300 * n_rows,
        title_text="<b>Training History: Side-by-Side Comparison</b>",
        hovermode='x unified',
        template='plotly_white'
    )
    
    fig.show()
else:
    print("No numeric metrics found in history to plot.")

In [None]:
# Compare Model Performance - Summary Metrics Bar Chart

MAX_PLOTTABLE_ABS = 1e6  # larger magnitudes would flatten every other bar on the shared y axis

# Extract numeric summary metrics for comparison
metric_values = []
metric_names = []

# Get all common summary keys
all_summary_keys = [set(runs_data[run_id]['summary'].keys()) for run_id in RUN_IDS]
common_summary_metrics = set.intersection(*all_summary_keys)

# Sorted so the bar order is the same on every kernel restart (set order is not stable).
for key in sorted(common_summary_metrics):
    if key.startswith('_'):
        continue
    
    raw_values = [runs_data[run_id]['summary'][key] for run_id in RUN_IDS]
    # A metric is only comparable when it is numeric in *every* run, not just the first.
    if not all(isinstance(raw, (int, float)) for raw in raw_values):
        continue
    
    values = [float(raw) for raw in raw_values]
    # Filter out extreme values for better visualization (NaN fails the comparison too)
    if all(abs(v) < MAX_PLOTTABLE_ABS for v in values):
        metric_names.append(key)
        metric_values.append(values)

if metric_values:
    column_names = [f"Run {i+1} ({run_id})" for i, run_id in enumerate(RUN_IDS)]
    df_metrics = pd.DataFrame(metric_values, columns=column_names, index=metric_names)
    
    # Bar chart comparison: one bar group per metric, one bar per run
    fig = go.Figure()
    
    colors = ['#2e86de', '#10ac84', '#9b59b6', '#f39c12', '#e74c3c', '#1abc9c', '#34495e']
    
    for i, col_name in enumerate(column_names):
        fig.add_trace(go.Bar(
            name=col_name,
            x=metric_names,
            y=df_metrics[col_name],
            marker_color=colors[i % len(colors)]
        ))
    
    fig.update_layout(
        title='<b>Summary Metrics Comparison</b>',
        xaxis_title='Metric',
        yaxis_title='Value',
        barmode='group',
        height=600,
        hovermode='x unified',
        xaxis={'tickangle': -45},
        template='plotly_white'
    )
    
    fig.show()
else:
    print("No numeric summary metrics found for comparison.")

In [None]:
# Metric Difference Analysis

ZERO_TOLERANCE = 1e-10      # baselines at or below this magnitude cannot yield a percentage
MAX_COMPARABLE_ABS = 1e6    # larger magnitudes are left out of the comparison

# Calculate percentage differences (comparing first run to others)
differences = []
all_summary_keys = [set(runs_data[run_id]['summary'].keys()) for run_id in RUN_IDS]
common_keys = set.intersection(*all_summary_keys)
baseline_summary = runs_data[RUN_IDS[0]]['summary']

# Sorted so the rows are built in the same order on every kernel restart.
for key in sorted(common_keys):
    if key.startswith('_') or not isinstance(baseline_summary.get(key), (int, float)):
        continue
    
    baseline_val = float(baseline_summary[key])
    # Written as a positive range check so that a NaN baseline is rejected as well.
    if not (ZERO_TOLERANCE < abs(baseline_val) < MAX_COMPARABLE_ABS):
        continue
    
    # Compare with other runs
    for i in range(1, len(RUN_IDS)):
        run_id = RUN_IDS[i]
        raw_compare = runs_data[run_id]['summary'].get(key)
        if not isinstance(raw_compare, (int, float)):
            continue
        
        compare_val = float(raw_compare)
        if not abs(compare_val) < MAX_COMPARABLE_ABS:
            continue
        
        absolute_diff = compare_val - baseline_val
        pct_diff = (absolute_diff / abs(baseline_val)) * 100
        
        # "Better" assumes that a higher value is better; read it the other way round
        # for loss-like metrics. Equal values are reported as a tie, not as a Run 1 win.
        if pct_diff > 0:
            better_run = f'Run {i+1}'
        elif pct_diff < 0:
            better_run = 'Run 1'
        else:
            better_run = 'Tie'
        
        # Fixed column names: a per-run column name would scatter the values over
        # several mostly-NaN columns as soon as more than two runs are compared.
        differences.append({
            'Metric': key,
            'Baseline (Run 1)': baseline_val,
            'Comparison Run': f'Run {i+1}',
            'Comparison Value': compare_val,
            'Absolute Diff': absolute_diff,
            'Percent Diff (%)': pct_diff,
            'Better Run': better_run,
            'Compared with': run_id
        })

if differences:
    df_diff = pd.DataFrame(differences).sort_values('Absolute Diff', key=abs, ascending=False)
    
    print("Top Metric Differences:")
    df_diff_top = df_diff.head(15)
    display(df_diff_top)
    
    # Visualization of differences
    fig = go.Figure()
    
    # Green when the compared run's value is higher than Run 1's, red otherwise
    colors = ['#10ac84' if diff > 0 else '#ee5253' for diff in df_diff_top['Percent Diff (%)']]
    
    if len(RUN_IDS) > 2:
        # The same metric appears once per compared run, so the label must name the run.
        bar_labels = df_diff_top['Metric'] + ' (' + df_diff_top['Comparison Run'] + ')'
        chart_title = f'<b>Percentage Difference: Runs 2-{len(RUN_IDS)} vs Run 1</b>'
    else:
        bar_labels = df_diff_top['Metric']
        chart_title = '<b>Percentage Difference: Run 2 vs Run 1</b>'
    
    fig.add_trace(go.Bar(
        x=bar_labels,
        y=df_diff_top['Percent Diff (%)'],
        marker_color=colors,
        hovertemplate='%{x}<br>Percent Change: %{y:.2f}%<extra></extra>',
    ))
    
    # Add horizontal line at 0
    fig.add_hline(y=0, line_dash="dash", line_color="gray", opacity=0.5)
    
    legend_box = dict(showarrow=False, xref='paper', yref='paper',
                      xanchor='left', yanchor='top', bgcolor='rgba(255,255,255,0.9)',
                      bordercolor='gray', borderwidth=1)
    
    fig.update_layout(
        title=chart_title,
        xaxis_title='Metric',
        yaxis_title='Percentage Difference (%)',
        height=600,
        xaxis={'tickangle': -45},
        template='plotly_white',
        # The colors encode the sign of the difference only; whether "higher" means
        # "better" depends on the metric, so the key states the sign.
        annotations=[
            dict(x=0.02, y=0.98,
                 text='<span style="color:#10ac84">\u25cf higher than Run 1</span>',
                 **legend_box),
            dict(x=0.02, y=0.91,
                 text='<span style="color:#ee5253">\u25cf lower than Run 1</span>',
                 **legend_box)
        ]
    )
    
    fig.show()
else:
    print("No comparable metrics found for difference analysis.")

In [None]:
# Compare Files and Artifacts


def _first_entry_by_name(entries):
    """Map each entry's 'name' to the first entry carrying that name."""
    lookup = {}
    for entry in entries:
        lookup.setdefault(entry['name'], entry)
    return lookup


print("üìÅ Files Comparison")
print("=" * 80)

# One name -> entry lookup per run, built once instead of re-scanning the file list
# for every (file, run) pair.
files_by_run = [_first_entry_by_name(runs_data[run_id]['files']) for run_id in RUN_IDS]

file_rows = []
for filename in sorted(set().union(*files_by_run)):
    file_row = {'File Name': filename}
    available_in = []
    for i, lookup in enumerate(files_by_run, 1):
        entry = lookup.get(filename)
        if entry is None:
            file_row[f'Run {i} Size (KB)'] = "N/A"
            continue
        available_in.append(f'Run {i}')
        size = entry['size']
        # Only a missing size is "Unknown"; an empty (0-byte) file is reported as 0.00.
        file_row[f'Run {i} Size (KB)'] = f"{size / 1024:.2f}" if size is not None else "Unknown"
    file_row['Available In'] = ', '.join(available_in)
    file_rows.append(file_row)

# Explicit columns keep the layout stable even when no run has any file.
file_columns = (
    ['File Name']
    + [f'Run {i} Size (KB)' for i in range(1, len(RUN_IDS) + 1)]
    + ['Available In']
)
df_files = pd.DataFrame(file_rows, columns=file_columns)
display(df_files.head(20))

print("\nüì¶ Artifacts Comparison")
print("=" * 80)

artifacts_by_run = [_first_entry_by_name(runs_data[run_id]['artifacts']) for run_id in RUN_IDS]

artifact_rows = []
for artifact_name in sorted(set().union(*artifacts_by_run)):
    artifact_row = {'Artifact': artifact_name, 'Type': "N/A"}
    available_in = []
    for i, lookup in enumerate(artifacts_by_run, 1):
        entry = lookup.get(artifact_name)
        if entry is None:
            artifact_row[f'Run {i} Version'] = "N/A"
            continue
        available_in.append(f'Run {i}')
        artifact_row[f'Run {i} Version'] = entry['version']
        # When several runs log the artifact, the type from the last such run is shown.
        artifact_row['Type'] = entry['type']
    artifact_row['Available In'] = ', '.join(available_in)
    artifact_rows.append(artifact_row)

artifact_columns = (
    ['Artifact', 'Type']
    + [f'Run {i} Version' for i in range(1, len(RUN_IDS) + 1)]
    + ['Available In']
)
df_artifacts = pd.DataFrame(artifact_rows, columns=artifact_columns)
display(df_artifacts)

print(f"\nSummary:")
for i, run_id in enumerate(RUN_IDS, 1):
    print(f"  Run {i} Files: {len(runs_data[run_id]['files'])}")
    print(f"  Run {i} Artifacts: {len(runs_data[run_id]['artifacts'])}")

In [None]:
# Display Images from Runs Side by Side

import requests
from PIL import Image
from io import BytesIO
import base64
from urllib.parse import urlparse

def display_images_side_by_side(images, image_name, run_ids=None):
    """Build an HTML snippet showing one image per run next to each other.

    Args:
        images: One PIL image (or ``None`` when the run has no such image) per
            run, in the same order as the run ids.
        image_name: File name shown in the header above the images.
        run_ids: Run ids used for the panel titles. Defaults to the notebook's
            ``RUN_IDS``.

    Returns:
        The HTML markup as a string. Images are embedded as base64 PNG data;
        a missing image is rendered as an "Image not found" note.
    """
    # File names and run ids come from the remote runs, so they are escaped before
    # being placed into markup. Imported locally to keep this cell self-contained.
    from html import escape
    
    labels = RUN_IDS if run_ids is None else run_ids
    
    def image_to_html(img, title, color):
        """Convert PIL image to HTML."""
        safe_title = escape(title)
        if img is None:
            return f"""
            <div style="text-align: center; flex: 1; min-width: 300px; padding: 10px;">
                <h4 style="color: {color}; margin-bottom: 10px;">{safe_title}</h4>
                <p style="color: #dc3545; font-style: italic;">Image not found</p>
            </div>
            """
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        return f"""
        <div style="text-align: center; flex: 1; min-width: 300px; padding: 10px;">
            <h4 style="color: {color}; margin-bottom: 10px;">{safe_title}</h4>
            <img src="data:image/png;base64,{img_str}" style="max-width: 100%; height: auto; border: 2px solid #ddd; border-radius: 5px;">
        </div>
        """
    
    colors = ['#2e86de', '#10ac84', '#9b59b6', '#f39c12', '#e74c3c', '#1abc9c', '#34495e']
    
    # zip() pairs runs and images positionally and stops at the shorter of the two.
    html_divs = [
        image_to_html(image, f'Run {i+1} ({run_id})', colors[i % len(colors)])
        for i, (run_id, image) in enumerate(zip(labels, images))
    ]
    
    image_divs = '\n'.join(html_divs)
    safe_image_name = escape(image_name)
    
    markup = f"""
    <div style="display: flex; gap: 20px; flex-wrap: wrap; margin-bottom: 30px;">
        <div style="width: 100%; text-align: center; padding: 10px; background: #f8f9fa; border-radius: 5px; margin-bottom: 10px;">
            <h3 style="margin: 0;">üñºÔ∏è {safe_image_name}</h3>
        </div>
        {image_divs}
    </div>
    """
    return markup

def download_image(url):
    """Download image from URL and return PIL Image.

    Args:
        url: Address of the image file.

    Returns:
        The opened PIL image, or ``None`` when the request fails, the server
        answers with a status other than 200, or the payload cannot be opened
        as an image. In each of these cases the reason is printed, so that a
        later "Image not found" panel can be traced back to its cause.
    """
    try:
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            # Previously this case returned None without any message.
            print(f"  Error downloading image from {url}: HTTP status {response.status_code}")
            return None
        return Image.open(BytesIO(response.content))
    except Exception as e:
        # Best effort: one broken image must not abort the whole comparison.
        print(f"  Error downloading image from {url}: {e}")
    return None

def get_image_files(run_data):
    """Return the entries of ``run_data['files']`` whose name has an image extension.

    The extension check is case-insensitive; the entries are returned
    unchanged and in their original order.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp')
    return [
        file_entry
        for file_entry in run_data['files']
        if file_entry['name'].lower().endswith(image_suffixes)
    ]

print("üñºÔ∏è Image Comparison")
print("=" * 80)

# Image file entries per run
all_run_images = {run_id: get_image_files(runs_data[run_id]) for run_id in RUN_IDS}

# List what each run offers
for run_number, run_id in enumerate(RUN_IDS, start=1):
    run_image_files = all_run_images[run_id]
    print(f"Run {run_number}: {len(run_image_files)} images found")
    for img in run_image_files:
        if img['size']:
            size_str = f"{img['size'] / 1024:.2f} KB"
        else:
            size_str = "Unknown"
        print(f"  - {img['name']} ({size_str})")

# Union of the image names over all runs; each name becomes one comparison row
all_image_names = {
    img['name']
    for run_id in RUN_IDS
    for img in all_run_images[run_id]
}

print(f"\n{'=' * 80}")
print("Displaying Images Side by Side")
print("=" * 80)

if all_image_names:
    for img_name in sorted(all_image_names):
        # One slot per run, left as None when the run has no file of this name
        run_images = []
        for run_id in RUN_IDS:
            img_file = next((f for f in all_run_images[run_id] if f['name'] == img_name), None)
            if img_file:
                print(f"Downloading from run {run_id}: {img_name}")
                image = download_image(img_file['url'])
            else:
                image = None
            run_images.append(image)
        
        # Render the row for this image name
        display(HTML(display_images_side_by_side(run_images, img_name)))
        print()
else:
    print("‚ö†Ô∏è  No images found in either run.")

print("‚úì Image comparison complete!")

In [None]:
# Summary and Quick Links
# Relies on common_summary_keys, common_config_keys and common_metrics from earlier cells.

heavy_rule = "=" * 80
light_rule = '‚îÄ' * 80

print(heavy_rule)
print("üìä RUN COMPARISON SUMMARY")
print(heavy_rule)

# One block per run: identity, state, link and item counts
for i, run_id in enumerate(RUN_IDS, 1):
    run_info = runs_data[run_id]
    meta = run_info['meta']
    tags_text = ', '.join(meta['tags']) if meta['tags'] else 'None'
    
    print(f"\n{light_rule}")
    print(f"üèÉ RUN {i}: {run_id}")
    print(f"{light_rule}")
    print(f"Name:        {meta['name']}")
    print(f"Display:     {meta['display_name']}")
    print(f"State:       {meta['state']}")
    print(f"Created:     {meta['created_at']}")
    print(f"Tags:        {tags_text}")
    print(f"Group:       {meta['group'] or 'None'}")
    print(f"Job Type:    {meta['job_type']}")
    print(f"\nüîó WandB URL: {meta['url']}")
    print(f"\nüìà History:   {len(run_info['history'])} steps, {len(run_info['history_keys'])} metrics")
    print(f"üìÅ Files:     {len(run_info['files'])}")
    print(f"üì¶ Artifacts: {len(run_info['artifacts'])}")

print(f"\n{heavy_rule}")
print("Key Statistics")
print(heavy_rule)

# Per-run counts, first for the configuration and then for the summary
print(f"\nConfiguration parameters:")
for i, run_id in enumerate(RUN_IDS, 1):
    config_count = len(runs_data[run_id]['config'])
    print(f"  Run {i}: {config_count} parameters")

print(f"\nSummary metrics:")
for i, run_id in enumerate(RUN_IDS, 1):
    summary_count = len(runs_data[run_id]['summary'])
    print(f"  Run {i}: {summary_count} metrics")

# Overlap between the runs, computed by the earlier comparison cells
print(f"\nCommon summary metrics: {len(common_summary_keys)}")
print(f"Common config keys: {len(common_config_keys)}")
print(f"Common history metrics: {len(common_metrics)}")

print(f"\n{heavy_rule}")
print("‚úì Comparison complete!")
print(heavy_rule)

---

## 📝 Notebook Guide

This notebook provides a comprehensive side-by-side comparison of two WandB experiment runs.

**What's included:**

| Section | Description |
|---------|-------------|
| **Imports** | Loads required libraries for data manipulation and visualization |
| **Run Definitions** | Defines run IDs in `RUN_IDS` constant (edit this to compare different runs) |
| **Data Loading** | Fetches metadata, config, metrics, history, files, and artifacts from all runs |
| **Comparison Tables** | Displays metadata, summary, and configuration side-by-side |
| **Training History** | Plots time-series metrics for all runs with different colors |
| **Performance Metrics** | Bar chart comparison of summary metrics |
| **Difference Analysis** | Calculates and visualizes percentage differences between runs |
| **Files/Artifacts** | Compares log files and artifacts between runs |
| **Images** | Downloads and displays all images from all runs side by side for visual comparison |
| **Summary** | Displays quick links and key statistics |

**To compare different runs:**
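- Edit the `RUN_IDS` list in the run-definitions cell (the second code cell, directly after the imports). `ENTITY` and `PROJECT` are set in the same cell.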

**Color coding:**
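- 🔵 **Blue**: Run 1 (`poxmea6n`)
- 🟢 **Green**: Run 2 (`buqt4b6u`)
- ✅ **Green difference**: Run 2's value is higher than Run 1's (positive percentage difference); this is an improvement only for metrics where higher is better
- ❌ **Red difference**: Run 2's value is lower than or equal to Run 1's; for loss-like metrics, where lower is better, a lower value is the improvement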