In [1]:
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
import time
from pathlib import Path
import gc
import torch_pruning as tp
import re

print("--- Notebook Setup: Imports completed ---")

# --- Configuration ---
ROOT_DIR = "saved_models_and_logs"
OUTPUT_CSV_NB = "model_advanced_inference_benchmark.csv" # New output file name
DEFAULT_NUM_CLASSES = 1000
FIXED_NUM_CLASSES = 1000 # For model reconstruction consistency

# --- BENCHMARKING Configuration ---
# Using the parameters from your provided script
DUMMY_INPUT_SHAPE = (32, 3, 224, 224) # Batch Size of 32 for higher throughput measurement
NUM_WARMUP_RUNS = 1
NUM_BENCHMARK_RUNS = 2

# --- Device and Input Tensors ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

INPUT_TENSOR_CPU = torch.randn(DUMMY_INPUT_SHAPE)

# Your list of models that are unstable on GPU
GPU_UNSTABLE_QUANTIZED_MODELS = [
    "resnet18pretrained_distilled_quant_ptq_int8_perchannel_post",
    "resnet18pretrained_distilled_quant_ptq_int8_pertensor_post",
    "resnet18pretrained_distilled_quant_qat_int8_epochs8",
    "resnet50_quant_ptq_int8_perchannel_post",
    "resnet50_quant_ptq_int8_pertensor_post",
    "resnet50_quant_qat_int8_epochs8",
]

--- Notebook Setup: Imports completed ---
Using device: cuda


In [2]:
# --- Helper: Model File ---
def get_model_file_path_nb(experiment_path_str):
    experiment_path = Path(experiment_path_str)
    specific_model_file = experiment_path / "model_final.pth"
    if specific_model_file.exists(): return str(specific_model_file)
    pth_files = list(experiment_path.glob("*.pth"))
    if pth_files:
        if any("baseline_ft_imagenetmini_final.pth" in p.name for p in pth_files):
            return str([p for p in pth_files if "baseline_ft_imagenetmini_final.pth" in p.name][0])
        return str(pth_files[0])
    return None

# --- Model Definition and Pruning Application ---
def get_base_resnet50_model_for_reconstruction_nb():
    return models.resnet50(weights=None, num_classes=FIXED_NUM_CLASSES)

def apply_structured_pruning_to_model_for_reconstruction_nb(model, example_inputs, rate, device_obj):
    model.to(device_obj)
    example_inputs = example_inputs.to(device_obj)
    ignored_layers = [m for m in model.modules() if isinstance(m, nn.Linear) and m.out_features == FIXED_NUM_CLASSES]
    pruner = tp.pruner.MagnitudePruner(
        model=model, example_inputs=example_inputs, importance=tp.importance.MagnitudeImportance(p=1),
        iterative_steps=1, pruning_ratio=rate, global_pruning=False, ignored_layers=ignored_layers
    )
    pruner.step()
    return model

def get_pruning_config_from_log_for_reconstruction_nb(log_file_path_str):
    if not Path(log_file_path_str).exists(): return None
    try:
        with open(log_file_path_str, 'r') as f: log_data = json.load(f)
        cfg = log_data.get('config_details', {})
        if cfg.get('target_filter_pruning_rate_per_layer') is not None:
            return {'type': 'one-shot', 'rate': float(cfg['target_filter_pruning_rate_per_layer'])}
        if cfg.get('applied_step_rate_for_this_stage') is not None:
            return {'type': 'iterative_step', 'rate': float(cfg['applied_step_rate_for_this_stage'])}
    except Exception: return None
    return None

def _reconstruct_model_arch_and_load_weights_nb(model_path, device, pruning_config, exp_id=""):
    if not pruning_config: return None
    reconstructed_model = get_base_resnet50_model_for_reconstruction_nb()
    example_inputs = torch.randn(1, 3, 224, 224, device=device) # small tensor for reconstruction
    try:
        if pruning_config['type'] == 'one-shot':
            reconstructed_model = apply_structured_pruning_to_model_for_reconstruction_nb(reconstructed_model, example_inputs, pruning_config['rate'], device)
        elif pruning_config['type'] == 'iterative':
            for rate in pruning_config.get('step_rates', []):
                reconstructed_model = apply_structured_pruning_to_model_for_reconstruction_nb(reconstructed_model, example_inputs, rate, device)
        else: return None
        if reconstructed_model is None: return None
        
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        if all(k.startswith('module.') for k in state_dict): state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        reconstructed_model.load_state_dict(state_dict)
        reconstructed_model.eval()
        return reconstructed_model
    except Exception as e:
        print(f"      ERROR in reconstruction for {exp_id}: {e}")
        return None

# --- Central Model Loading Function ---
def load_model_for_experiment_nb(exp_info, all_experiments_df, target_device_str='cpu'):
    model_path = exp_info.get('Model_File_Path')
    exp_id = exp_info.get('Experiment_ID', 'Unknown_Exp')
    if not model_path or not os.path.exists(model_path):
        print(f"      ERROR ({exp_id}): Model file not found at {model_path}")
        return None
    device_to_load_on = torch.device(target_device_str)
    
    try: # Try JIT first
        return torch.jit.load(model_path, map_location=device_to_load_on).eval()
    except Exception: pass

    if exp_info.get('Is_Structured_Pruning', False):
        pruning_config = None
        base_exp_name = exp_info.get('Base_Exp_Name_Iterative')
        stage_num = exp_info.get('Stage_Num_Iterative')
        if base_exp_name and stage_num is not None: # Iterative
            stages_info = all_experiments_df[(all_experiments_df['Base_Exp_Name_Iterative'] == base_exp_name) & (all_experiments_df['Stage_Num_Iterative'] <= stage_num)].sort_values(by='Stage_Num_Iterative')
            rates = [get_pruning_config_from_log_for_reconstruction_nb(row.get('Log_Path'))['rate'] for _, row in stages_info.iterrows()]
            if rates: pruning_config = {'type': 'iterative', 'step_rates': rates}
        else: # One-shot
            pruning_config = get_pruning_config_from_log_for_reconstruction_nb(exp_info.get('Log_Path'))
        
        if pruning_config:
            reconstructed = _reconstruct_model_arch_and_load_weights_nb(model_path, device_to_load_on, pruning_config, exp_id)
            if reconstructed: return reconstructed

    try: # Fallback to standard loading
        base_arch = exp_info.get('Base_Model_Arch')
        num_classes = exp_info.get('Num_Classes', DEFAULT_NUM_CLASSES)
        if base_arch == "ResNet18": model_instance = models.resnet18(weights=None, num_classes=num_classes)
        elif base_arch == "ResNet50": model_instance = models.resnet50(weights=None, num_classes=num_classes)
        else: return None
        
        state_dict = torch.load(model_path, map_location=device_to_load_on)
        if isinstance(state_dict, dict) and 'model_state_dict' in state_dict: state_dict = state_dict['model_state_dict']
        if any(k.startswith('module.') for k in state_dict): state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
        model_instance.load_state_dict(state_dict)
        return model_instance.to(device_to_load_on).eval()
    except Exception as e:
        print(f"      ERROR ({exp_id}): Fallback loading failed: {e}")
        return None

print("--- Core model loading infrastructure is defined ---")

--- Core model loading infrastructure is defined ---


In [3]:
def discover_experiments_nb():
    print(f"--- Discovering experiments in: {ROOT_DIR} ---")
    if not os.path.exists(ROOT_DIR):
        print(f"ERROR: ROOT_DIR '{ROOT_DIR}' does not exist!")
        return pd.DataFrame()
    
    discovered_experiments = []
    for cat_name in os.listdir(ROOT_DIR):
        cat_path = os.path.join(ROOT_DIR, cat_name)
        if not os.path.isdir(cat_path): continue
        if cat_name.endswith(('_trt', 'tensorrt')):
            print(f"Skipping TensorRT directory: {cat_name}")
            continue

        for exp_name in os.listdir(cat_path):
            exp_path_str = os.path.join(cat_path, exp_name)
            if not os.path.isdir(exp_path_str): continue
            
            base_arch = "ResNet50" if "resnet50" in exp_name.lower() else "ResNet18" if "resnet18" in exp_name.lower() else "Unknown"
            model_file = get_model_file_path_nb(exp_path_str)
            log_path = os.path.join(exp_path_str, "log.json")
            
            is_structured = "pruning_structured" in cat_name.lower()
            base_exp_name_iter, stage_num_iter = None, None
            if is_structured and ("iterative" in cat_name.lower() or "_it_" in exp_name.lower() or "_stage" in exp_name.lower()):
                match = re.search(r"(.+?)(?:_|-)(?:stage|s)(\d+)", exp_name.lower())
                if match:
                    base_exp_name_iter = match.group(1)
                    stage_num_iter = int(match.group(2))

            num_classes = DEFAULT_NUM_CLASSES
            if os.path.exists(log_path):
                try:
                    with open(log_path, 'r') as f: log_data = json.load(f)
                    num_classes = log_data.get('config_details', {}).get('num_classes', DEFAULT_NUM_CLASSES)
                except Exception: pass
            
            exp_data = {
                "Experiment_ID": exp_name, "Experiment_Path": exp_path_str, "Log_Path": log_path,
                "Model_File_Path": model_file, "Base_Model_Arch": base_arch, "Num_Classes": num_classes,
                "Is_Structured_Pruning": is_structured,
                "Base_Exp_Name_Iterative": base_exp_name_iter, "Stage_Num_Iterative": stage_num_iter
            }
            discovered_experiments.append(exp_data)

    df = pd.DataFrame(discovered_experiments)
    if not df.empty:
        df['Stage_Num_Iterative'] = pd.to_numeric(df['Stage_Num_Iterative'], errors='coerce')
        df = df.set_index("Experiment_ID", drop=False)
    print(f"--- Discovery finished. Found {len(df)} non-TensorRT experiments. ---")
    return df

# Initialize/Re-initialize the global DataFrame
results_df = discover_experiments_nb()

--- Discovering experiments in: saved_models_and_logs ---
Skipping TensorRT directory: kd_tensorrt
Skipping TensorRT directory: tensorrt
--- Discovery finished. Found 23 non-TensorRT experiments. ---


In [4]:
# ===================================================================
#                      BENCHMARKING UTILITIES
# ===================================================================
def warm_up_model(model, input_tensor, device, num_warmup=NUM_WARMUP_RUNS):
    """Warms up the model for stable performance measurements."""
    model.to(device)
    input_tensor = input_tensor.to(device)
    with torch.no_grad():
        for _ in range(num_warmup):
            _ = model(input_tensor)
        if device.type == 'cuda':
            torch.cuda.synchronize()

def benchmark_gpu(model, input_tensor_cpu, num_runs=NUM_BENCHMARK_RUNS, model_name="Model"):
    """
    Performs a detailed benchmark on the GPU, returning detailed statistics.
    (This is your provided function, slightly adapted).
    """
    print(f"  -> Benchmarking '{model_name}' on GPU...")
    device_obj = torch.device("cuda")
    input_tensor = input_tensor_cpu.clone().to(device_obj)
    model.to(device_obj)

    # FP16 handling is not in your original file list, but this is good practice
    if 'FP16' in model_name.upper():
        print("     ... converting model and input to FP16.")
        model.half()
        input_tensor = input_tensor.half()

    warm_up_model(model, input_tensor, device_obj)
    torch.cuda.reset_peak_memory_stats(device_obj)
    
    times = []
    with torch.no_grad():
        for _ in range(num_runs):
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            start_event.record()
            _ = model(input_tensor)
            end_event.record()
            torch.cuda.synchronize(device_obj)
            times.append(start_event.elapsed_time(end_event)) # in milliseconds
            
    mem_used = torch.cuda.max_memory_allocated(device_obj) / 1024**2
    times_ms = np.array(times)
    stats = {
        'mean_ms': np.mean(times_ms), 
        'std_ms': np.std(times_ms), 
        'median_ms': np.median(times_ms), 
        'throughput_fps': (1000 / np.mean(times_ms)) * input_tensor.shape[0], 
        'memory_mb': mem_used
    }
    return stats

def benchmark_cpu(model, input_tensor_cpu, num_runs=NUM_BENCHMARK_RUNS, model_name="Model"):
    """
    Performs a detailed benchmark on the CPU, mirroring the GPU function's output.
    """
    print(f"  -> Benchmarking '{model_name}' on CPU...")
    device_obj = torch.device("cpu")
    # Ensure model and tensor are float32 for CPU
    model.to(device_obj).float()
    input_tensor = input_tensor_cpu.clone().to(device_obj)

    warm_up_model(model, input_tensor, device_obj)
    
    times = []
    with torch.no_grad():
        for _ in range(num_runs):
            start_time = time.perf_counter()
            _ = model(input_tensor)
            end_time = time.perf_counter()
            times.append((end_time - start_time) * 1000) # convert to ms
            
    times_ms = np.array(times)
    stats = {
        'mean_ms': np.mean(times_ms), 
        'std_ms': np.std(times_ms), 
        'median_ms': np.median(times_ms), 
        'throughput_fps': (1000 / np.mean(times_ms)) * input_tensor.shape[0], 
        'memory_mb': "N/A" # Peak memory not easily tracked on CPU
    }
    return stats

print("--- Advanced benchmarking utilities are defined ---")

--- Advanced benchmarking utilities are defined ---


In [5]:
if not results_df.empty:
    print(f"\n{'='*20} RUNNING BENCHMARK WITH INPUT SHAPE: {DUMMY_INPUT_SHAPE} {'='*20}")
    
    all_benchmark_results = []

    for exp_id, row in results_df.iterrows():
        print(f"\n--- Processing: {exp_id} ---")
        
        # --- Run CPU Benchmark ---
        try:
            model_cpu = load_model_for_experiment_nb(row, results_df, target_device_str='cpu')
            if model_cpu:
                cpu_stats = benchmark_cpu(model_cpu, INPUT_TENSOR_CPU, model_name=exp_id)
                cpu_stats['Experiment_ID'] = exp_id
                cpu_stats['device'] = 'CPU'
                all_benchmark_results.append(cpu_stats)
                del model_cpu
                gc.collect()
            else:
                print("     ... skipping CPU benchmark due to load failure.")
        except Exception as e:
            print(f"     ERROR during CPU benchmark for {exp_id}: {e}")

        # --- Run GPU Benchmark ---
        if DEVICE.type == 'cuda':
            if exp_id in GPU_UNSTABLE_QUANTIZED_MODELS:
                print("  -> Skipping GPU benchmark (known unstable model).")
                continue
            try:
                # Reload model for GPU to ensure clean state and correct device placement
                model_gpu = load_model_for_experiment_nb(row, results_df, target_device_str='cuda')
                if model_gpu:
                    gpu_stats = benchmark_gpu(model_gpu, INPUT_TENSOR_CPU, model_name=exp_id)
                    gpu_stats['Experiment_ID'] = exp_id
                    gpu_stats['device'] = 'GPU'
                    all_benchmark_results.append(gpu_stats)
                    del model_gpu
                    torch.cuda.empty_cache()
                    gc.collect()
                else:
                    print("     ... skipping GPU benchmark due to load failure.")
            except Exception as e:
                print(f"     ERROR during GPU benchmark for {exp_id}: {e}")
                
    # Create the final DataFrame from all collected results
    final_results_df = pd.DataFrame(all_benchmark_results)
    print("\n--- All Benchmarking Finished ---")

else:
    print("DataFrame is empty. Nothing to benchmark.")
    final_results_df = pd.DataFrame()



--- Processing: resnet18pretrained_distilled_quant_kmeans_256clusters_post ---
  -> Benchmarking 'resnet18pretrained_distilled_quant_kmeans_256clusters_post' on CPU...


  state_dict = torch.load(model_path, map_location=device_to_load_on)


  -> Benchmarking 'resnet18pretrained_distilled_quant_kmeans_256clusters_post' on GPU...

--- Processing: resnet18pretrained_distilled_quant_ptq_int8_perchannel_post ---
  -> Benchmarking 'resnet18pretrained_distilled_quant_ptq_int8_perchannel_post' on CPU...
  -> Skipping GPU benchmark (known unstable model).

--- Processing: resnet18pretrained_distilled_quant_ptq_int8_pertensor_post ---
  -> Benchmarking 'resnet18pretrained_distilled_quant_ptq_int8_pertensor_post' on CPU...
  -> Skipping GPU benchmark (known unstable model).

--- Processing: resnet18pretrained_distilled_quant_qat_int8_epochs8 ---
  -> Benchmarking 'resnet18pretrained_distilled_quant_qat_int8_epochs8' on CPU...
  -> Skipping GPU benchmark (known unstable model).

--- Processing: resnet50_to_resnet18pretrained_kd ---
      ERROR (resnet50_to_resnet18pretrained_kd): Fallback loading failed: Error(s) in loading state_dict for ResNet:
	Missing key(s) in state_dict: "layer1.0.conv3.weight", "layer1.0.bn3.weight", "layer1.0

In [7]:
# ===================================================================
#                      CELL 6 (CORRECTED)
# ===================================================================
if not final_results_df.empty:
    print("\n--- Final Benchmark Results ---")
    
    # Merge with original data to get Base_Model_Arch
    # FIX: Use left_on and right_index to resolve the ambiguity
    final_df_merged = pd.merge(
        final_results_df,
        results_df[['Base_Model_Arch']], # Only need the column we want to add
        left_on='Experiment_ID',
        right_index=True,
        how='left'
    )

    # Define the columns we want in our final output and their order
    final_columns = [
        'Experiment_ID', 'Base_Model_Arch', 'device', 'mean_ms', 'median_ms', 'std_ms',
        'throughput_fps', 'memory_mb'
    ]
    
    final_df_display = final_df_merged[final_columns].copy()

    # Convert timing columns to numeric, making errors into 'NaN'
    for col in ['mean_ms', 'median_ms', 'std_ms', 'throughput_fps', 'memory_mb']:
        final_df_display[col] = pd.to_numeric(final_df_display[col], errors='coerce')

    # Set display format for better readability
    pd.options.display.float_format = '{:.3f}'.format
    
    # Sort by device, then by performance for easier comparison
    display(final_df_display.sort_values(by=['device', 'throughput_fps'], ascending=[True, False]))
    
    # Save to CSV
    try:
        final_df_display.to_csv(OUTPUT_CSV_NB, index=False, float_format='%.5f')
        print(f"\n--- Benchmark summary saved to {OUTPUT_CSV_NB} ---")
    except Exception as e_csv:
        print(f"Error saving CSV: {e_csv}")
else:
    print("No benchmark results were generated. Nothing to save.")

print("\n--- Notebook processing finished ---")


--- Final Benchmark Results ---


Unnamed: 0,Experiment_ID,Base_Model_Arch,device,mean_ms,median_ms,std_ms,throughput_fps,memory_mb
11,resnet50_prune_struct_it_l1filter_stage3_appro...,ResNet50,CPU,373.5,373.5,0.667,85.676,
17,resnet50_prune_struct_os_l1filter_fp70_ft,ResNet50,CPU,373.94,373.94,0.462,85.575,
3,resnet18pretrained_distilled_quant_ptq_int8_pe...,ResNet18,CPU,620.041,620.041,22.564,51.609,
2,resnet18pretrained_distilled_quant_ptq_int8_pe...,ResNet18,CPU,626.923,626.923,25.852,51.043,
4,resnet18pretrained_distilled_quant_qat_int8_ep...,ResNet18,CPU,639.53,639.53,23.374,50.037,
15,resnet50_prune_struct_os_l1filter_fp55_ft,ResNet50,CPU,669.311,669.311,1.226,47.81,
9,resnet50_prune_struct_it_l1filter_stage2_appro...,ResNet50,CPU,722.171,722.171,18.843,44.311,
0,resnet18pretrained_distilled_quant_kmeans_256c...,ResNet18,CPU,795.614,795.614,4.292,40.221,
13,resnet50_prune_struct_os_l1filter_fp30_ft,ResNet50,CPU,1231.045,1231.045,12.973,25.994,
7,resnet50_prune_struct_it_l1filter_stage1_appro...,ResNet50,CPU,1232.246,1232.246,3.246,25.969,



--- Benchmark summary saved to model_advanced_inference_benchmark.csv ---

--- Notebook processing finished ---
