In [None]:
# Part 1: Setup

import os
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import json
from tqdm.notebook import tqdm
from shapely.geometry import Polygon
from shapely.ops import unary_union
from skimage.metrics import (
    mean_squared_error, 
    structural_similarity as ssim, 
    peak_signal_noise_ratio
)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from multiprocessing import Pool, cpu_count
warnings.filterwarnings('ignore')

# Configuration Parameters
PARCEL_CONFIG = {
    # Core Parameters
    'MIN_AREA': 50,                    # Minimum area for a parcel to be considered valid
    'COLOR_DIST_THRESHOLD': 30,        # Maximum color distance for matching
    'WIN_SIZE_FOR_SSIM': 3,            # Window size for SSIM calculation
    
    # Processing Parameters
    'CHUNK_SIZE': 100,                 # Size of chunks for processing large images
    'CONTOUR_EPSILON_FACTOR': 0.02,    # Factor for polygon approximation
    
    # Visualization Parameters
    'SAVE_VISUALIZATIONS': True,       # Whether to save comparison visualizations
    'DPI': 150,                        # DPI for saved visualizations
    'FIGURE_SIZE': (12, 6),            # Size of comparison figures
    'POLYGON_LINE_WIDTH': 1,           # Width of polygon outlines in visualizations
    'REAL_POLYGON_COLOR': 'g',         # Color for real polygons
    'FAKE_POLYGON_COLOR': 'r'          # Color for fake polygons
}

# Directory Evaluation Configuration
DIR_EVAL_CONFIG = {
    # File Pattern Settings
    'REAL_SUFFIX': '.jpg',       # Updated for your parcels files
    'FAKE_SUFFIX': '_fake_B.png', # Suffix for your generated files
    'DEFAULT_PATTERN': '*.jpg',   # Default pattern for ground truth files
    
    # Output Settings
    'DEFAULT_BENCHMARK_DIR': 'benchmark-outputs',  # Default output directory
    'SAVE_DETAILED_METRICS': True,     # Whether to save per-image metrics
    'SAVE_SUMMARY_STATS': True,        # Whether to save summary statistics
    
    # Processing Settings
    'PARALLEL_PROCESSING': True,       # Whether to use parallel processing
    'NUM_WORKERS': min(8, cpu_count()), # Number of workers for parallel processing
    'BATCH_SIZE': 100,                 # Batch size for processing large datasets
    
    # Reporting Settings
    'REPORT_DECIMAL_PLACES': 4,        # Number of decimal places in reports
    'INCLUDE_TIMESTAMPS': True         # Whether to include timestamps in reports
}

# Multi-model Comparison Configuration
COMPARISON_CONFIG = {
    # Output Settings
    'DEFAULT_OUTPUT_DIR': 'benchmark-outputs/multi_model_comparison',
    'PLOTS_SUBDIRECTORY': 'comparative_plots',
    'REPORTS_SUBDIRECTORY': 'reports',
    
    # Visualization Settings
    'PLOT_DPI': 300,
    'BAR_PLOT_SIZE': (12, 6),
    'HEATMAP_SIZE': (15, 10),
    'VIOLIN_PLOT_SIZE': (12, 6),
    'COLOR_PALETTE': 'husl',          # seaborn color palette
    'PLOT_GRID_STYLE': '--',
    'PLOT_GRID_ALPHA': 0.7,
    
    # Metrics Configuration
    'KEY_METRICS': {
        'mean_iou': 'Mean IoU',
        'ssim': 'SSIM',
        'psnr': 'PSNR',
        'polygon_count_ratio': 'Polygon Count Ratio',
        'mean_area_ratio': 'Mean Area Ratio'
    },
    
    # Metrics where higher values are better
    'HIGHER_BETTER_METRICS': [
        'ssim', 
        'psnr', 
        'mean_iou'
    ],
    
    'REPORT_DECIMAL_PLACES': 4,        # Number of decimal places in reports
    'INCLUDE_TIMESTAMPS': True         # Whether to include timestamps in reports
}

In [None]:
# Part 2: Utility functions for directory inspection and file matching

def inspect_directory_contents(directory_path, pattern="*"):
    """Print sample filenames in a directory to help with debugging."""
    directory = Path(directory_path)
    files = list(directory.glob(pattern))
    
    print(f"Directory: {directory}")
    print(f"Total files matching '{pattern}': {len(files)}")
    if files:
        print("Sample filenames:")
        for file in files[:5]:  # Show first 5 files
            print(f"  {file.name}")
    else:
        print("No files found matching the pattern")
    print()

def find_matching_files(real_dir, fake_dir, real_suffix='.jpg', fake_suffix='_fake_B.png'):
    """
    Find matching files between real and fake directories with custom naming.
    
    Returns a list of tuples (real_path, fake_path) for matched files.
    """
    real_dir = Path(real_dir)
    fake_dir = Path(fake_dir)
    
    # Get all real files (from ground truth parcels directory)
    real_files = sorted(real_dir.glob(f"*{real_suffix}"))
    matched_pairs = []
    
    # Check if we're dealing with monolithic model output 
    # (which has both real_A and fake_B files in the same directory)
    if any(f.name.endswith('_fake_B.png') for f in fake_dir.glob('*_fake_B.png')):
        print(f"Detected monolithic model output in {fake_dir}")
        # For monolithic model outputs, only select the fake_B files
        fake_files = sorted(fake_dir.glob(f"*{fake_suffix}"))
        
        for real_file in real_files:
            # Extract the numeric part from the filename (e.g., '000006' from 'parcels_000006.jpg')
            match_id = real_file.stem.split('_')[-1]
            
            # Look for corresponding fake file (e.g., 'combined_000006_fake_B.png')
            fake_file = fake_dir / f"combined_{match_id}_fake_B.png"
            
            if fake_file.exists():
                matched_pairs.append((real_file, fake_file))
    else:
        # For parcels/voronoi/ensemble folders (all have simple jpg files)
        for real_file in real_files:
            # Extract the numeric part from the filename (e.g., '000006' from 'parcels_000006.jpg')
            match_id = real_file.stem.split('_')[-1]
            
            # Construct the corresponding fake filename based on directory type
            if 'voronoi' in str(fake_dir).lower():
                fake_file = fake_dir / f"voronoi_{match_id}.jpg"
            elif 'ensemble' in str(fake_dir).lower():
                fake_file = fake_dir / f"ensemble_stable-diffusion_{match_id}.jpg"
            else:
                # Default case
                fake_file = fake_dir / f"combined_{match_id}{fake_suffix}"
            
            if fake_file.exists():
                matched_pairs.append((real_file, fake_file))
    
    if not matched_pairs:
        print(f"Warning: No matching files found between {real_dir} and {fake_dir}")
    else:
        print(f"Found {len(matched_pairs)} matching pairs between {real_dir} and {fake_dir}")
    
    return matched_pairs

In [None]:
# Part 3: Visualization and comparison functions

def create_comparison_plots(model_results, output_dir):
    """Create comparative visualizations of model performances."""
    # Get consistent colors for models
    colors = sns.color_palette(COMPARISON_CONFIG['COLOR_PALETTE'], 
                             n_colors=len(model_results))
    model_colors = dict(zip(model_results.keys(), colors))
    
    # Create individual metric comparisons
    for metric, metric_label in COMPARISON_CONFIG['KEY_METRICS'].items():
        # Bar plot
        plt.figure(figsize=COMPARISON_CONFIG['BAR_PLOT_SIZE'])
        values = [results['avg_metrics'][metric] for results in model_results.values()]
        models = list(model_results.keys())
        
        bars = plt.bar(models, values, color=[model_colors[model] for model in models])
        
        plt.title(f'Comparison of {metric_label}', pad=20)
        plt.xticks(rotation=45, ha='right')
        plt.ylabel(metric_label)
        
        # Add value labels
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.{COMPARISON_CONFIG["REPORT_DECIMAL_PLACES"]}f}',
                    ha='center', va='bottom')
        
        plt.grid(True, axis='y', linestyle=COMPARISON_CONFIG['PLOT_GRID_STYLE'], 
                alpha=COMPARISON_CONFIG['PLOT_GRID_ALPHA'])
        plt.tight_layout()
        plt.savefig(Path(output_dir) / f'{metric}_comparison.png', 
                   dpi=COMPARISON_CONFIG['PLOT_DPI'], 
                   bbox_inches='tight')
        plt.close()
        
        # Violin plot
        plt.figure(figsize=COMPARISON_CONFIG['VIOLIN_PLOT_SIZE'])
        data_dict = {model_name: results['metrics_df'][metric].values 
                    for model_name, results in model_results.items()}
        
        violin_parts = plt.violinplot([data_dict[model] for model in models],
                                    showmeans=True, showmedians=True)
        
        plt.title(f'Distribution of {metric_label} Across Images', pad=20)
        plt.xticks(range(1, len(models) + 1), models, rotation=45, ha='right')
        plt.ylabel(metric_label)
        plt.grid(True, axis='y', linestyle=COMPARISON_CONFIG['PLOT_GRID_STYLE'], 
                alpha=COMPARISON_CONFIG['PLOT_GRID_ALPHA'])
        
        plt.tight_layout()
        plt.savefig(Path(output_dir) / f'{metric}_distribution.png', 
                   dpi=COMPARISON_CONFIG['PLOT_DPI'], 
                   bbox_inches='tight')
        plt.close()
    
    # Create heatmap
    plt.figure(figsize=COMPARISON_CONFIG['HEATMAP_SIZE'])
    metrics_data = pd.DataFrame({
        model_name: results['avg_metrics'] 
        for model_name, results in model_results.items()
    })
    
    sns.heatmap(metrics_data, annot=True, 
                fmt=f'.{COMPARISON_CONFIG["REPORT_DECIMAL_PLACES"]}f', 
                cmap='YlOrRd',
                cbar_kws={'label': 'Metric Value'})
    plt.title('Model Comparison Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(Path(output_dir) / 'metrics_heatmap.png', 
                dpi=COMPARISON_CONFIG['PLOT_DPI'], 
                bbox_inches='tight')
    plt.close()

def save_comparative_summary(model_results, output_dir):
    """Save comparative summary of all models."""
    summary = {
        'models_compared': list(model_results.keys()),
        'model_metrics': {
            model_name: results['avg_metrics'].to_dict()
            for model_name, results in model_results.items()
        }
    }
    
    if COMPARISON_CONFIG['INCLUDE_TIMESTAMPS']:
        summary['timestamp'] = datetime.now().isoformat()
    
    # Determine best model for each metric
    best_models = {}
    for metric in COMPARISON_CONFIG['KEY_METRICS'].keys():
        metric_values = {model: metrics[metric] 
                        for model, metrics in summary['model_metrics'].items()
                        if metric in metrics}
        
        if not metric_values:
            continue
            
        if metric in COMPARISON_CONFIG['HIGHER_BETTER_METRICS']:
            best_model = max(metric_values.items(), key=lambda x: x[1])
        else:  # For metrics where closer to 1.0 is better
            best_model = min(metric_values.items(), 
                           key=lambda x: abs(x[1] - 1.0))
        
        best_models[metric] = {
            'best_model': best_model[0],
            'value': round(best_model[1], COMPARISON_CONFIG['REPORT_DECIMAL_PLACES']),
            'all_values': {k: round(v, COMPARISON_CONFIG['REPORT_DECIMAL_PLACES']) 
                          for k, v in metric_values.items()}
        }
    
    summary['best_models'] = best_models
    
    # Calculate overall ranking
    model_scores = {model: 0 for model in model_results.keys()}
    for metric_result in best_models.values():
        model_scores[metric_result['best_model']] += 1
    
    summary['overall_ranking'] = dict(sorted(model_scores.items(), 
                                           key=lambda x: x[1], 
                                           reverse=True))
    
    # Save summary
    with open(Path(output_dir) / 'comparative_summary.json', 'w') as f:
        json.dump(summary, f, indent=4)
    
    # Print summary
    print("\nComparative Summary:")
    print("-" * 50)
    print("\nBest performing models by metric:")
    for metric, result in best_models.items():
        print(f"{metric}: {result['best_model']} "
              f"({result['value']:.{COMPARISON_CONFIG['REPORT_DECIMAL_PLACES']}f})")
    
    print("\nOverall Ranking (number of metrics won):")
    for model, score in summary['overall_ranking'].items():
        print(f"{model}: {score}")

In [None]:
# Part 4: Main evaluation functions

def create_comparison_plots(model_results, output_dir):
    import matplotlib.pyplot as plt
    # Use a default value if 'DECIMAL_PLACES' is not defined in COMPARISON_CONFIG
    decimal_places = COMPARISON_CONFIG.get("DECIMAL_PLACES", 4)
    model_colors = COMPARISON_CONFIG.get("MODEL_COLORS", {})
    for metric, metric_label in COMPARISON_CONFIG['KEY_METRICS'].items():
        plt.figure(figsize=COMPARISON_CONFIG['BAR_PLOT_SIZE'])
        values = [results['avg_metrics'][metric] for results in model_results.values()]
        models = list(model_results.keys())
        bars = plt.bar(models, values, color=[model_colors.get(model, None) for model in models])
        for bar in bars:
            height = bar.get_height()
            # Use the local variable "decimal_places" for formatting
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.{decimal_places}f}',
                     ha='center', va='bottom')
        plt.grid(True, axis='y', linestyle=COMPARISON_CONFIG['PLOT_GRID_STYLE'], 
                 alpha=COMPARISON_CONFIG['PLOT_GRID_ALPHA'])
        plt.tight_layout()
        plt.savefig(output_dir / f"{metric_label}_bar_plot.png")
        plt.close()

def save_comparative_summary(model_results, parent_dir):
    summary = {}
    for model_name, data in model_results.items():
        summary[model_name] = data['avg_metrics'].to_dict()
    summary_path = parent_dir / "comparative_summary.json"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=4)

def inspect_directory_contents(directory):
    from pathlib import Path
    p = Path(directory)
    files = list(p.glob("*"))
    print(f"Directory: {directory}")
    print(f"Total files matching '*': {len(files)}")
    sample_files = [f.name for f in files[:5]]
    print("Sample filenames:")
    for name in sample_files:
        print(f"  {name}")
    print("\n")

def evaluate_multiple_models(
    model_paths=None,
    benchmark_dir=COMPARISON_CONFIG['DEFAULT_OUTPUT_DIR'],
    ground_truth_dir=None
):
    if model_paths is None:
        model_paths = {
            'parcels': "./parcels",
            'voronoi': "./voronoi",
            'ensemble': "./ensemble",
            'monolithic': "./monolithic"
        }
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    parent_dir = Path(benchmark_dir) / f"comparison_{timestamp}"
    parent_dir.mkdir(parents=True, exist_ok=True)
    
    plots_dir = parent_dir / COMPARISON_CONFIG['PLOTS_SUBDIRECTORY']
    reports_dir = parent_dir / COMPARISON_CONFIG['REPORTS_SUBDIRECTORY']
    plots_dir.mkdir(exist_ok=True)
    reports_dir.mkdir(exist_ok=True)
    
    print("Inspecting directory contents:")
    for name, path in model_paths.items():
        inspect_directory_contents(path)
    if ground_truth_dir and ground_truth_dir not in model_paths.values():
        inspect_directory_contents(ground_truth_dir)
    
    reference_dir = ground_truth_dir
    if reference_dir is None and 'parcels' in model_paths:
        reference_dir = model_paths['parcels']
        model_paths_copy = model_paths.copy()
        del model_paths_copy['parcels']
        model_paths = model_paths_copy
    
    model_results = {}
    
    for model_name, base_path in model_paths.items():
        print(f"\nProcessing {model_name}...")
        base_path = Path(base_path)
        real_dir = reference_dir if reference_dir is not None else base_path
        
        try:
            if model_name == 'voronoi':
                real_suffix = '.jpg'
                fake_suffix = '.jpg'
            elif model_name == 'monolithic':
                real_suffix = '.jpg'
                fake_suffix = '_fake_B.png'
            else:
                real_suffix = '.jpg'
                fake_suffix = '.jpg'
            
            matched_pairs = find_matching_files(real_dir, base_path, real_suffix, fake_suffix)
            
            if not matched_pairs:
                raise ValueError(f"No matching files found for {model_name}")
                
            evaluator = ParcelEvaluator()
            
            benchmark_id = evaluator.create_benchmark_id(model_name)
            output_dir = evaluator.setup_output_directory(parent_dir / model_name, benchmark_id)
            evaluator.save_config(output_dir)
            
            all_results = []
            for real_file, fake_file in tqdm(matched_pairs, desc=f"Evaluating {model_name}"):
                try:
                    metrics, _ = evaluator.evaluate(
                        real_file, 
                        fake_file,
                        output_dir / "visualizations"
                    )
                    if metrics:
                        metrics['file_name'] = fake_file.name
                        all_results.append(metrics)
                except Exception as e:
                    print(f"Error processing {fake_file.name}: {str(e)}")
            
            if not all_results:
                raise ValueError(f"No successful evaluations for {model_name}")
                
            df = pd.DataFrame(all_results)
            df.to_csv(output_dir / "metrics" / "detailed_metrics.csv", index=False)
            
            avg_metrics = df.mean(numeric_only=True)
            summary_dict = {
                'model_name': model_name,
                'average_metrics': avg_metrics.to_dict(),
                'total_images_processed': len(all_results),
                'timestamp': datetime.now().isoformat()
            }
            
            with open(output_dir / "metrics" / "summary.json", 'w') as f:
                json.dump(summary_dict, f, indent=4)
            
            model_results[model_name] = {
                'metrics_df': df,
                'benchmark_id': benchmark_id,
                'avg_metrics': avg_metrics,
            }
            
        except Exception as e:
            print(f"Error evaluating model {model_name}: {str(e)}")
            continue
    
    if not model_results:
        print("No models were successfully evaluated.")
        return {}, parent_dir
    
    create_comparison_plots(model_results, plots_dir)
    save_comparative_summary(model_results, parent_dir)
    
    return model_results, parent_dir

def run_custom_benchmark():
    input_paths = {
        'parcels': "/home/ls/sites/re-blocking/image-generation/brooklyn_comparison/parcels",
        'voronoi': "/home/ls/sites/re-blocking/image-generation/brooklyn_comparison/voronoi",
        'ensemble': "/home/ls/sites/re-blocking/ensemble-model/ensemble-output/stable-diffusion",
        'monolithic': "/home/ls/sites/re-blocking/data/results/combined-p2p-500-150-v100/test_latest/images"
    }
    
    output_dir = "parcel_benchmark_results"
    
    results, benchmark_dir = evaluate_multiple_models(
        model_paths=input_paths,
        benchmark_dir=output_dir
    )
    
    print(f"\nBenchmark results saved to: {benchmark_dir}")
    return results, benchmark_dir

if __name__ == "__main__":
    results, output_path = run_custom_benchmark()
    print("\nSummary of model performance:")
    for model_name, model_data in results.items():
        print(f"\n{model_name}:")
        for metric, value in model_data['avg_metrics'].items():
            if isinstance(value, (int, float)):
                print(f"  {metric}: {value:.4f}")


In [None]:
# Part 5: Directory Evaluation and Batch Processing

class ParcelEvaluator:
    def __init__(
        self,
        min_area=PARCEL_CONFIG['MIN_AREA'],
        color_dist_threshold=PARCEL_CONFIG['COLOR_DIST_THRESHOLD'],
        win_size_for_ssim=PARCEL_CONFIG['WIN_SIZE_FOR_SSIM'],
        save_visualizations=PARCEL_CONFIG['SAVE_VISUALIZATIONS']
    ):
        """Initialize the ParcelEvaluator with configuration parameters."""
        self.min_area = min_area
        self.color_dist_threshold = color_dist_threshold
        self.win_size_for_ssim = win_size_for_ssim
        self.save_visualizations = save_visualizations
        self.chunk_size = PARCEL_CONFIG['CHUNK_SIZE']
        self.contour_epsilon_factor = PARCEL_CONFIG['CONTOUR_EPSILON_FACTOR']

    def create_benchmark_id(self, model_name=None):
        """Create a unique identifier for this benchmark run."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if model_name:
            return f"{model_name}_{timestamp}"
        return f"benchmark_{timestamp}"

    def setup_output_directory(self, base_dir, benchmark_id):
        """Create and setup output directory structure."""
        output_dir = Path(base_dir) / benchmark_id
        
        # Create subdirectories
        (output_dir / "visualizations").mkdir(parents=True, exist_ok=True)
        (output_dir / "metrics").mkdir(parents=True, exist_ok=True)
        
        return output_dir

    def save_config(self, output_dir):
        """Save evaluator configuration."""
        config = {
            'min_area': self.min_area,
            'color_dist_threshold': self.color_dist_threshold,
            'win_size_for_ssim': self.win_size_for_ssim,
            'timestamp': datetime.now().isoformat()
        }
        
        with open(output_dir / "metrics" / "config.json", 'w') as f:
            json.dump(config, f, indent=4)
    
    def evaluate(self, real_file, fake_file, visualization_dir):
        """
        Evaluate a pair of images and return computed metrics.
        
        Parameters:
            real_file (Path): Path to the ground truth image.
            fake_file (Path): Path to the generated or modified image.
            visualization_dir (Path): Directory to save any output visualizations.
        
        Returns:
            metrics (dict): A dictionary of computed metrics.
            additional_info (Any): Additional information (or None if not applicable).
        """
        # Dummy logic: Replace this with your actual evaluation logic.
        # Now returning a 'mean_iou' key so that downstream code finds it.
        metrics = {"dummy_metric": 0.0, "mean_iou": 0.0}
        return metrics, None

def evaluate_directory(
    real_dir, 
    fake_dir, 
    benchmark_dir=DIR_EVAL_CONFIG['DEFAULT_BENCHMARK_DIR'], 
    model_name=None, 
    **kwargs
):
    """Evaluate all matching images in directories with organized outputs."""
    # Initialize evaluator
    evaluator = ParcelEvaluator(**kwargs)
    
    # Create benchmark ID and setup directories
    benchmark_id = evaluator.create_benchmark_id(model_name)
    output_dir = evaluator.setup_output_directory(benchmark_dir, benchmark_id)
    
    # Save configuration
    evaluator.save_config(output_dir)
    
    # Find matching file pairs
    matched_pairs = find_matching_files(real_dir, fake_dir)
    
    if not matched_pairs:
        raise ValueError(f"No matching image pairs found in {real_dir} and {fake_dir}")
    
    print(f"Found {len(matched_pairs)} matching image pairs")
    
    # Process in batches to manage memory
    batch_size = DIR_EVAL_CONFIG['BATCH_SIZE']
    num_batches = (len(matched_pairs) + batch_size - 1) // batch_size
    
    all_results = []
    
    if DIR_EVAL_CONFIG['PARALLEL_PROCESSING'] and num_batches > 1:
        # Parallel processing for large datasets
        print(f"Processing {len(matched_pairs)} images in parallel with {DIR_EVAL_CONFIG['NUM_WORKERS']} workers")
        
        # Prepare worker arguments
        worker_args = [
            (evaluator, real_file, fake_file, output_dir / "visualizations") 
            for real_file, fake_file in matched_pairs
        ]
        
        # Process in parallel
        with Pool(DIR_EVAL_CONFIG['NUM_WORKERS']) as pool:
            results = list(tqdm(
                pool.imap(process_image_worker, worker_args),
                total=len(matched_pairs),
                desc=f"Processing {model_name if model_name else 'images'}"
            ))
            
            # Filter out None results (failed processing)
            all_results = [r for r in results if r is not None]
    else:
        # Sequential processing or small dataset
        for batch_idx in range(num_batches):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(matched_pairs))
            
            batch_pairs = matched_pairs[start_idx:end_idx]
            print(f"\nProcessing batch {batch_idx+1}/{num_batches}: {len(batch_pairs)} images")
            
            # Process this batch
            batch_results = []
            for real_file, fake_file in tqdm(batch_pairs, desc=f"Batch {batch_idx+1}"):
                try:
                    metrics, _ = evaluator.evaluate(
                        real_file, 
                        fake_file, 
                        output_dir / "visualizations"
                    )
                    
                    if metrics:
                        metrics['file_name'] = fake_file.name
                        batch_results.append(metrics)
                        
                except Exception as e:
                    print(f"Error processing {fake_file.name}: {str(e)}")
            
            all_results.extend(batch_results)
            
            # Save batch results
            if DIR_EVAL_CONFIG['SAVE_DETAILED_METRICS'] and batch_results:
                batch_df = pd.DataFrame(batch_results)
                batch_df.to_csv(output_dir / "metrics" / f"batch_{batch_idx+1}_metrics.csv", index=False)
    
    # Create summary DataFrame
    df = pd.DataFrame(all_results)
    
    if DIR_EVAL_CONFIG['SAVE_DETAILED_METRICS']:
        df.to_csv(output_dir / "metrics" / "detailed_metrics.csv", index=False)
    
    if DIR_EVAL_CONFIG['SAVE_SUMMARY_STATS']:
        # Compute and save summary statistics
        summary_stats = df.describe()
        summary_stats.to_csv(output_dir / "metrics" / "summary_statistics.csv")
        
        # Save averages in a more readable format
        avg_metrics = df.mean(numeric_only=True).round(DIR_EVAL_CONFIG['REPORT_DECIMAL_PLACES'])
        summary_dict = {
            'model_name': model_name,
            'average_metrics': avg_metrics.to_dict(),
            'total_images_processed': len(all_results),
            'successful_evaluations': len(all_results),
            'failed_evaluations': len(matched_pairs) - len(all_results)
        }
        
        if DIR_EVAL_CONFIG['INCLUDE_TIMESTAMPS']:
            summary_dict['timestamp'] = datetime.now().isoformat()
    
        with open(output_dir / "metrics" / "summary.json", 'w') as f:
            json.dump(summary_dict, f, indent=4)
    
    print(f"\nResults saved to: {output_dir}")
    print("\nAverage Metrics:")
    print("-" * 50)
    for metric, value in avg_metrics.items():
        if isinstance(value, (int, float)):
            print(f"{metric}: {value:.{DIR_EVAL_CONFIG['REPORT_DECIMAL_PLACES']}f}")
    
    return df, benchmark_id

In [None]:
# Part 6: Multi-Model Comparison and Main Entry Point

def create_comparison_plots(model_results, output_dir):
    """Create comparative visualizations of model performances."""
    # Get consistent colors for models
    colors = sns.color_palette(COMPARISON_CONFIG['COLOR_PALETTE'], 
                             n_colors=len(model_results))
    model_colors = dict(zip(model_results.keys(), colors))
    
    # Create individual metric comparisons
    for metric, metric_label in COMPARISON_CONFIG['KEY_METRICS'].items():
        # Bar plot
        plt.figure(figsize=COMPARISON_CONFIG['BAR_PLOT_SIZE'])
        values = [results['avg_metrics'][metric] for results in model_results.values()]
        models = list(model_results.keys())
        
        bars = plt.bar(models, values, color=[model_colors[model] for model in models])
        
        plt.title(f'Comparison of {metric_label}', pad=20)
        plt.xticks(rotation=45, ha='right')
        plt.ylabel(metric_label)
        
        # Add value labels
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.{COMPARISON_CONFIG["DECIMAL_PLACES"]}f}',
                    ha='center', va='bottom')
        
        plt.grid(True, axis='y', linestyle=COMPARISON_CONFIG['PLOT_GRID_STYLE'], 
                alpha=COMPARISON_CONFIG['PLOT_GRID_ALPHA'])
        plt.tight_layout()
        plt.savefig(Path(output_dir) / f'{metric}_comparison.png', 
                   dpi=COMPARISON_CONFIG['PLOT_DPI'], 
                   bbox_inches='tight')
        plt.close()
        
        # Violin plot
        plt.figure(figsize=COMPARISON_CONFIG['VIOLIN_PLOT_SIZE'])
        data_dict = {model_name: results['metrics_df'][metric].values 
                    for model_name, results in model_results.items()}
        
        violin_parts = plt.violinplot([data_dict[model] for model in models],
                                    showmeans=True, showmedians=True)
        
        plt.title(f'Distribution of {metric_label} Across Images', pad=20)
        plt.xticks(range(1, len(models) + 1), models, rotation=45, ha='right')
        plt.ylabel(metric_label)
        plt.grid(True, axis='y', linestyle=COMPARISON_CONFIG['PLOT_GRID_STYLE'], 
                alpha=COMPARISON_CONFIG['PLOT_GRID_ALPHA'])
        
        plt.tight_layout()
        plt.savefig(Path(output_dir) / f'{metric}_distribution.png', 
                   dpi=COMPARISON_CONFIG['PLOT_DPI'], 
                   bbox_inches='tight')
        plt.close()
    
    # Create heatmap
    plt.figure(figsize=COMPARISON_CONFIG['HEATMAP_SIZE'])
    metrics_data = pd.DataFrame({
        model_name: results['avg_metrics'] 
        for model_name, results in model_results.items()
    })
    
    sns.heatmap(metrics_data, annot=True, 
                fmt=f'.{COMPARISON_CONFIG["DECIMAL_PLACES"]}f', 
                cmap='YlOrRd',
                cbar_kws={'label': 'Metric Value'})
    plt.title('Model Comparison Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(Path(output_dir) / 'metrics_heatmap.png', 
                dpi=COMPARISON_CONFIG['PLOT_DPI'], 
                bbox_inches='tight')
    plt.close()

def save_comparative_summary(model_results, output_dir):
    """Save comparative summary of all models."""
    summary = {
        'models_compared': list(model_results.keys()),
        'model_metrics': {
            model_name: results['avg_metrics'].to_dict()
            for model_name, results in model_results.items()
        }
    }
    
    if COMPARISON_CONFIG['INCLUDE_TIMESTAMPS']:
        summary['timestamp'] = datetime.now().isoformat()
    
    # Determine best model for each metric
    best_models = {}
    for metric in COMPARISON_CONFIG['KEY_METRICS'].keys():
        metric_values = {model: metrics[metric] 
                        for model, metrics in summary['model_metrics'].items()
                        if metric in metrics}
        
        if not metric_values:
            continue
            
        if metric in COMPARISON_CONFIG['HIGHER_BETTER_METRICS']:
            best_model = max(metric_values.items(), key=lambda x: x[1])
        else:  # For metrics where closer to 1.0 is better
            best_model = min(metric_values.items(), 
                           key=lambda x: abs(x[1] - 1.0))
        
        best_models[metric] = {
            'best_model': best_model[0],
            'value': round(best_model[1], COMPARISON_CONFIG['DECIMAL_PLACES']),
            'all_values': {k: round(v, COMPARISON_CONFIG['DECIMAL_PLACES']) 
                          for k, v in metric_values.items()}
        }
    
    summary['best_models'] = best_models
    
    # Calculate overall ranking
    model_scores = {model: 0 for model in model_results.keys()}
    for metric_result in best_models.values():
        model_scores[metric_result['best_model']] += 1
    
    summary['overall_ranking'] = dict(sorted(model_scores.items(), 
                                           key=lambda x: x[1], 
                                           reverse=True))
    
    # Save summary
    with open(Path(output_dir) / 'comparative_summary.json', 'w') as f:
        json.dump(summary, f, indent=4)
    
    # Print summary
    print("\nComparative Summary:")
    print("-" * 50)
    print("\nBest performing models by metric:")
    for metric, result in best_models.items():
        print(f"{metric}: {result['best_model']} "
              f"({result['value']:.{COMPARISON_CONFIG['DECIMAL_PLACES']}f})")
    
    print("\nOverall Ranking (number of metrics won):")
    for model, score in summary['overall_ranking'].items():
        print(f"{model}: {score}")

def find_matching_files(real_dir, fake_dir, real_suffix='.jpg', fake_suffix='_fake_B.png'):
    """
    Find matching files between real and fake directories with custom naming.
    
    Returns a list of tuples (real_path, fake_path) for matched files.
    """
    real_dir = Path(real_dir)
    fake_dir = Path(fake_dir)
    
    # Get all real files (from ground truth parcels directory)
    real_files = sorted(real_dir.glob(f"*{real_suffix}"))
    matched_pairs = []
    
    # Check if we're dealing with monolithic model output 
    # (which has both real_A and fake_B files in the same directory)
    if any(f.name.endswith('_fake_B.png') for f in fake_dir.glob('*_fake_B.png')):
        print(f"Detected monolithic model output in {fake_dir}")
        # For monolithic model outputs, only select the fake_B files
        fake_files = sorted(fake_dir.glob(f"*{fake_suffix}"))
        
        for real_file in real_files:
            # Extract the numeric part from the filename (e.g., '000006' from 'parcels_000006.jpg')
            match_id = real_file.stem.split('_')[-1]
            
            # Look for corresponding fake file (e.g., 'combined_000006_fake_B.png')
            fake_file = fake_dir / f"combined_{match_id}_fake_B.png"
            
            if fake_file.exists():
                matched_pairs.append((real_file, fake_file))
    else:
        # For parcels/voronoi/ensemble folders (all have simple jpg files)
        for real_file in real_files:
            # Extract the numeric part from the filename (e.g., '000006' from 'parcels_000006.jpg')
            match_id = real_file.stem.split('_')[-1]
            
            # Construct the corresponding fake filename based on directory type
            if 'voronoi' in str(fake_dir).lower():
                fake_file = fake_dir / f"voronoi_{match_id}.jpg"
            elif 'ensemble' in str(fake_dir).lower():
                fake_file = fake_dir / f"ensemble_stable-diffusion_{match_id}.jpg"
            else:
                # Default case
                fake_file = fake_dir / f"combined_{match_id}{fake_suffix}"
            
            if fake_file.exists():
                matched_pairs.append((real_file, fake_file))
    
    if not matched_pairs:
        print(f"Warning: No matching files found between {real_dir} and {fake_dir}")
    else:
        print(f"Found {len(matched_pairs)} matching pairs between {real_dir} and {fake_dir}")
    
    return matched_pairs

def evaluate_multiple_models(
    model_paths=None,
    benchmark_dir=COMPARISON_CONFIG['DEFAULT_OUTPUT_DIR'],
    ground_truth_dir=None
):
    """
    Evaluate multiple models and compare their performance.
    
    Parameters:
    -----------
    model_paths : dict
        Dictionary mapping model names to their output directories
    benchmark_dir : str
        Directory to save results
    ground_truth_dir : str
        Optional path to ground truth files (if different from model directories)
    
    Returns:
    --------
    model_results : dict
        Dictionary with results for each model
    parent_dir : Path
        Path to the output directory
    """
    if model_paths is None:
        # Default paths for your custom structure
        model_paths = {
            'parcels': "./parcels",         # Ground truth
            'voronoi': "./voronoi",         # Voronoi method
            'ensemble': "./ensemble",       # Ensemble output
            'monolithic': "./monolithic"    # Monolithic output
        }
    
    # Create parent directory for this comparison
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    parent_dir = Path(benchmark_dir) / f"comparison_{timestamp}"
    parent_dir.mkdir(parents=True, exist_ok=True)
    
    # Create subdirectories
    plots_dir = parent_dir / COMPARISON_CONFIG['PLOTS_SUBDIRECTORY']
    reports_dir = parent_dir / COMPARISON_CONFIG['REPORTS_SUBDIRECTORY']
    plots_dir.mkdir(exist_ok=True)
    reports_dir.mkdir(exist_ok=True)
    
    # Extract ground truth directory if specified
    reference_dir = ground_truth_dir
    if reference_dir is None and 'parcels' in model_paths:
        reference_dir = model_paths['parcels']
        # Remove from models to evaluate since it's ground truth
        del model_paths['parcels']
    
    # Store results for each model
    model_results = {}
    
    # Process each model against the ground truth
    for model_name, base_path in model_paths.items():
        print(f"\nProcessing {model_name}...")
        base_path = Path(base_path)
        
        # Use reference_dir as the real_dir if available
        real_dir = reference_dir if reference_dir is not None else base_path
        
        try:
            # Override DIR_EVAL_CONFIG for this specific model
            if model_name == 'voronoi':
                # For Voronoi, both real and fake are jpg files
                DIR_EVAL_CONFIG['REAL_SUFFIX'] = '.jpg'
                DIR_EVAL_CONFIG['FAKE_SUFFIX'] = '.jpg'
            else:
                # For other models, real is jpg and fake is png
                DIR_EVAL_CONFIG['REAL_SUFFIX'] = '.jpg'
                DIR_EVAL_CONFIG['FAKE_SUFFIX'] = '_fake_B.png'
            
            # Use custom file matching function
            matched_pairs = find_matching_files(real_dir, base_path, 
                                             DIR_EVAL_CONFIG['REAL_SUFFIX'], 
                                             DIR_EVAL_CONFIG['FAKE_SUFFIX'])
            
            if not matched_pairs:
                raise ValueError(f"No matching files found for {model_name}")
                
            # Use our evaluator to process these pairs
            evaluator = ParcelEvaluator()
            
            # Setup benchmark
            benchmark_id = evaluator.create_benchmark_id(model_name)
            output_dir = evaluator.setup_output_directory(parent_dir / model_name, benchmark_id)
            evaluator.save_config(output_dir)
            
            # Process all image pairs
            all_results = []
            for real_file, fake_file in tqdm(matched_pairs, desc=f"Evaluating {model_name}"):
                try:
                    metrics, _ = evaluator.evaluate(
                        real_file, 
                        fake_file,
                        output_dir / "visualizations"
                    )
                    if metrics:
                        metrics['file_name'] = fake_file.name
                        all_results.append(metrics)
                except Exception as e:
                    print(f"Error processing {fake_file.name}: {str(e)}")
            
            # Create dataframe and save results
            if not all_results:
                raise ValueError(f"No successful evaluations for {model_name}")
                
            df = pd.DataFrame(all_results)
            df.to_csv(output_dir / "metrics" / "detailed_metrics.csv", index=False)
            
            # Compute and save summary
            avg_metrics = df.mean(numeric_only=True)
            summary_dict = {
                'model_name': model_name,
                'average_metrics': avg_metrics.to_dict(),
                'total_images_processed': len(all_results),
                'timestamp': datetime.now().isoformat()
            }
            
            with open(output_dir / "metrics" / "summary.json", 'w') as f:
                json.dump(summary_dict, f, indent=4)
            
            # Store in model_results for comparison
            model_results[model_name] = {
                'metrics_df': df,
                'benchmark_id': benchmark_id,
                'avg_metrics': avg_metrics,
            }
            
        except Exception as e:
            print(f"Error evaluating model {model_name}: {str(e)}")
            continue
    
    if not model_results:
        print("No models were successfully evaluated.")
        return {}, parent_dir
    
    # Create comparison visualizations
    create_comparison_plots(model_results, plots_dir)
    
    # Save comparative summary
    save_comparative_summary(model_results, parent_dir)
    
    return model_results, parent_dir

def run_custom_benchmark():
    """
    Run comprehensive benchmark comparing different models.
    """
    # Define paths to your different model outputs
    input_paths = {
        'parcels': "/home/ls/sites/re-blocking/image-generation/brooklyn_comparison/parcels", # Ground truth
        'voronoi': "/home/ls/sites/re-blocking/image-generation/brooklyn_comparison/voronoi", # Voronoi tesselation
        'ensemble': "/home/ls/sites/re-blocking/ensemble-model/ensemble-output/stable-diffusion", # Ensemble output
        'monolithic': "/home/ls/sites/re-blocking/data/results/combined-p2p-500-150-v100/test_latest/images" # Monolithic output
    }
    
    # Set output directory for benchmark results
    output_dir = "parcel_benchmark_results"
    
    # Run the evaluation with custom params
    results, benchmark_dir = evaluate_multiple_models(
        model_paths=input_paths,
        benchmark_dir=output_dir
    )
    
    print(f"\nBenchmark results saved to: {benchmark_dir}")
    return results, benchmark_dir

if __name__ == "__main__":
    # Run the custom benchmark
    results, output_path = run_custom_benchmark()
    
    # Print summary
    print("\nSummary of model performance:")
    for model_name, model_data in results.items():
        print(f"\n{model_name}:")
        for metric, value in model_data['avg_metrics'].items():
            if isinstance(value, (int, float)):
                print(f"  {metric}: {value:.4f}")

In [9]:
# ---------------------------
# Configuration Definitions
# ---------------------------
COMPARISON_CONFIG = {
    "DEFAULT_OUTPUT_DIR": "comparison_results",
    "PLOTS_SUBDIRECTORY": "plots",
    "REPORTS_SUBDIRECTORY": "reports",
    "KEY_METRICS": {
        "ssim": "SSIM",
        "mse": "MSE",
        "psnr": "PSNR (dB)",
        "mean_iou": "Mean IoU",
        "edge_smoothness": "Edge Smoothness",
        "parcel_area_ratio_diff": "Parcel Area Ratio Diff"
    },
    "BAR_PLOT_SIZE": (8, 6),
    "PLOT_GRID_STYLE": "--",
    "PLOT_GRID_ALPHA": 0.7,
    "DECIMAL_PLACES": 4,
    "MODEL_COLORS": {"voronoi": "blue", "ensemble": "green", "monolithic": "red"}
}

DIR_EVAL_CONFIG = {
    "DEFAULT_BENCHMARK_DIR": "benchmarks",
    "BATCH_SIZE": 100,
    "PARALLEL_PROCESSING": False,
    "NUM_WORKERS": 4,
    "SAVE_DETAILED_METRICS": True,
    "SAVE_SUMMARY_STATS": True,
    "REPORT_DECIMAL_PLACES": 4,
    "INCLUDE_TIMESTAMPS": True
}

PARCEL_CONFIG = {
    "MIN_AREA": 100,
    "COLOR_DIST_THRESHOLD": 50,
    "WIN_SIZE_FOR_SSIM": 7,
    "SAVE_VISUALIZATIONS": True,
    "CHUNK_SIZE": 10,
    "CONTOUR_EPSILON_FACTOR": 0.01
}

# ---------------------------
# Imports & Helper Functions
# ---------------------------
import cv2
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path
from datetime import datetime
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
from skimage.metrics import structural_similarity as ssim
import math

def find_matching_files(real_dir, fake_dir, real_suffix='.jpg', fake_suffix='.jpg'):
    """
    Find matching files from two directories.
    Dummy implementation: sorts files by name and pairs them.
    Adjust this function if your folder structure or naming is more complex.
    """
    real_files = sorted(Path(real_dir).glob(f"*{real_suffix}"))
    fake_files = sorted(Path(fake_dir).glob(f"*{fake_suffix}"))
    matched = []
    for r, f in zip(real_files, fake_files):
        matched.append((r, f))
    return matched

def process_image_worker(args):
    evaluator, real_file, fake_file, vis_dir = args
    try:
        metrics, _ = evaluator.evaluate(real_file, fake_file, vis_dir)
        return metrics
    except Exception as e:
        print(f"Error processing {fake_file.name}: {str(e)}")
        return None

def compute_mse(img1, img2):
    """Compute mean squared error between two images."""
    return np.mean((img1 - img2) ** 2)

def compute_psnr(img1, img2):
    """Compute PSNR given two images (assumes 8-bit images)."""
    mse_val = compute_mse(img1, img2)
    if mse_val == 0:
        return float('inf')
    max_i = 255.0
    return 20 * math.log10(max_i) - 10 * math.log10(mse_val)

def compute_laplacian_variance(image):
    """Compute the variance of the Laplacian (a measure of edge sharpness)."""
    lap = cv2.Laplacian(image, cv2.CV_64F)
    return lap.var()

def overlay_parcels(image, mask, color=(0, 255, 0), alpha=0.5):
    """
    Overlay a colored mask on the image.
    image: original BGR image (np.array)
    mask: binary mask (255=parcel, 0=background)
    """
    overlay = image.copy()
    overlay[mask == 255] = color
    combined = cv2.addWeighted(image, 1 - alpha, overlay, alpha, 0)
    return combined

def compute_parcel_area_ratio(mask):
    """
    Compute the ratio of pixels belonging to parcels.
    mask: binary mask with 255 for parcel, 0 for background.
    """
    return np.sum(mask == 255) / mask.size

# ---------------------------
# Comparison Plotting & Summary Functions (Cell 4)
# ---------------------------
def create_comparison_plots(model_results, output_dir):
    decimal_places = COMPARISON_CONFIG.get("DECIMAL_PLACES", 4)
    model_colors = COMPARISON_CONFIG.get("MODEL_COLORS", {})
    for metric, metric_label in COMPARISON_CONFIG['KEY_METRICS'].items():
        plt.figure(figsize=COMPARISON_CONFIG['BAR_PLOT_SIZE'])
        values = [results['avg_metrics'][metric] for results in model_results.values()]
        models = list(model_results.keys())
        bars = plt.bar(models, values, color=[model_colors.get(model, None) for model in models])
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.{decimal_places}f}',
                     ha='center', va='bottom')
        plt.grid(True, axis='y', linestyle=COMPARISON_CONFIG['PLOT_GRID_STYLE'],
                 alpha=COMPARISON_CONFIG['PLOT_GRID_ALPHA'])
        plt.tight_layout()
        plt.savefig(output_dir / f"{metric_label}_bar_plot.png")
        plt.close()

def save_comparative_summary(model_results, parent_dir):
    summary = {}
    for model_name, data in model_results.items():
        summary[model_name] = data['avg_metrics'].to_dict()
    summary_path = parent_dir / "comparative_summary.json"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=4)

def inspect_directory_contents(directory):
    p = Path(directory)
    files = list(p.glob("*"))
    print(f"Directory: {directory}")
    print(f"Total files matching '*': {len(files)}")
    sample_files = [f.name for f in files[:5]]
    print("Sample filenames:")
    for name in sample_files:
        print(f"  {name}")
    print("\n")

def evaluate_multiple_models(model_paths=None, benchmark_dir=COMPARISON_CONFIG['DEFAULT_OUTPUT_DIR'], ground_truth_dir=None):
    if model_paths is None:
        model_paths = {
            'parcels': "./parcels",         # Ground truth
            'voronoi': "./voronoi",
            'ensemble': "./ensemble",
            'monolithic': "./monolithic"
        }
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    parent_dir = Path(benchmark_dir) / f"comparison_{timestamp}"
    parent_dir.mkdir(parents=True, exist_ok=True)
    
    plots_dir = parent_dir / COMPARISON_CONFIG['PLOTS_SUBDIRECTORY']
    reports_dir = parent_dir / COMPARISON_CONFIG['REPORTS_SUBDIRECTORY']
    plots_dir.mkdir(exist_ok=True)
    reports_dir.mkdir(exist_ok=True)
    
    print("Inspecting directory contents:")
    for name, path in model_paths.items():
        inspect_directory_contents(path)
    if ground_truth_dir and ground_truth_dir not in model_paths.values():
        inspect_directory_contents(ground_truth_dir)
    
    reference_dir = ground_truth_dir
    if reference_dir is None and 'parcels' in model_paths:
        reference_dir = model_paths['parcels']
        model_paths_copy = model_paths.copy()
        del model_paths_copy['parcels']
        model_paths = model_paths_copy
    
    model_results = {}
    for model_name, base_path in model_paths.items():
        print(f"\nProcessing {model_name}...")
        base_path = Path(base_path)
        real_dir = reference_dir if reference_dir is not None else base_path
        
        try:
            if model_name == 'voronoi':
                real_suffix = '.jpg'
                fake_suffix = '.jpg'
            elif model_name == 'monolithic':
                real_suffix = '.jpg'
                fake_suffix = '_fake_B.png'
            else:
                real_suffix = '.jpg'
                fake_suffix = '.jpg'
            
            matched_pairs = find_matching_files(real_dir, base_path, real_suffix, fake_suffix)
            if not matched_pairs:
                raise ValueError(f"No matching files found for {model_name}")
                
            evaluator = ParcelEvaluator()
            benchmark_id = evaluator.create_benchmark_id(model_name)
            output_dir = evaluator.setup_output_directory(parent_dir / model_name, benchmark_id)
            evaluator.save_config(output_dir)
            
            all_results = []
            for real_file, fake_file in tqdm(matched_pairs, desc=f"Evaluating {model_name}"):
                try:
                    metrics, _ = evaluator.evaluate(real_file, fake_file, output_dir / "visualizations")
                    if metrics:
                        metrics['file_name'] = fake_file.name
                        all_results.append(metrics)
                except Exception as e:
                    print(f"Error processing {fake_file.name}: {str(e)}")
            
            if not all_results:
                raise ValueError(f"No successful evaluations for {model_name}")
            
            df = pd.DataFrame(all_results)
            df.to_csv(output_dir / "metrics" / "detailed_metrics.csv", index=False)
            avg_metrics = df.mean(numeric_only=True)
            summary_dict = {
                'model_name': model_name,
                'average_metrics': avg_metrics.to_dict(),
                'total_images_processed': len(all_results),
                'timestamp': datetime.now().isoformat()
            }
            with open(output_dir / "metrics" / "summary.json", 'w') as f:
                json.dump(summary_dict, f, indent=4)
            
            model_results[model_name] = {
                'metrics_df': df,
                'benchmark_id': benchmark_id,
                'avg_metrics': avg_metrics,
            }
        except Exception as e:
            print(f"Error evaluating model {model_name}: {str(e)}")
            continue
    
    if not model_results:
        print("No models were successfully evaluated.")
        return {}, parent_dir
    
    create_comparison_plots(model_results, plots_dir)
    save_comparative_summary(model_results, parent_dir)
    return model_results, parent_dir

def run_custom_benchmark():
    input_paths = {
        'parcels': "/home/ls/sites/re-blocking/image-generation/brooklyn_comparison/parcels",
        'voronoi': "/home/ls/sites/re-blocking/image-generation/brooklyn_comparison/voronoi",
        'ensemble': "/home/ls/sites/re-blocking/ensemble-model/ensemble-output/stable-diffusion",
        'monolithic': "/home/ls/sites/re-blocking/data/results/combined-p2p-500-150-v100/test_latest/images"
    }
    output_dir = "parcel_benchmark_results"
    results, benchmark_dir = evaluate_multiple_models(model_paths=input_paths, benchmark_dir=output_dir)
    print(f"\nBenchmark results saved to: {benchmark_dir}")
    return results, benchmark_dir

# ---------------------------
# Evaluation Class with Extended Metrics (Cell 5)
# ---------------------------
class ParcelEvaluator:
    def __init__(self,
                 min_area=PARCEL_CONFIG['MIN_AREA'],
                 color_dist_threshold=PARCEL_CONFIG['COLOR_DIST_THRESHOLD'],
                 win_size_for_ssim=PARCEL_CONFIG['WIN_SIZE_FOR_SSIM'],
                 save_visualizations=PARCEL_CONFIG['SAVE_VISUALIZATIONS']):
        self.min_area = min_area
        self.color_dist_threshold = color_dist_threshold
        self.win_size_for_ssim = win_size_for_ssim
        self.save_visualizations = save_visualizations
        self.chunk_size = PARCEL_CONFIG['CHUNK_SIZE']
        self.contour_epsilon_factor = PARCEL_CONFIG['CONTOUR_EPSILON_FACTOR']

    def create_benchmark_id(self, model_name=None):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if model_name:
            return f"{model_name}_{timestamp}"
        return f"benchmark_{timestamp}"

    def setup_output_directory(self, base_dir, benchmark_id):
        output_dir = Path(base_dir) / benchmark_id
        (output_dir / "visualizations").mkdir(parents=True, exist_ok=True)
        (output_dir / "metrics").mkdir(parents=True, exist_ok=True)
        return output_dir

    def save_config(self, output_dir):
        config = {
            'min_area': self.min_area,
            'color_dist_threshold': self.color_dist_threshold,
            'win_size_for_ssim': self.win_size_for_ssim,
            'timestamp': datetime.now().isoformat()
        }
        with open(output_dir / "metrics" / "config.json", 'w') as f:
            json.dump(config, f, indent=4)

    def evaluate(self, real_file, fake_file, visualization_dir):
        """
        Evaluate a pair of images and compute metrics:
          - SSIM, MSE, PSNR
          - IoU for binary parcel masks (thresholded at 127)
          - Edge smoothness (via Laplacian variance)
          - Parcel area ratio difference (from binary masks)
          - Save SSIM map and overlay images for visual inspection.
        """
        # Load images
        real_img = cv2.imread(str(real_file))
        fake_img = cv2.imread(str(fake_file))
        if real_img is None or fake_img is None:
            raise ValueError(f"Error loading images: {real_file}, {fake_file}")
        
        # Convert to grayscale
        real_gray = cv2.cvtColor(real_img, cv2.COLOR_BGR2GRAY)
        fake_gray = cv2.cvtColor(fake_img, cv2.COLOR_BGR2GRAY)
        
        # Compute SSIM and get full SSIM map
        ssim_val, ssim_map = ssim(real_gray, fake_gray, full=True, win_size=self.win_size_for_ssim)
        
        # Compute MSE and PSNR
        mse_val = compute_mse(real_gray, fake_gray)
        psnr_val = compute_psnr(real_gray, fake_gray)
        
        # Threshold images to create binary parcel masks
        ret, real_thresh = cv2.threshold(real_gray, 127, 255, cv2.THRESH_BINARY)
        ret, fake_thresh = cv2.threshold(fake_gray, 127, 255, cv2.THRESH_BINARY)
        
        # Compute IoU for the parcel masks
        intersection = np.logical_and(real_thresh == 255, fake_thresh == 255).sum()
        union = np.logical_or(real_thresh == 255, fake_thresh == 255).sum()
        iou_val = intersection / union if union > 0 else 0
        
        # Compute edge smoothness via Laplacian variance
        lap_var_real = compute_laplacian_variance(real_gray)
        lap_var_fake = compute_laplacian_variance(fake_gray)
        edge_smoothness = (lap_var_real + lap_var_fake) / 2.0
        
        # Compute parcel area ratios and their difference
        area_ratio_real = compute_parcel_area_ratio(real_thresh)
        area_ratio_fake = compute_parcel_area_ratio(fake_thresh)
        area_ratio_diff = abs(area_ratio_real - area_ratio_fake)
        
        # Create overlays of colored parcel masks on the original images
        colored_real = overlay_parcels(real_img, real_thresh, color=(0, 255, 0), alpha=0.5)
        colored_fake = overlay_parcels(fake_img, fake_thresh, color=(0, 255, 0), alpha=0.5)
        
        # Optionally, save visualizations (SSIM map and overlays)
        if self.save_visualizations:
            vis_dir = Path(visualization_dir)
            vis_dir.mkdir(parents=True, exist_ok=True)
            ssim_vis_path = vis_dir / f"{fake_file.stem}_ssim.png"
            plt.imsave(str(ssim_vis_path), ssim_map, cmap='gray')
            cv2.imwrite(str(vis_dir / f"{fake_file.stem}_overlay_real.jpg"), colored_real)
            cv2.imwrite(str(vis_dir / f"{fake_file.stem}_overlay_fake.jpg"), colored_fake)
        
        metrics = {
            "ssim": ssim_val,
            "mse": mse_val,
            "psnr": psnr_val,
            "mean_iou": iou_val,
            "laplacian_variance_real": lap_var_real,
            "laplacian_variance_fake": lap_var_fake,
            "edge_smoothness": edge_smoothness,
            "parcel_area_ratio_real": area_ratio_real,
            "parcel_area_ratio_fake": area_ratio_fake,
            "parcel_area_ratio_diff": area_ratio_diff
        }
        return metrics, None

# ---------------------------
# Main Execution Block
# ---------------------------
if __name__ == "__main__":
    results, output_path = run_custom_benchmark()
    print("\nSummary of model performance:")
    for model_name, model_data in results.items():
        print(f"\n{model_name}:")
        for metric, value in model_data['avg_metrics'].items():
            if isinstance(value, (int, float)):
                print(f"  {metric}: {value:.{DIR_EVAL_CONFIG['REPORT_DECIMAL_PLACES']}f}")


Inspecting directory contents:
Directory: /home/ls/sites/re-blocking/image-generation/brooklyn_comparison/parcels
Total files matching '*': 1000
Sample filenames:
  parcels_000252.jpg
  parcels_000556.jpg
  parcels_000804.jpg
  parcels_000474.jpg
  parcels_000729.jpg


Directory: /home/ls/sites/re-blocking/image-generation/brooklyn_comparison/voronoi
Total files matching '*': 1000
Sample filenames:
  voronoi_000214.jpg
  voronoi_000784.jpg
  voronoi_000240.jpg
  voronoi_000672.jpg
  voronoi_000141.jpg


Directory: /home/ls/sites/re-blocking/ensemble-model/ensemble-output/stable-diffusion
Total files matching '*': 999
Sample filenames:
  ensemble_stable-diffusion_000936.jpg
  ensemble_stable-diffusion_000230.jpg
  ensemble_stable-diffusion_000149.jpg
  ensemble_stable-diffusion_000462.jpg
  ensemble_stable-diffusion_000105.jpg


Directory: /home/ls/sites/re-blocking/data/results/combined-p2p-500-150-v100/test_latest/images
Total files matching '*': 2997
Sample filenames:
  combined_0009

Evaluating voronoi: 100%|██████████| 1000/1000 [01:03<00:00, 15.79it/s]



Processing ensemble...


Evaluating ensemble:   6%|▌         | 60/999 [00:00<00:01, 591.53it/s]

Error processing ensemble_stable-diffusion_000000.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000001.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000002.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000003.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000004.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000005.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000006.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000007.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000008.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000009.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  18%|█▊        | 179/999 [00:00<00:01, 585.73it/s]

Error processing ensemble_stable-diffusion_000118.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000119.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000120.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000121.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000122.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000123.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000124.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000125.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000126.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000127.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  30%|██▉       | 297/999 [00:00<00:01, 581.70it/s]

Error processing ensemble_stable-diffusion_000237.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000238.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000239.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000240.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000241.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000242.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000243.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000244.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000245.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000246.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  36%|███▌      | 356/999 [00:00<00:01, 566.75it/s]

Error processing ensemble_stable-diffusion_000350.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000351.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000352.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000353.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000354.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000355.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000356.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000357.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000358.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000359.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  41%|████▏     | 413/999 [00:00<00:01, 559.09it/s]

Error processing ensemble_stable-diffusion_000412.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000413.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000414.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000415.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000416.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000417.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000418.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000419.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000420.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000421.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  47%|████▋     | 471/999 [00:00<00:00, 564.22it/s]

Error processing ensemble_stable-diffusion_000462.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000463.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000464.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000465.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000466.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000467.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000468.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000469.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000470.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000471.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  53%|█████▎    | 530/999 [00:00<00:00, 569.74it/s]

Error processing ensemble_stable-diffusion_000529.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000530.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000531.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000532.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000533.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000534.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000535.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000536.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000537.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000538.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  59%|█████▉    | 589/999 [00:01<00:00, 574.94it/s]

Error processing ensemble_stable-diffusion_000580.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000581.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000582.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000583.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000584.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000585.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000586.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000587.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000588.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000589.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  65%|██████▍   | 648/999 [00:01<00:00, 578.68it/s]

Error processing ensemble_stable-diffusion_000647.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000648.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000649.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000650.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000651.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000652.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000653.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000654.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000655.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000656.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  71%|███████   | 708/999 [00:01<00:00, 584.66it/s]

Error processing ensemble_stable-diffusion_000699.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000700.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000701.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000702.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000703.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000704.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000705.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000706.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000707.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000708.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  77%|███████▋  | 767/999 [00:01<00:00, 585.27it/s]

Error processing ensemble_stable-diffusion_000767.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000768.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000769.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000770.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000771.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000772.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000773.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000774.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000775.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000776.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  83%|████████▎ | 827/999 [00:01<00:00, 588.87it/s]

Error processing ensemble_stable-diffusion_000818.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000819.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000820.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000821.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000822.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000823.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000824.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000825.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000826.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000827.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  89%|████████▊ | 886/999 [00:01<00:00, 572.58it/s]

Error processing ensemble_stable-diffusion_000882.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000883.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000884.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000885.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000886.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000887.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000888.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000889.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000890.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000891.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble:  94%|█████████▍| 944/999 [00:01<00:00, 571.54it/s]

Error processing ensemble_stable-diffusion_000929.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000930.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000931.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000932.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000933.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000934.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000935.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000936.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000937.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000938.jpg: Input images must have the same dimensions.
Error proc

Evaluating ensemble: 100%|██████████| 999/999 [00:01<00:00, 576.77it/s]


Error processing ensemble_stable-diffusion_000998.jpg: Input images must have the same dimensions.
Error processing ensemble_stable-diffusion_000999.jpg: Input images must have the same dimensions.
Error evaluating model ensemble: No successful evaluations for ensemble

Processing monolithic...


Evaluating monolithic:   0%|          | 0/999 [00:00<?, ?it/s]

Error processing combined_000000_fake_B.png: Input images must have the same dimensions.
Error processing combined_000001_fake_B.png: Input images must have the same dimensions.
Error processing combined_000002_fake_B.png: Input images must have the same dimensions.
Error processing combined_000003_fake_B.png: Input images must have the same dimensions.
Error processing combined_000004_fake_B.png: Input images must have the same dimensions.
Error processing combined_000005_fake_B.png: Input images must have the same dimensions.
Error processing combined_000006_fake_B.png: Input images must have the same dimensions.
Error processing combined_000007_fake_B.png: Input images must have the same dimensions.
Error processing combined_000008_fake_B.png: Input images must have the same dimensions.
Error processing combined_000009_fake_B.png: Input images must have the same dimensions.
Error processing combined_000010_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:   4%|▍         | 41/999 [00:00<00:02, 406.43it/s]

Error processing combined_000023_fake_B.png: Input images must have the same dimensions.
Error processing combined_000024_fake_B.png: Input images must have the same dimensions.
Error processing combined_000025_fake_B.png: Input images must have the same dimensions.
Error processing combined_000026_fake_B.png: Input images must have the same dimensions.
Error processing combined_000027_fake_B.png: Input images must have the same dimensions.
Error processing combined_000028_fake_B.png: Input images must have the same dimensions.
Error processing combined_000029_fake_B.png: Input images must have the same dimensions.
Error processing combined_000030_fake_B.png: Input images must have the same dimensions.
Error processing combined_000031_fake_B.png: Input images must have the same dimensions.
Error processing combined_000032_fake_B.png: Input images must have the same dimensions.
Error processing combined_000033_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:   8%|▊         | 82/999 [00:00<00:02, 404.56it/s]

Error processing combined_000070_fake_B.png: Input images must have the same dimensions.
Error processing combined_000071_fake_B.png: Input images must have the same dimensions.
Error processing combined_000072_fake_B.png: Input images must have the same dimensions.
Error processing combined_000073_fake_B.png: Input images must have the same dimensions.
Error processing combined_000074_fake_B.png: Input images must have the same dimensions.
Error processing combined_000075_fake_B.png: Input images must have the same dimensions.
Error processing combined_000076_fake_B.png: Input images must have the same dimensions.
Error processing combined_000077_fake_B.png: Input images must have the same dimensions.
Error processing combined_000078_fake_B.png: Input images must have the same dimensions.
Error processing combined_000079_fake_B.png: Input images must have the same dimensions.
Error processing combined_000080_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  12%|█▏        | 123/999 [00:00<00:02, 402.48it/s]

Error processing combined_000104_fake_B.png: Input images must have the same dimensions.
Error processing combined_000105_fake_B.png: Input images must have the same dimensions.
Error processing combined_000106_fake_B.png: Input images must have the same dimensions.
Error processing combined_000107_fake_B.png: Input images must have the same dimensions.
Error processing combined_000108_fake_B.png: Input images must have the same dimensions.
Error processing combined_000109_fake_B.png: Input images must have the same dimensions.
Error processing combined_000110_fake_B.png: Input images must have the same dimensions.
Error processing combined_000111_fake_B.png: Input images must have the same dimensions.
Error processing combined_000112_fake_B.png: Input images must have the same dimensions.
Error processing combined_000113_fake_B.png: Input images must have the same dimensions.
Error processing combined_000114_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  16%|█▋        | 164/999 [00:00<00:02, 402.52it/s]

Error processing combined_000163_fake_B.png: Input images must have the same dimensions.
Error processing combined_000164_fake_B.png: Input images must have the same dimensions.
Error processing combined_000165_fake_B.png: Input images must have the same dimensions.
Error processing combined_000166_fake_B.png: Input images must have the same dimensions.
Error processing combined_000167_fake_B.png: Input images must have the same dimensions.
Error processing combined_000168_fake_B.png: Input images must have the same dimensions.
Error processing combined_000169_fake_B.png: Input images must have the same dimensions.
Error processing combined_000170_fake_B.png: Input images must have the same dimensions.
Error processing combined_000171_fake_B.png: Input images must have the same dimensions.
Error processing combined_000172_fake_B.png: Input images must have the same dimensions.
Error processing combined_000173_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  21%|██        | 205/999 [00:00<00:01, 403.41it/s]

Error processing combined_000186_fake_B.png: Input images must have the same dimensions.
Error processing combined_000187_fake_B.png: Input images must have the same dimensions.
Error processing combined_000188_fake_B.png: Input images must have the same dimensions.
Error processing combined_000189_fake_B.png: Input images must have the same dimensions.
Error processing combined_000190_fake_B.png: Input images must have the same dimensions.
Error processing combined_000191_fake_B.png: Input images must have the same dimensions.
Error processing combined_000192_fake_B.png: Input images must have the same dimensions.
Error processing combined_000193_fake_B.png: Input images must have the same dimensions.
Error processing combined_000194_fake_B.png: Input images must have the same dimensions.
Error processing combined_000196_fake_B.png: Input images must have the same dimensions.
Error processing combined_000197_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  25%|██▍       | 246/999 [00:00<00:01, 401.50it/s]

Error processing combined_000245_fake_B.png: Input images must have the same dimensions.
Error processing combined_000246_fake_B.png: Input images must have the same dimensions.
Error processing combined_000247_fake_B.png: Input images must have the same dimensions.
Error processing combined_000248_fake_B.png: Input images must have the same dimensions.
Error processing combined_000249_fake_B.png: Input images must have the same dimensions.
Error processing combined_000250_fake_B.png: Input images must have the same dimensions.
Error processing combined_000251_fake_B.png: Input images must have the same dimensions.
Error processing combined_000252_fake_B.png: Input images must have the same dimensions.
Error processing combined_000253_fake_B.png: Input images must have the same dimensions.
Error processing combined_000254_fake_B.png: Input images must have the same dimensions.
Error processing combined_000255_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  29%|██▊       | 287/999 [00:00<00:01, 401.26it/s]

Error processing combined_000268_fake_B.png: Input images must have the same dimensions.
Error processing combined_000269_fake_B.png: Input images must have the same dimensions.
Error processing combined_000270_fake_B.png: Input images must have the same dimensions.
Error processing combined_000271_fake_B.png: Input images must have the same dimensions.
Error processing combined_000272_fake_B.png: Input images must have the same dimensions.
Error processing combined_000273_fake_B.png: Input images must have the same dimensions.
Error processing combined_000274_fake_B.png: Input images must have the same dimensions.
Error processing combined_000275_fake_B.png: Input images must have the same dimensions.
Error processing combined_000276_fake_B.png: Input images must have the same dimensions.
Error processing combined_000277_fake_B.png: Input images must have the same dimensions.
Error processing combined_000278_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  33%|███▎      | 328/999 [00:00<00:01, 401.37it/s]

Error processing combined_000326_fake_B.png: Input images must have the same dimensions.
Error processing combined_000327_fake_B.png: Input images must have the same dimensions.
Error processing combined_000328_fake_B.png: Input images must have the same dimensions.
Error processing combined_000329_fake_B.png: Input images must have the same dimensions.
Error processing combined_000330_fake_B.png: Input images must have the same dimensions.
Error processing combined_000331_fake_B.png: Input images must have the same dimensions.
Error processing combined_000332_fake_B.png: Input images must have the same dimensions.
Error processing combined_000333_fake_B.png: Input images must have the same dimensions.
Error processing combined_000334_fake_B.png: Input images must have the same dimensions.
Error processing combined_000335_fake_B.png: Input images must have the same dimensions.
Error processing combined_000336_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  37%|███▋      | 369/999 [00:00<00:01, 399.75it/s]

Error processing combined_000348_fake_B.png: Input images must have the same dimensions.
Error processing combined_000349_fake_B.png: Input images must have the same dimensions.
Error processing combined_000350_fake_B.png: Input images must have the same dimensions.
Error processing combined_000351_fake_B.png: Input images must have the same dimensions.
Error processing combined_000352_fake_B.png: Input images must have the same dimensions.
Error processing combined_000353_fake_B.png: Input images must have the same dimensions.
Error processing combined_000354_fake_B.png: Input images must have the same dimensions.
Error processing combined_000355_fake_B.png: Input images must have the same dimensions.
Error processing combined_000356_fake_B.png: Input images must have the same dimensions.
Error processing combined_000357_fake_B.png: Input images must have the same dimensions.
Error processing combined_000358_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  41%|████      | 410/999 [00:01<00:01, 400.09it/s]

Error processing combined_000407_fake_B.png: Input images must have the same dimensions.
Error processing combined_000408_fake_B.png: Input images must have the same dimensions.
Error processing combined_000409_fake_B.png: Input images must have the same dimensions.
Error processing combined_000410_fake_B.png: Input images must have the same dimensions.
Error processing combined_000411_fake_B.png: Input images must have the same dimensions.
Error processing combined_000412_fake_B.png: Input images must have the same dimensions.
Error processing combined_000413_fake_B.png: Input images must have the same dimensions.
Error processing combined_000414_fake_B.png: Input images must have the same dimensions.
Error processing combined_000415_fake_B.png: Input images must have the same dimensions.
Error processing combined_000416_fake_B.png: Input images must have the same dimensions.
Error processing combined_000417_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  45%|████▌     | 451/999 [00:01<00:01, 399.89it/s]

Error processing combined_000429_fake_B.png: Input images must have the same dimensions.
Error processing combined_000430_fake_B.png: Input images must have the same dimensions.
Error processing combined_000431_fake_B.png: Input images must have the same dimensions.
Error processing combined_000432_fake_B.png: Input images must have the same dimensions.
Error processing combined_000433_fake_B.png: Input images must have the same dimensions.
Error processing combined_000434_fake_B.png: Input images must have the same dimensions.
Error processing combined_000435_fake_B.png: Input images must have the same dimensions.
Error processing combined_000436_fake_B.png: Input images must have the same dimensions.
Error processing combined_000437_fake_B.png: Input images must have the same dimensions.
Error processing combined_000438_fake_B.png: Input images must have the same dimensions.
Error processing combined_000439_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  49%|████▉     | 491/999 [00:01<00:01, 399.29it/s]

Error processing combined_000488_fake_B.png: Input images must have the same dimensions.
Error processing combined_000489_fake_B.png: Input images must have the same dimensions.
Error processing combined_000490_fake_B.png: Input images must have the same dimensions.
Error processing combined_000491_fake_B.png: Input images must have the same dimensions.
Error processing combined_000492_fake_B.png: Input images must have the same dimensions.
Error processing combined_000493_fake_B.png: Input images must have the same dimensions.
Error processing combined_000494_fake_B.png: Input images must have the same dimensions.
Error processing combined_000495_fake_B.png: Input images must have the same dimensions.
Error processing combined_000496_fake_B.png: Input images must have the same dimensions.
Error processing combined_000497_fake_B.png: Input images must have the same dimensions.
Error processing combined_000498_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  53%|█████▎    | 531/999 [00:01<00:01, 398.63it/s]

Error processing combined_000509_fake_B.png: Input images must have the same dimensions.
Error processing combined_000510_fake_B.png: Input images must have the same dimensions.
Error processing combined_000511_fake_B.png: Input images must have the same dimensions.
Error processing combined_000512_fake_B.png: Input images must have the same dimensions.
Error processing combined_000513_fake_B.png: Input images must have the same dimensions.
Error processing combined_000514_fake_B.png: Input images must have the same dimensions.
Error processing combined_000515_fake_B.png: Input images must have the same dimensions.
Error processing combined_000516_fake_B.png: Input images must have the same dimensions.
Error processing combined_000517_fake_B.png: Input images must have the same dimensions.
Error processing combined_000518_fake_B.png: Input images must have the same dimensions.
Error processing combined_000519_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  57%|█████▋    | 572/999 [00:01<00:01, 399.20it/s]

Error processing combined_000569_fake_B.png: Input images must have the same dimensions.
Error processing combined_000570_fake_B.png: Input images must have the same dimensions.
Error processing combined_000571_fake_B.png: Input images must have the same dimensions.
Error processing combined_000572_fake_B.png: Input images must have the same dimensions.
Error processing combined_000573_fake_B.png: Input images must have the same dimensions.
Error processing combined_000574_fake_B.png: Input images must have the same dimensions.
Error processing combined_000575_fake_B.png: Input images must have the same dimensions.
Error processing combined_000576_fake_B.png: Input images must have the same dimensions.
Error processing combined_000577_fake_B.png: Input images must have the same dimensions.
Error processing combined_000578_fake_B.png: Input images must have the same dimensions.
Error processing combined_000579_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  61%|██████▏   | 613/999 [00:01<00:00, 400.19it/s]

Error processing combined_000590_fake_B.png: Input images must have the same dimensions.
Error processing combined_000591_fake_B.png: Input images must have the same dimensions.
Error processing combined_000592_fake_B.png: Input images must have the same dimensions.
Error processing combined_000593_fake_B.png: Input images must have the same dimensions.
Error processing combined_000594_fake_B.png: Input images must have the same dimensions.
Error processing combined_000595_fake_B.png: Input images must have the same dimensions.
Error processing combined_000596_fake_B.png: Input images must have the same dimensions.
Error processing combined_000597_fake_B.png: Input images must have the same dimensions.
Error processing combined_000598_fake_B.png: Input images must have the same dimensions.
Error processing combined_000599_fake_B.png: Input images must have the same dimensions.
Error processing combined_000600_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  65%|██████▌   | 654/999 [00:01<00:00, 399.83it/s]

Error processing combined_000650_fake_B.png: Input images must have the same dimensions.
Error processing combined_000651_fake_B.png: Input images must have the same dimensions.
Error processing combined_000652_fake_B.png: Input images must have the same dimensions.
Error processing combined_000653_fake_B.png: Input images must have the same dimensions.
Error processing combined_000654_fake_B.png: Input images must have the same dimensions.
Error processing combined_000655_fake_B.png: Input images must have the same dimensions.
Error processing combined_000656_fake_B.png: Input images must have the same dimensions.
Error processing combined_000657_fake_B.png: Input images must have the same dimensions.
Error processing combined_000658_fake_B.png: Input images must have the same dimensions.
Error processing combined_000659_fake_B.png: Input images must have the same dimensions.
Error processing combined_000660_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  70%|██████▉   | 695/999 [00:01<00:00, 400.06it/s]

Error processing combined_000671_fake_B.png: Input images must have the same dimensions.
Error processing combined_000672_fake_B.png: Input images must have the same dimensions.
Error processing combined_000673_fake_B.png: Input images must have the same dimensions.
Error processing combined_000674_fake_B.png: Input images must have the same dimensions.
Error processing combined_000675_fake_B.png: Input images must have the same dimensions.
Error processing combined_000676_fake_B.png: Input images must have the same dimensions.
Error processing combined_000677_fake_B.png: Input images must have the same dimensions.
Error processing combined_000678_fake_B.png: Input images must have the same dimensions.
Error processing combined_000679_fake_B.png: Input images must have the same dimensions.
Error processing combined_000680_fake_B.png: Input images must have the same dimensions.
Error processing combined_000681_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  74%|███████▎  | 736/999 [00:01<00:00, 386.73it/s]

Error processing combined_000727_fake_B.png: Input images must have the same dimensions.
Error processing combined_000728_fake_B.png: Input images must have the same dimensions.
Error processing combined_000729_fake_B.png: Input images must have the same dimensions.
Error processing combined_000730_fake_B.png: Input images must have the same dimensions.
Error processing combined_000731_fake_B.png: Input images must have the same dimensions.
Error processing combined_000732_fake_B.png: Input images must have the same dimensions.
Error processing combined_000733_fake_B.png: Input images must have the same dimensions.
Error processing combined_000734_fake_B.png: Input images must have the same dimensions.
Error processing combined_000735_fake_B.png: Input images must have the same dimensions.
Error processing combined_000736_fake_B.png: Input images must have the same dimensions.
Error processing combined_000737_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  78%|███████▊  | 776/999 [00:01<00:00, 388.73it/s]

Error processing combined_000747_fake_B.png: Input images must have the same dimensions.
Error processing combined_000748_fake_B.png: Input images must have the same dimensions.
Error processing combined_000749_fake_B.png: Input images must have the same dimensions.
Error processing combined_000750_fake_B.png: Input images must have the same dimensions.
Error processing combined_000751_fake_B.png: Input images must have the same dimensions.
Error processing combined_000752_fake_B.png: Input images must have the same dimensions.
Error processing combined_000753_fake_B.png: Input images must have the same dimensions.
Error processing combined_000754_fake_B.png: Input images must have the same dimensions.
Error processing combined_000755_fake_B.png: Input images must have the same dimensions.
Error processing combined_000756_fake_B.png: Input images must have the same dimensions.
Error processing combined_000757_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  82%|████████▏ | 816/999 [00:02<00:00, 391.69it/s]

Error processing combined_000807_fake_B.png: Input images must have the same dimensions.
Error processing combined_000808_fake_B.png: Input images must have the same dimensions.
Error processing combined_000809_fake_B.png: Input images must have the same dimensions.
Error processing combined_000810_fake_B.png: Input images must have the same dimensions.
Error processing combined_000811_fake_B.png: Input images must have the same dimensions.
Error processing combined_000812_fake_B.png: Input images must have the same dimensions.
Error processing combined_000813_fake_B.png: Input images must have the same dimensions.
Error processing combined_000814_fake_B.png: Input images must have the same dimensions.
Error processing combined_000815_fake_B.png: Input images must have the same dimensions.
Error processing combined_000816_fake_B.png: Input images must have the same dimensions.
Error processing combined_000817_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  86%|████████▌ | 857/999 [00:02<00:00, 394.77it/s]

Error processing combined_000828_fake_B.png: Input images must have the same dimensions.
Error processing combined_000829_fake_B.png: Input images must have the same dimensions.
Error processing combined_000830_fake_B.png: Input images must have the same dimensions.
Error processing combined_000831_fake_B.png: Input images must have the same dimensions.
Error processing combined_000832_fake_B.png: Input images must have the same dimensions.
Error processing combined_000833_fake_B.png: Input images must have the same dimensions.
Error processing combined_000834_fake_B.png: Input images must have the same dimensions.
Error processing combined_000835_fake_B.png: Input images must have the same dimensions.
Error processing combined_000836_fake_B.png: Input images must have the same dimensions.
Error processing combined_000837_fake_B.png: Input images must have the same dimensions.
Error processing combined_000838_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  90%|████████▉ | 897/999 [00:02<00:00, 395.51it/s]

Error processing combined_000888_fake_B.png: Input images must have the same dimensions.
Error processing combined_000889_fake_B.png: Input images must have the same dimensions.
Error processing combined_000890_fake_B.png: Input images must have the same dimensions.
Error processing combined_000891_fake_B.png: Input images must have the same dimensions.
Error processing combined_000892_fake_B.png: Input images must have the same dimensions.
Error processing combined_000893_fake_B.png: Input images must have the same dimensions.
Error processing combined_000894_fake_B.png: Input images must have the same dimensions.
Error processing combined_000895_fake_B.png: Input images must have the same dimensions.
Error processing combined_000896_fake_B.png: Input images must have the same dimensions.
Error processing combined_000897_fake_B.png: Input images must have the same dimensions.
Error processing combined_000898_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  94%|█████████▍| 937/999 [00:02<00:00, 396.52it/s]

Error processing combined_000908_fake_B.png: Input images must have the same dimensions.
Error processing combined_000909_fake_B.png: Input images must have the same dimensions.
Error processing combined_000910_fake_B.png: Input images must have the same dimensions.
Error processing combined_000911_fake_B.png: Input images must have the same dimensions.
Error processing combined_000912_fake_B.png: Input images must have the same dimensions.
Error processing combined_000913_fake_B.png: Input images must have the same dimensions.
Error processing combined_000914_fake_B.png: Input images must have the same dimensions.
Error processing combined_000915_fake_B.png: Input images must have the same dimensions.
Error processing combined_000916_fake_B.png: Input images must have the same dimensions.
Error processing combined_000917_fake_B.png: Input images must have the same dimensions.
Error processing combined_000918_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic:  98%|█████████▊| 977/999 [00:02<00:00, 396.96it/s]

Error processing combined_000968_fake_B.png: Input images must have the same dimensions.
Error processing combined_000969_fake_B.png: Input images must have the same dimensions.
Error processing combined_000970_fake_B.png: Input images must have the same dimensions.
Error processing combined_000971_fake_B.png: Input images must have the same dimensions.
Error processing combined_000972_fake_B.png: Input images must have the same dimensions.
Error processing combined_000973_fake_B.png: Input images must have the same dimensions.
Error processing combined_000974_fake_B.png: Input images must have the same dimensions.
Error processing combined_000975_fake_B.png: Input images must have the same dimensions.
Error processing combined_000976_fake_B.png: Input images must have the same dimensions.
Error processing combined_000977_fake_B.png: Input images must have the same dimensions.
Error processing combined_000978_fake_B.png: Input images must have the same dimensions.
Error processing comb

Evaluating monolithic: 100%|██████████| 999/999 [00:02<00:00, 398.07it/s]


Error processing combined_000989_fake_B.png: Input images must have the same dimensions.
Error processing combined_000990_fake_B.png: Input images must have the same dimensions.
Error processing combined_000991_fake_B.png: Input images must have the same dimensions.
Error processing combined_000992_fake_B.png: Input images must have the same dimensions.
Error processing combined_000993_fake_B.png: Input images must have the same dimensions.
Error processing combined_000994_fake_B.png: Input images must have the same dimensions.
Error processing combined_000995_fake_B.png: Input images must have the same dimensions.
Error processing combined_000996_fake_B.png: Input images must have the same dimensions.
Error processing combined_000997_fake_B.png: Input images must have the same dimensions.
Error processing combined_000998_fake_B.png: Input images must have the same dimensions.
Error processing combined_000999_fake_B.png: Input images must have the same dimensions.
Error evaluating mode