# Processing Overlay for Custom Comparison Operations

This notebook implements a loading/processing overlay system for custom comparison operations, allowing users to visualize progress during potentially time-consuming comparison tasks.

## Import Required Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm
import time
import random
from typing import List, Dict, Any, Callable, Union, Tuple
import concurrent.futures

## Set Up Data Loading Functions

These functions provide visual feedback during data loading operations.

In [None]:
def load_data_with_progress(data_source, total_items=100, simulate_delay=True):
    """
    Load data with a progress bar visualization

    No file is actually read: a string source produces randomly generated
    sample data, and a DataFrame source is returned unchanged after the
    progress bar has stepped through its rows.

    Parameters:
    -----------
    data_source : str or pd.DataFrame
        Source of data - can be a filepath or a DataFrame
    total_items : int
        Number of items to process (for simulation purposes). Ignored for a
        DataFrame source, where the number of rows is used instead.
    simulate_delay : bool
        Whether to simulate loading delay for demonstration

    Returns:
    --------
    pd.DataFrame
        The loaded data

    Raises:
    -------
    TypeError
        If data_source is neither a string nor a DataFrame
    """
    # Resolve the source type first so the progress bar is created with the
    # real item count and unsupported inputs fail before anything is shown.
    if isinstance(data_source, str):
        data = None
        action = "Loading"
        delay_range = (0.01, 0.05)
    elif isinstance(data_source, pd.DataFrame):
        data = data_source
        total_items = len(data)
        action = "Processing"
        delay_range = (0.01, 0.03)
    else:
        raise TypeError(
            "data_source must be a filepath string or a pandas DataFrame, "
            f"got {type(data_source).__name__}"
        )

    # The context manager closes the bar even if the loop is interrupted
    with tqdm(total=total_items) as progress_bar:
        for i in range(total_items):
            if simulate_delay:
                time.sleep(random.uniform(*delay_range))  # Simulate variable loading time
            progress_bar.update(1)
            progress_bar.set_description(f"{action} item {i+1}/{total_items}")

    if data is None:
        # Generate sample data since we're just simulating a file load
        data = pd.DataFrame({
            'id': range(1, total_items+1),
            'value_a': np.random.rand(total_items) * 100,
            'value_b': np.random.rand(total_items) * 100,
            'category': np.random.choice(['A', 'B', 'C', 'D'], total_items)
        })

    print(f"✅ Data loading complete - {total_items} items processed")

    return data

In [None]:
def load_multiple_datasets(sources, names=None):
    """
    Load multiple datasets with progress tracking

    Parameters:
    -----------
    sources : list
        List of data sources
    names : list, optional
        Names for each dataset. Defaults to "dataset_0", "dataset_1", ...

    Returns:
    --------
    dict
        Dictionary of loaded datasets

    Raises:
    -------
    ValueError
        If names is given but its length differs from the number of sources
    """
    total = len(sources)

    if names is None:
        names = [f"dataset_{i}" for i in range(total)]
    else:
        names = list(names)
        # zip() would otherwise silently drop the unmatched sources or names
        if len(names) != total:
            raise ValueError(
                f"Expected {total} dataset names but got {len(names)}"
            )

    datasets = {}

    for position, (source, name) in enumerate(zip(sources, names), start=1):
        print(f"Loading dataset {position}/{total}: {name}")
        loaded = load_data_with_progress(
            source,
            total_items=random.randint(50, 150),  # Simulate different dataset sizes
            simulate_delay=True
        )
        datasets[name] = loaded
        print(f"Dataset '{name}' loaded with {len(loaded)} records\n")

    return datasets

## Create Processing Overlay

Implement visual overlay components to display during processing operations.

In [None]:
class ProcessingOverlay:
    """Widget overlay made of a title, a status line and an optional progress bar"""

    def __init__(self, title="Processing..."):
        """Remember the title; the widgets are built later by the ``create_*`` methods"""
        self.title = title
        self.progress_bar = None
        self.status_text = None
        self.output_area = None
        self.is_active = False

    def _status_markup(self, message):
        """Return the HTML placed in the status widget for the given message"""
        return f"<h3>{self.title}</h3><p>{message}</p>"

    def _render(self, children, layout):
        """Display a VBox of the children inside the current output area and return that area"""
        with self.output_area:
            display(widgets.VBox(children, layout=layout))
        return self.output_area

    def create_spinner_overlay(self):
        """Build the spinner variant: status text above a spinner element"""
        self.output_area = widgets.Output()
        self.status_text = widgets.HTML(
            value=self._status_markup("Operation in progress...")
        )

        # Column flexbox that centres its children both ways
        centered = widgets.Layout(
            display='flex',
            flex_flow='column',
            align_items='center',
            justify_content='center',
            width='100%',
            height='100%'
        )
        spinner = widgets.HTML(
            value='<div class="spinner-border text-primary" role="status"></div>'
        )

        return self._render([self.status_text, spinner], centered)

    def create_progress_overlay(self, total=100):
        """Build the progress variant: status text above an integer progress bar"""
        self.output_area = widgets.Output()
        self.status_text = widgets.HTML(
            value=self._status_markup("Operation in progress...")
        )
        self.progress_bar = widgets.IntProgress(
            value=0,
            min=0,
            max=total,
            description='Processing:',
            bar_style='info',
            orientation='horizontal'
        )

        # Column flexbox with a vertical margin around the overlay
        centered = widgets.Layout(
            display='flex',
            flex_flow='column',
            align_items='center',
            justify_content='center',
            width='100%',
            margin='10px 0px'
        )

        return self._render([self.status_text, self.progress_bar], centered)

    def show(self, overlay_type='progress', total=100):
        """Build and display the overlay; any type other than 'progress' gives the spinner"""
        if overlay_type == 'progress':
            area = self.create_progress_overlay(total)
        else:
            area = self.create_spinner_overlay()
        display(area)
        self.is_active = True

    def update_progress(self, value, message=None):
        """Set the progress bar value and, when a message is given, the status line"""
        bar = self.progress_bar
        if bar is not None:
            bar.value = value

        if message is None or self.status_text is None:
            return
        self.status_text.value = self._status_markup(message)

    def update_status(self, message):
        """Replace the status line without touching the progress bar"""
        if self.status_text is None:
            return
        self.status_text.value = self._status_markup(message)

    def hide(self):
        """Clear the overlay's output area (if any) and mark the overlay inactive"""
        area = self.output_area
        if area is not None:
            area.clear_output()
        self.is_active = False

# Example of usage
def demo_processing_overlay():
    """Walk a progress overlay from 0 to 100, then hide it and print a completion line"""
    last_step = 100
    demo = ProcessingOverlay("Data Processing Demo")
    demo.show(overlay_type='progress', total=last_step)

    step = 0
    while step <= last_step:
        time.sleep(0.05)  # Simulate processing time
        demo.update_progress(step, f"Processing item {step}/{last_step}")
        step += 1

    # Leave the final status visible for a moment before removing the overlay
    demo.update_status("Processing complete!")
    time.sleep(1)
    demo.hide()
    print("Processing completed successfully!")

## Implement Custom Comparison Logic

Here we develop the core comparison functionality that will be enhanced with the processing overlay.

In [None]:
class DataComparator:
    """
    Class to handle different types of data comparisons with visual feedback

    Every public comparison validates its inputs, shows the overlay while it
    works, stores its result in ``comparison_results`` and hides the overlay
    again even when the computation raises.
    """

    def __init__(self, datasets=None, overlay=None, step_delay=0.5):
        """
        Initialize with optional datasets

        Parameters:
        -----------
        datasets : dict, optional
            Mapping of dataset name to DataFrame
        overlay : object, optional
            Object providing show/update_progress/update_status/hide.
            A ProcessingOverlay is created when omitted.
        step_delay : float
            Seconds to pause between progress steps (simulated processing
            time). Use 0 to disable the pauses.
        """
        self.datasets = datasets or {}
        self.comparison_results = {}
        if overlay is None:
            overlay = ProcessingOverlay("Comparison in Progress")
        self.overlay = overlay
        self.step_delay = step_delay

    def add_dataset(self, name, data):
        """Add a dataset to the comparator"""
        self.datasets[name] = data
        print(f"Added dataset '{name}' with {len(data)} rows")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _pause(self):
        """Sleep for the configured simulated processing time, if any"""
        if self.step_delay > 0:
            time.sleep(self.step_delay)

    def _report_step(self, value, message):
        """Pause, then push a progress value and message to the overlay"""
        self._pause()
        self.overlay.update_progress(value, message)

    def _get_dataset_pair(self, dataset1_name, dataset2_name):
        """Return both named datasets or raise ValueError if either is missing"""
        dataset1 = self.datasets.get(dataset1_name)
        dataset2 = self.datasets.get(dataset2_name)
        if dataset1 is None or dataset2 is None:
            raise ValueError("One or both datasets not found")
        return dataset1, dataset2

    @staticmethod
    def _require_column(dataset1, dataset2, column):
        """Raise ValueError unless the column exists in both datasets"""
        if column not in dataset1.columns or column not in dataset2.columns:
            raise ValueError(f"Column '{column}' not found in one or both datasets")

    @staticmethod
    def _is_histogram_dtype(series):
        """Return True for numeric, non-boolean columns (any int/float width)"""
        return (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

    @staticmethod
    def _kl_divergence(p, q):
        """Kullback-Leibler divergence of p from q for two histograms"""
        # Add small epsilon to avoid division by zero and log(0)
        p = p + 1e-10
        q = q + 1e-10
        return np.sum(p * np.log(p / q))

    def _compute_distribution(self, dataset1_name, dataset2_name,
                              dataset1, dataset2, column, bins, report=None):
        """Compute and store the histogram comparison without touching the overlay

        ``report(value, message)`` is called before each stage when given.
        """
        def step(value, message):
            if report is not None:
                report(value, message)

        step(20, "Extracting column data...")
        data1 = dataset1[column].values
        data2 = dataset2[column].values

        step(40, "Calculating histograms...")
        # Both histograms must share the same bin edges, otherwise bin i of
        # one histogram covers a different value range than bin i of the other.
        edges = np.histogram_bin_edges(np.concatenate([data1, data2]), bins=bins)
        hist1, _ = np.histogram(data1, bins=edges)
        hist2, _ = np.histogram(data2, bins=edges)

        step(60, "Normalizing distributions...")
        total1 = hist1.sum()
        total2 = hist2.sum()
        hist1_norm = hist1 / total1 if total1 > 0 else hist1
        hist2_norm = hist2 / total2 if total2 > 0 else hist2

        step(80, "Calculating difference metrics...")
        # Sum of absolute bin differences: 0 for identical normalized
        # histograms, 2 when they do not overlap at all
        abs_diff = np.abs(hist1_norm - hist2_norm).sum()

        # Jensen-Shannon divergence (a symmetric measure of similarity
        # between distributions), computed against the average distribution
        m = (hist1_norm + hist2_norm) / 2
        js_divergence = (
            self._kl_divergence(hist1_norm, m) + self._kl_divergence(hist2_norm, m)
        ) / 2

        step(100, "Preparing comparison results...")
        results = {
            'dataset1': dataset1_name,
            'dataset2': dataset2_name,
            'column': column,
            'bins': bins,
            'hist1': hist1,
            'hist2': hist2,
            'edges': edges,  # Shared by both histograms
            'abs_difference': abs_diff,
            'js_divergence': js_divergence,
            # 1 for identical histograms down to -1 for non-overlapping ones
            'similarity_score': 1 - abs_diff
        }

        comparison_id = f"{dataset1_name}_vs_{dataset2_name}_{column}"
        self.comparison_results[comparison_id] = results
        return results

    def _compute_value_counts(self, dataset1_name, dataset2_name,
                              dataset1, dataset2, column):
        """Compute and store the value-count comparison without touching the overlay"""
        counts1 = dataset1[column].value_counts().to_dict()
        counts2 = dataset2[column].value_counts().to_dict()

        # Union of values, keeping first-seen order so results are deterministic
        all_values = list(dict.fromkeys([*counts1, *counts2]))

        comparison = {}
        for value in all_values:
            count1 = counts1.get(value, 0)
            count2 = counts2.get(value, 0)
            diff = count2 - count1
            # A value absent from dataset1 has no finite relative change
            pct_diff = (diff / count1 * 100) if count1 > 0 else float('inf')

            comparison[value] = {
                'count_' + dataset1_name: count1,
                'count_' + dataset2_name: count2,
                'difference': diff,
                'percent_difference': pct_diff
            }

        results = {
            'dataset1': dataset1_name,
            'dataset2': dataset2_name,
            'column': column,
            'comparison_table': comparison,
            'unique_values_dataset1': len(counts1),
            'unique_values_dataset2': len(counts2),
            'unique_values_both': len(all_values)
        }

        comparison_id = f"{dataset1_name}_vs_{dataset2_name}_{column}_counts"
        self.comparison_results[comparison_id] = results
        return results

    # ------------------------------------------------------------------
    # Public comparisons
    # ------------------------------------------------------------------

    def column_distribution_comparison(self, dataset1_name, dataset2_name, column, bins=10):
        """
        Compare the distribution of values in a specific column between two datasets

        Parameters:
        -----------
        dataset1_name, dataset2_name : str
            Names of datasets to compare
        column : str
            Column to compare
        bins : int
            Number of bins for histogram comparison

        Returns:
        --------
        dict
            Comparison results

        Raises:
        -------
        ValueError
            If a dataset or the column is missing
        """
        dataset1, dataset2 = self._get_dataset_pair(dataset1_name, dataset2_name)
        self._require_column(dataset1, dataset2, column)

        self.overlay.show(overlay_type='progress', total=100)
        try:
            self.overlay.update_status(f"Comparing distributions for column '{column}'")
            results = self._compute_distribution(
                dataset1_name, dataset2_name, dataset1, dataset2, column, bins,
                report=self._report_step
            )
            self._pause()
        finally:
            # Never leave the overlay on screen, even if the computation failed
            self.overlay.hide()

        return results

    def value_counts_comparison(self, dataset1_name, dataset2_name, column):
        """
        Compare value counts (categories) between two datasets

        Parameters:
        -----------
        dataset1_name, dataset2_name : str
            Names of datasets to compare
        column : str
            Column to compare

        Returns:
        --------
        dict
            Comparison results

        Raises:
        -------
        ValueError
            If a dataset or the column is missing
        """
        dataset1, dataset2 = self._get_dataset_pair(dataset1_name, dataset2_name)
        self._require_column(dataset1, dataset2, column)

        self.overlay.show(overlay_type='spinner')
        try:
            self.overlay.update_status(f"Comparing value counts for column '{column}'")
            self._pause()
            results = self._compute_value_counts(
                dataset1_name, dataset2_name, dataset1, dataset2, column
            )
            self._pause()
        finally:
            self.overlay.hide()

        return results

    def batch_column_comparison(self, dataset1_name, dataset2_name, columns=None):
        """
        Compare multiple columns between two datasets

        Parameters:
        -----------
        dataset1_name, dataset2_name : str
            Names of datasets to compare
        columns : list, optional
            List of columns to compare. If None, compares all common columns
            in the order they appear in the first dataset.

        Returns:
        --------
        dict
            Dictionary of comparison results for each column. A column whose
            comparison fails is recorded with type 'error' instead of
            aborting the batch.

        Raises:
        -------
        ValueError
            If a dataset or a requested column is missing
        """
        dataset1, dataset2 = self._get_dataset_pair(dataset1_name, dataset2_name)

        if columns is None:
            # Common columns, in dataset1 order so the result is deterministic
            columns = list(dict.fromkeys(
                col for col in dataset1.columns if col in dataset2.columns
            ))
        else:
            columns = list(columns)
            for col in columns:
                self._require_column(dataset1, dataset2, col)

        total = len(columns)
        batch_results = {}

        self.overlay.show(overlay_type='progress', total=total)
        try:
            self.overlay.update_status(f"Comparing {total} columns between datasets")

            for i, column in enumerate(columns):
                self.overlay.update_progress(i, f"Comparing column {i+1}/{total}: '{column}'")

                numeric = (
                    self._is_histogram_dtype(dataset1[column])
                    and self._is_histogram_dtype(dataset2[column])
                )
                # The per-column computations run without their own overlay
                # so that the batch overlay stays in place.
                try:
                    if numeric:
                        result = self._compute_distribution(
                            dataset1_name, dataset2_name, dataset1, dataset2, column, 10
                        )
                        batch_results[column] = {'type': 'distribution', 'result': result}
                    else:
                        result = self._compute_value_counts(
                            dataset1_name, dataset2_name, dataset1, dataset2, column
                        )
                        batch_results[column] = {'type': 'value_counts', 'result': result}
                except Exception as e:
                    # Best effort: record the failure and continue with the next column
                    batch_results[column] = {'type': 'error', 'error': str(e)}

            self.overlay.update_progress(total, "Comparison complete!")
            self._pause()
        finally:
            self.overlay.hide()

        return batch_results

## Apply Processing Overlay to Comparisons

Now we'll integrate the processing overlay with the comparison operations to provide visual feedback during execution.

In [None]:
# Generate sample datasets for demonstration
def generate_sample_datasets(n_rows=1000, seed=None):
    """
    Generate two random sample datasets for the comparison demo

    Parameters:
    -----------
    n_rows : int
        Number of rows in each dataset
    seed : int, optional
        Seed for a dedicated random generator, making the output
        reproducible. When None, NumPy's global random state is used.

    Returns:
    --------
    tuple of (pd.DataFrame, pd.DataFrame)
        The base dataset and a second dataset with a shifted numeric
        distribution, an extra category 'E' and a different flag proportion
    """
    print("Generating sample datasets for comparison...")

    # The np.random module and RandomState expose the same normal/choice calls
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dataset 1: Base dataset
    df1 = pd.DataFrame({
        'id': range(1, n_rows + 1),
        'numeric_value': rng.normal(50, 15, n_rows),
        'category': rng.choice(['A', 'B', 'C', 'D'], n_rows, p=[0.4, 0.3, 0.2, 0.1]),
        'binary_flag': rng.choice([0, 1], n_rows, p=[0.7, 0.3]),
    })

    # Dataset 2: Similar to df1 but with some differences
    df2 = pd.DataFrame({
        'id': range(1, n_rows + 1),
        'numeric_value': rng.normal(55, 17, n_rows),  # Slightly different distribution
        'category': rng.choice(['A', 'B', 'C', 'D', 'E'], n_rows, p=[0.35, 0.25, 0.2, 0.1, 0.1]),  # Added a new category
        'binary_flag': rng.choice([0, 1], n_rows, p=[0.6, 0.4]),  # Different proportion
    })

    print("Sample datasets generated!")
    return df1, df2

# Demonstrate the use of the comparator with overlay
def run_comparison_demo():
    """Load two sample datasets into a DataComparator, run all three comparisons, return the comparator"""
    # Pair each generated frame with the name it is registered under
    frames = dict(zip(("original", "modified"), generate_sample_datasets()))

    comparator = DataComparator()

    # Register the datasets one by one while a progress overlay is shown
    print("Loading datasets into comparator...")
    loading = ProcessingOverlay("Dataset Loading")
    loading.show(overlay_type='progress', total=2)

    for done, (label, frame) in enumerate(frames.items()):
        loading.update_progress(done, f"Loading dataset {done + 1}...")
        time.sleep(1)  # Simulate loading time
        comparator.add_dataset(label, frame)

    loading.update_progress(2, "Datasets loaded successfully!")
    time.sleep(0.5)
    loading.hide()

    # Each comparison stores its result on the comparator
    print("\n1. Running single column distribution comparison...")
    comparator.column_distribution_comparison("original", "modified", "numeric_value", bins=15)

    print("\n2. Running value counts comparison...")
    comparator.value_counts_comparison("original", "modified", "category")

    print("\n3. Running batch comparison of all columns...")
    comparator.batch_column_comparison("original", "modified")

    return comparator

## Visualize Results

Create visualizations of the comparison results after processing is complete.

# Custom Comparison with Processing Overlay

This notebook demonstrates how to implement a custom comparison operation with a loading/processing overlay to indicate when long-running operations are in progress.

## Import Required Libraries

Import necessary libraries for data processing, visualization, and UI components.

In [None]:
# Standard libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython and Jupyter widgets for the overlay
from IPython.display import display, clear_output
import ipywidgets as widgets

# Progress bar
from tqdm.notebook import tqdm

## Load Data

Create the two sample datasets that will be used for the comparison analysis.

In [None]:
# The data below is synthetic; a real analysis would read it
# from a file or an API instead.

# First sample dataset
data1 = pd.DataFrame(dict(
    ID=range(1, 1001),
    Value=np.random.normal(loc=100, scale=15, size=1000),
    Category=np.random.choice(['A', 'B', 'C', 'D'], size=1000),
    Date=pd.date_range(start='2023-01-01', periods=1000),
    Status=np.random.choice(['Active', 'Inactive', 'Pending'], size=1000),
))

# Second sample dataset: same layout, deliberately different contents
data2 = pd.DataFrame(dict(
    ID=range(1, 1001),
    Value=np.random.normal(loc=105, scale=17, size=1000),  # Shifted mean and wider spread
    Category=np.random.choice(['A', 'B', 'C', 'D'], size=1000),
    Date=pd.date_range(start='2023-01-01', periods=1000),
    Status=np.random.choice(['Active', 'Inactive', 'Pending', 'New'], size=1000),  # Extra 'New' status
))

# Preview the first rows of both datasets
print("Dataset 1 Sample:")
display(data1.head())

print("\nDataset 2 Sample:")
display(data2.head())

## Basic Data Processing

Prepare and clean the data for the comparison operation.

In [None]:
# Function to preprocess datasets before comparison
def preprocess_data(df, name):
    """
    Return a cleaned copy of the dataframe, leaving the input untouched
    - Missing 'Value' entries become the column mean; missing 'Category'
      and 'Status' entries become 'Unknown'
    - 'ID' is cast to int, 'Value' to float and 'Date' to datetime
    - A 'Source' column holding the given name is appended
    """
    # fillna without inplace returns a new frame, so df itself is not modified
    replacements = {
        'Value': df['Value'].mean(),
        'Category': 'Unknown',
        'Status': 'Unknown'
    }
    cleaned = df.fillna(value=replacements)

    # Normalise the column types in one pass, then parse the dates
    cleaned = cleaned.astype({'ID': int, 'Value': float})
    cleaned['Date'] = pd.to_datetime(cleaned['Date'])

    # Tag every row with the dataset it came from
    cleaned['Source'] = name

    print(f"Preprocessing complete for {name}")
    return cleaned

# Run the same preprocessing over both datasets
data1_processed, data2_processed = (
    preprocess_data(frame, label)
    for frame, label in ((data1, "Dataset 1"), (data2, "Dataset 2"))
)

# Show descriptive statistics for each processed dataset
print("\nDataset 1 Summary:")
display(data1_processed.describe())

print("\nDataset 2 Summary:")
display(data2_processed.describe())

## Custom Comparison with Processing Overlay

Implement a custom comparison function with a loading/processing overlay to indicate when long-running operations are in progress.

In [None]:
# Create a loading overlay widget
def create_loading_widget(message="Processing..."):
    """Return a horizontal box holding a spinner icon followed by the message as a heading"""
    icon_markup = '<i class="fa fa-spinner fa-spin" style="font-size:24px;color:blue"></i>'
    # Right-hand margin keeps the icon from touching the heading
    icon = widgets.HTML(
        value=icon_markup,
        layout=widgets.Layout(margin='0 10px 0 0')
    )
    heading = widgets.HTML(value=f"<h3>{message}</h3>")
    return widgets.HBox([icon, heading])

# Function to perform comparison with an overlay
def compare_datasets_with_overlay(df1, df2, id_column='ID', step_delay=1):
    """
    Compare two datasets with a loading overlay

    Parameters:
    -----------
    df1, df2 : pd.DataFrame
        Datasets to compare. Rows are matched on id_column.
        NOTE(review): IDs are assumed to be unique within each dataset - confirm.
    id_column : str
        Name of the column identifying a record in both datasets
    step_delay : float
        Seconds to pause after each of the five steps (simulated processing
        time). Use 0 to disable the pauses.

    Returns:
    --------
    tuple of (pd.DataFrame, pd.DataFrame)
        - Per-record comparison for the IDs present in both datasets:
          '<col>_diff' / '<col>_pct_diff' for shared numeric columns,
          '<col>_match' / '<col>_df1' / '<col>_df2' for shared object
          columns, and 'total_mismatches' (always present)
        - Per-column distribution summary for the shared numeric columns
    """
    # Create and display loading widget
    loading_widget = create_loading_widget("Comparing datasets. Please wait...")
    display(loading_widget)

    try:
        with tqdm(total=5, desc="Comparison Progress") as progress:

            def finish_step():
                if step_delay:
                    time.sleep(step_delay)  # Simulate processing time
                progress.update(1)

            # Step 1: work out which IDs the two datasets share
            ids1 = set(df1[id_column])
            ids2 = set(df2[id_column])
            common_ids = ids1 & ids2

            print(f"IDs in Dataset 1 but not in Dataset 2: {len(ids1 - ids2)}")
            print(f"IDs in Dataset 2 but not in Dataset 1: {len(ids2 - ids1)}")
            print(f"Common IDs: {len(common_ids)}")

            # Index both sides by ID once; every later step reuses these frames
            left = _rows_for_ids(df1, id_column, common_ids)
            right = _rows_for_ids(df2, id_column, common_ids)
            if not left.index.equals(right.index):
                # Same IDs in a different order: line right up with left
                right = right.reindex(left.index)
            comparison_results = pd.DataFrame(index=left.index)
            finish_step()

            # Step 2: numerical differences for common IDs
            _add_numeric_differences(comparison_results, left, right)
            finish_step()

            # Step 3: categorical matches for common IDs
            _add_categorical_matches(comparison_results, left, right)
            finish_step()

            # Step 4: distribution statistics over the full datasets
            distribution_df = _summarize_distributions(df1, df2, id_column)
            finish_step()

            # Step 5: make ID a column again and count mismatches per record
            comparison_results = comparison_results.reset_index().rename(
                columns={'index': id_column}
            )
            match_columns = [
                col for col in comparison_results.columns
                if isinstance(col, str) and col.endswith('_match')
            ]
            if match_columns:
                comparison_results['total_mismatches'] = (
                    len(match_columns) - comparison_results[match_columns].sum(axis=1)
                )
            else:
                # No categorical columns to compare: nothing can mismatch,
                # but callers still rely on the column being present
                comparison_results['total_mismatches'] = 0
            finish_step()
    except Exception as e:
        # Replace the loading widget with the error, then let it propagate
        clear_output(wait=True)
        print(f"Error during comparison: {str(e)}")
        raise

    # clear_output(wait=True) defers the clear until the next output, so it
    # must come before the summary - clearing afterwards would erase it.
    clear_output(wait=True)
    mismatched = int((comparison_results['total_mismatches'] > 0).sum())
    print(f"Comparison complete! Found differences in {mismatched} records.")

    return comparison_results, distribution_df


def _rows_for_ids(df, id_column, ids):
    """Return the rows of df whose ID is in ids, indexed by id_column"""
    return df[df[id_column].isin(ids)].set_index(id_column)


def _add_numeric_differences(results, left, right):
    """Add '<col>_diff' and '<col>_pct_diff' to results for shared numeric columns"""
    for col in left.select_dtypes(include=['number']).columns:
        if col not in right.columns:
            continue
        diff = left[col] - right[col]
        results[f"{col}_diff"] = diff
        results[f"{col}_pct_diff"] = (diff / right[col]) * 100


def _add_categorical_matches(results, left, right):
    """Add '<col>_match', '<col>_df1' and '<col>_df2' to results for shared object columns"""
    for col in left.select_dtypes(include=['object']).columns:
        if col == 'Source' or col not in right.columns:
            continue
        results[f"{col}_match"] = (left[col] == right[col])
        # Keep the actual values next to the flag for inspection
        results[f"{col}_df1"] = left[col]
        results[f"{col}_df2"] = right[col]


def _summarize_distributions(df1, df2, id_column):
    """Return one row of summary-statistic differences per shared numeric column"""
    summary = {}
    for col in df1.select_dtypes(include=['number']).columns:
        if col == id_column or col not in df2.columns:
            continue
        stats_df1 = df1[col].describe()
        stats_df2 = df2[col].describe()
        mean_diff = stats_df1['mean'] - stats_df2['mean']
        summary[col] = {
            'mean_diff': mean_diff,
            'std_diff': stats_df1['std'] - stats_df2['std'],
            'min_df1': stats_df1['min'],
            'min_df2': stats_df2['min'],
            'max_df1': stats_df1['max'],
            'max_df2': stats_df2['max'],
            # Mean shift expressed in standard deviations of the first dataset
            'distribution_shift': abs(mean_diff) / stats_df1['std']
        }
    return pd.DataFrame(summary).T

# Execute the overlay-backed comparison on the preprocessed datasets
comparison_results, distribution_summary = compare_datasets_with_overlay(
    df1=data1_processed,
    df2=data2_processed,
)

# Show the first ten records that have at least one mismatch
print("Comparison Results (showing records with differences):")
display(comparison_results.loc[lambda frame: frame['total_mismatches'] > 0].head(10))

print("\nDistribution Differences Summary:")
display(distribution_summary)

## Visualization of Results

Create visualizations to display the comparison results once processing is complete.

In [None]:
# Use seaborn's "whitegrid" style for the plots drawn after this point
sns.set(style="whitegrid")

# Helper: grouped bar chart of per-dataset value counts for a single column
def _plot_count_comparison(df1, df2, column, title):
    """Draw a grouped bar chart comparing how often each `column` value occurs in df1 and df2."""
    plt.figure(figsize=(14, 6))

    # Count the values in each dataset and tag every count with its origin
    labelled_counts = []
    for source_label, frame in (('Dataset 1', df1), ('Dataset 2', df2)):
        counts = frame[column].value_counts().reset_index()
        counts.columns = [column, 'Count']
        counts['Source'] = source_label
        labelled_counts.append(counts)

    # Create grouped bar chart from the combined counts
    sns.barplot(x=column, y='Count', hue='Source', data=pd.concat(labelled_counts))
    plt.title(title)
    plt.tight_layout()


# Function to create visualizations for comparison results
def visualize_comparison_results(comparison_df, dist_summary, df1, df2, id_column='ID'):
    """
    Create various visualizations for the comparison results

    A loading widget is displayed while the first chart is prepared and is
    replaced by that chart. Every later chart is appended below the earlier
    ones, so all charts remain visible when the function finishes.

    Parameters:
    -----------
    comparison_df : pd.DataFrame
        Comparison results. Must contain 'total_mismatches'; the scatter plot
        is drawn only when both 'Value_diff' and `id_column` are present.
    dist_summary : pd.DataFrame
        Not used by this function; kept so existing callers keep working.
    df1, df2 : pd.DataFrame
        The two compared datasets. Both must contain the columns 'Value',
        'Source', 'Category' and 'Status'; df1 must also contain `id_column`
        when the scatter plot is drawn.
    id_column : str
        Name of the record-identifier column (default 'ID').

    Returns:
    --------
    None
        Charts are rendered as a side effect. Exceptions raised while plotting
        are caught and reported with a printed message rather than re-raised.
    """

    # Create loading widget for visualizations
    loading_widget = create_loading_widget("Generating visualizations. Please wait...")
    display(loading_widget)
    widget_visible = True

    try:
        # 1. Distribution of mismatches
        plt.figure(figsize=(10, 6))
        sns.histplot(comparison_df['total_mismatches'], kde=True)
        plt.title('Distribution of Mismatches per Record')
        plt.xlabel('Number of Mismatches')
        plt.ylabel('Count')
        time.sleep(1)  # Simulate processing time
        # Clear exactly once: the first chart takes the place of the loading
        # widget. Clearing again before later charts would erase this one.
        clear_output(wait=True)
        widget_visible = False
        plt.show()

        # 2. Comparison of Value distributions
        plt.figure(figsize=(12, 6))

        # Combine data for side-by-side comparison
        combined_data = pd.concat([
            df1[['Value', 'Source']],
            df2[['Value', 'Source']]
        ])

        # Create the violin plot
        sns.violinplot(x='Source', y='Value', data=combined_data)
        plt.title('Comparison of Value Distributions')
        plt.tight_layout()
        time.sleep(1)  # Simulate processing time
        plt.show()

        # 3. Bar chart of category distributions
        _plot_count_comparison(df1, df2, 'Category', 'Category Distribution Comparison')
        time.sleep(1)  # Simulate processing time
        plt.show()

        # 4. Status distribution comparison
        _plot_count_comparison(df1, df2, 'Status', 'Status Distribution Comparison')
        time.sleep(1)  # Simulate processing time
        plt.show()

        # 5. Correlation of differences in numerical values
        # (needs the ID column to look up the original values in df1)
        if 'Value_diff' in comparison_df.columns and id_column in comparison_df.columns:
            plt.figure(figsize=(8, 8))

            # Create a scatter plot of original values vs differences
            plt.scatter(
                df1.set_index(id_column).loc[comparison_df[id_column], 'Value'],
                comparison_df['Value_diff'],
                alpha=0.5
            )
            plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
            plt.title('Value Differences vs Original Values')
            plt.xlabel('Original Value (Dataset 1)')
            plt.ylabel('Difference (Dataset 1 - Dataset 2)')
            plt.tight_layout()
            time.sleep(1)  # Simulate processing time
            plt.show()

        print("Visualization complete!")

    except Exception as e:
        # Only clear when the loading widget is still on screen, so charts
        # that were already rendered stay visible above the error message.
        if widget_visible:
            clear_output(wait=True)
            widget_visible = False
        print(f"Error during visualization: {str(e)}")
        # Discard the figure that was under construction, if any
        plt.close()
    finally:
        # Ensure the loading widget is cleared even for exceptions that the
        # handler above does not catch (e.g. KeyboardInterrupt)
        if widget_visible:
            clear_output(wait=True)

# Run the visualization function
visualize_comparison_results(comparison_results, distribution_summary, data1_processed, data2_processed)

## Conclusion

In this notebook, we implemented a custom comparison function with a loading/processing overlay that:

1. Compares two datasets across multiple dimensions (numerical differences, categorical mismatches)
2. Shows a progress indicator during the comparison process
3. Creates visualizations to help understand the differences between datasets
4. Provides a summary of the differences found

This approach is particularly useful for:
- Large datasets where comparisons might take significant time
- Datasets with complex structures that require multiple comparison steps
- Situations where users need feedback on processing status
- Analysis requiring visual interpretation of differences

Further improvements could include:
- Adding more sophisticated statistical comparisons
- Implementing interactive widgets for exploring differences
- Adding export options for the comparison results
- Optimizing the comparison algorithm for very large datasets