# Pandas Engine Testing and Optimization

This notebook demonstrates pandas performance characteristics, memory optimization techniques, and best practices for CSV processing in the ML Pipeline Framework.

## Topics Covered:
- 📊 CSV data loading with pandas
- 💾 Memory usage profiling and optimization
- ⏱️ Performance benchmarks for common operations
- 🔧 Chunked processing for large datasets
- 🎯 Optimization techniques (categorical dtypes, etc.)
- 📈 Scalability analysis

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import psutil
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
sys.path.insert(0, os.path.abspath('..'))

# ML Pipeline Framework imports - basic ones only
from src.utils.logging_config import setup_logging

# Configure plotting: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Report library versions and host memory (psutil reports bytes; 1024**3 -> GB)
total_bytes = psutil.virtual_memory().total
available_bytes = psutil.virtual_memory().available

print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"System memory: {total_bytes / 1024**3:.1f} GB")
print(f"Available memory: {available_bytes / 1024**3:.1f} GB")

## Test Data Generation

In [None]:
def generate_test_data(n_rows=100000, save_path=None):
    """Generate a reproducible synthetic DataFrame for performance testing.

    The frame mixes integer, float, string, boolean, datetime and free-text
    columns, then blanks out 5% of the rows (rounded down) in ``score``,
    ``description`` and ``category_medium``.

    Args:
        n_rows: Number of rows to generate.
        save_path: Optional CSV destination. A parent directory is created
            when the path contains one; a bare file name is written to the
            current working directory.

    Returns:
        The generated ``pandas.DataFrame``.

    Note:
        Re-seeds NumPy's global random generator with 42 on every call, so
        repeated calls with the same ``n_rows`` return the same data.
    """

    np.random.seed(42)

    # Size of the label pool for the high-cardinality string column. Clamped
    # to 1 so datasets with fewer than 10 rows do not sample from an empty pool.
    n_items = max(1, n_rows // 10)

    # Generate data with different data types.
    # The order of the np.random calls below determines the generated values.
    data = {
        # Integer columns (different ranges for dtype optimization testing)
        'id': range(n_rows),
        'small_int': np.random.randint(0, 100, n_rows),  # Can be uint8
        'medium_int': np.random.randint(0, 10000, n_rows),  # Can be uint16
        'large_int': np.random.randint(0, 1000000, n_rows),  # uint32

        # Float columns
        'price': np.random.lognormal(4, 1, n_rows),  # Price-like distribution
        'score': np.random.normal(75, 15, n_rows),  # Score with normal distribution
        'ratio': np.random.beta(2, 5, n_rows),  # Ratio between 0-1

        # String columns (different cardinalities).
        # NOTE: the "high"/"low" suffixes do not describe the cardinality:
        # 'category_high' has only 3 distinct values, 'category_low' has the most.
        'category_high': np.random.choice(['A', 'B', 'C'], n_rows),  # 3 values - good for categorical
        'category_medium': np.random.choice([f'Cat_{i}' for i in range(50)], n_rows),  # 50 values
        'category_low': np.random.choice([f'Item_{i}' for i in range(n_items)], n_rows),  # ~n_rows/10 values

        # Boolean column
        'is_active': np.random.choice([True, False], n_rows),

        # Date column: hourly timestamps. An offset object is used instead of
        # the 'H' alias, which recent pandas versions deprecate.
        'created_date': pd.date_range('2020-01-01', periods=n_rows, freq=pd.offsets.Hour()),

        # Text column
        'description': [f'Description for item {i} with some text content' for i in range(n_rows)]
    }

    df = pd.DataFrame(data)

    # Add some missing values (5% of rows per column, chosen independently)
    missing_cols = ['score', 'description', 'category_medium']
    for col in missing_cols:
        missing_indices = np.random.choice(df.index, size=int(0.05 * len(df)), replace=False)
        df.loc[missing_indices, col] = np.nan

    if save_path:
        # os.path.dirname() is '' for a bare file name, and os.makedirs('')
        # raises, so only create the directory when there is one.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(save_path, index=False)
        print(f"Test data saved to {save_path}")

    return df

# Map each dataset size to its CSV path, creating any file that is missing
test_sizes = [10000, 50000, 100000, 500000]
test_files = {size: f'../data/test_data_{size}.csv' for size in test_sizes}

for size, file_path in test_files.items():
    if os.path.exists(file_path):
        print(f"Test data with {size:,} rows already exists")
        continue

    print(f"Generating test data with {size:,} rows...")
    df = generate_test_data(size, file_path)

# List every dataset together with its on-disk size
print("\nTest files created:")
for size, path in test_files.items():
    file_size = os.path.getsize(path) / 1024**2
    print(f"  {size:,} rows: {path} ({file_size:.1f} MB)")

## Memory Usage Analysis

In [None]:
def analyze_memory_usage(df, title="DataFrame Memory Analysis"):
    """Print and return a per-column memory breakdown of ``df``.

    Args:
        df: DataFrame to inspect.
        title: Heading printed above the report.

    Returns:
        A DataFrame with one row for the index plus one row per column of
        ``df``, sorted by memory descending, with the columns ``Column``,
        ``Memory_MB``, ``Memory_Percentage`` and ``Dtype``.
    """

    print(f"\n{title}")
    print("=" * len(title))

    # deep=True inspects object columns element by element, which is costly,
    # so the scan runs once and both the total and the breakdown reuse it.
    memory_usage = df.memory_usage(deep=True)
    total_memory = memory_usage.sum()
    n_rows = len(df)

    # Overall memory usage
    print(f"Total memory usage: {total_memory / 1024**2:.2f} MB")
    print(f"Rows: {n_rows:,}, Columns: {len(df.columns)}")
    if n_rows:
        print(f"Memory per row: {total_memory / n_rows:.1f} bytes")
    else:
        # Avoid a meaningless division by zero for an empty frame
        print("Memory per row: n/a (empty DataFrame)")

    # Memory usage by column. memory_usage() lists the index first, hence the
    # leading 'Index' entry in the label and dtype lists.
    memory_df = pd.DataFrame({
        'Column': ['Index'] + df.columns.tolist(),
        'Memory_MB': memory_usage / 1024**2,
        'Memory_Percentage': (memory_usage / total_memory) * 100,
        'Dtype': ['Index'] + [str(dtype) for dtype in df.dtypes]
    })

    memory_df = memory_df.sort_values('Memory_MB', ascending=False)

    print("\nMemory usage by column:")
    print(memory_df.to_string(index=False, float_format='%.2f'))

    return memory_df

# Read the 100,000-row dataset and report its memory footprint as loaded
sample_path = test_files[100000]
sample_df = pd.read_csv(sample_path)
memory_analysis = analyze_memory_usage(sample_df, title="Original Data Memory Usage")

## Data Type Optimization

In [None]:
def optimize_dtypes(df):
    """Return a copy of ``df`` with columns converted to smaller dtypes.

    Rules, applied per column:

    * datetime columns are left untouched;
    * integer columns are cast to the narrowest of uint8/uint16/uint32 (when
      the minimum is non-negative) or int8/int16/int32 (otherwise) whose
      range contains every value; columns that fit none are left as they are;
    * float columns are downcast with ``pd.to_numeric(..., downcast='float')``,
      which may reduce precision;
    * object/string columns become ``category`` when fewer than 50% of the
      rows are distinct values and there are fewer than 1000 distinct values.

    Args:
        df: Source DataFrame. It is not modified.

    Returns:
        The optimized copy.
    """

    unsigned_dtypes = ('uint8', 'uint16', 'uint32')
    signed_dtypes = ('int8', 'int16', 'int32')

    optimized_df = df.copy()

    print("Optimizing data types...")

    for col in optimized_df.columns:
        series = optimized_df[col]

        # Skip datetime columns
        if pd.api.types.is_datetime64_any_dtype(series):
            continue

        # Optimize integers
        if pd.api.types.is_integer_dtype(series):
            # Nullable integer columns that contain NA cannot be cast to a
            # NumPy integer dtype, so they are left alone.
            if series.isna().any():
                continue

            min_val = series.min()
            max_val = series.max()
            # An empty column has no min/max to compare against
            if pd.isna(min_val) or pd.isna(max_val):
                continue

            candidates = unsigned_dtypes if min_val >= 0 else signed_dtypes
            for dtype_name in candidates:
                limits = np.iinfo(dtype_name)
                # Inclusive bounds: 255 fits uint8, 65535 fits uint16, ...
                if limits.min <= min_val and max_val <= limits.max:
                    optimized_df[col] = series.astype(dtype_name)
                    break

        # Optimize floats
        elif pd.api.types.is_float_dtype(series):
            optimized_df[col] = pd.to_numeric(series, downcast='float')

        # Convert to categorical for low cardinality strings. Dedicated string
        # dtypes are accepted as well as plain object columns.
        elif pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            if series.notna().sum() > 0:  # Skip if all NaN
                unique_count = series.nunique()
                total_count = len(series)

                # Convert to categorical if less than 50% unique values and < 1000 categories
                if unique_count / total_count < 0.5 and unique_count < 1000:
                    source_dtype = series.dtype
                    optimized_df[col] = series.astype('category')
                    print(f"  {col}: {source_dtype} -> category ({unique_count} categories)")

    return optimized_df

# Footprint of the data as loaded, measured before the optimized copy is built
original_memory = sample_df.memory_usage(deep=True).sum()

# Build the dtype-optimized copy and print its per-column breakdown
optimized_df = optimize_dtypes(sample_df)
optimized_memory = analyze_memory_usage(optimized_df, title="Optimized Data Memory Usage")

# Relative saving, in percent of the original footprint
new_memory = optimized_df.memory_usage(deep=True).sum()
savings = (original_memory - new_memory) / original_memory * 100

print(f"\n📊 Memory Optimization Results:")
print(f"Original memory: {original_memory / 1024**2:.2f} MB")
print(f"Optimized memory: {new_memory / 1024**2:.2f} MB")
print(f"Memory savings: {savings:.1f}%")

## Performance Benchmarking

In [None]:
def benchmark_operation(df, operation_name, operation_func, *args, **kwargs):
    """Time one pandas operation and report the change in process memory.

    Args:
        df: DataFrame passed as the first argument to ``operation_func``.
        operation_name: Label stored in the result under ``'operation'``.
        operation_func: Callable invoked as ``operation_func(df, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``operation_func``.
        **kwargs: Extra keyword arguments forwarded to ``operation_func``.

    Returns:
        A dict with the keys ``operation``, ``duration_seconds``,
        ``memory_used_mb``, ``rows_processed`` and ``rows_per_second``.
        ``memory_used_mb`` is the difference in resident set size sampled
        before and after the call (not a true peak) and may be negative.
        ``rows_per_second`` is ``inf`` when the measured duration is zero.
    """

    # Resident set size before the operation
    process = psutil.Process()
    rss_before_mb = process.memory_info().rss / 1024**2

    # perf_counter() is the clock intended for measuring short durations;
    # time.time() can be coarse and can jump when the system clock changes.
    start_time = time.perf_counter()
    # Keep a reference so the result is still allocated when RSS is sampled below
    result = operation_func(df, *args, **kwargs)
    duration = time.perf_counter() - start_time

    # Resident set size after the operation
    rss_after_mb = process.memory_info().rss / 1024**2
    memory_used = rss_after_mb - rss_before_mb

    rows = len(df)

    return {
        'operation': operation_name,
        'duration_seconds': duration,
        'memory_used_mb': memory_used,
        'rows_processed': rows,
        'rows_per_second': rows / duration if duration > 0 else float('inf')
    }

# Define benchmark operations
def groupby_operation(df):
    """Aggregate ``price`` (mean, sum, count) for each ``category_high`` group."""
    price_by_category = df.groupby('category_high')['price']
    return price_by_category.agg(['mean', 'sum', 'count'])

def join_operation(df, lookup_rows=1000):
    """Left-join ``df`` against a lookup table built from its own first rows.

    Args:
        df: DataFrame with at least the columns ``id`` and ``category_high``.
        lookup_rows: Number of leading rows of ``df`` used as the lookup
            table. Defaults to 1000.

    Returns:
        ``df`` with an extra ``category_high_lookup`` column, which is
        missing for rows whose ``id`` is not in the lookup table.
    """
    # Self join for demonstration
    lookup_df = df[['id', 'category_high']].head(lookup_rows)
    return df.merge(lookup_df, on='id', how='left', suffixes=('', '_lookup'))

def aggregation_operation(df):
    """Compute whole-column statistics for ``price``, ``score`` and ``small_int``."""
    # Column name -> statistic(s) requested for that column
    column_stats = {
        'price': ['mean', 'std', 'min', 'max'],
        'score': ['mean', 'median'],
        'small_int': 'sum'
    }
    return df.agg(column_stats)

def filter_operation(df):
    """Keep rows whose ``price`` exceeds the median and whose ``is_active`` equals True."""
    median_price = df['price'].quantile(0.5)
    above_median = df['price'] > median_price
    active = df['is_active'] == True
    return df[above_median & active]

def sort_operation(df):
    """Sort by ``price`` descending, breaking ties by ``score`` ascending."""
    sort_keys = ['price', 'score']
    ascending_flags = [False, True]
    return df.sort_values(by=sort_keys, ascending=ascending_flags)

# Run benchmarks on different dataset sizes
benchmark_results = []
operations = [
    ('GroupBy', groupby_operation),
    ('Join', join_operation),
    ('Aggregation', aggregation_operation),
    ('Filter', filter_operation),
    ('Sort', sort_operation)
]

print("Running performance benchmarks...")
print("This may take a few minutes...\n")

for size in [10000, 50000, 100000][:2]:  # Limit to smaller sizes for demo
    print(f"Benchmarking with {size:,} rows...")

    # Load data
    df = pd.read_csv(test_files[size])

    # Optimize dtypes
    df_optimized = optimize_dtypes(df)

    for op_name, op_func in operations:
        try:
            # Both variants are recorded under the same 'operation' label; the
            # 'optimization' field tells them apart. Putting the variant into
            # the label as well would keep them from lining up in the summary
            # table below (each row would have one value and one NaN).
            result_orig = benchmark_operation(df, op_name, op_func)
            result_orig['dataset_size'] = size
            result_orig['optimization'] = 'Original'
            benchmark_results.append(result_orig)

            # Benchmark optimized dataframe
            result_opt = benchmark_operation(df_optimized, op_name, op_func)
            result_opt['dataset_size'] = size
            result_opt['optimization'] = 'Optimized'
            benchmark_results.append(result_opt)

            print(f"  {op_name}: {result_orig['duration_seconds']:.3f}s -> {result_opt['duration_seconds']:.3f}s")

        except Exception as e:
            # Best effort: report the failure and continue with the next operation
            print(f"  {op_name}: Error - {e}")

# Convert results to DataFrame for analysis
benchmark_df = pd.DataFrame(benchmark_results)
print("\n📊 Benchmark Results Summary:")
if benchmark_df.empty:
    # Without results there are no columns to group by
    print("No benchmark results were collected")
else:
    # Rows: operation, columns: Original / Optimized, values: mean duration in seconds
    print(benchmark_df.groupby(['operation', 'optimization'])['duration_seconds'].mean().unstack())

## Chunked Processing Demonstration

In [None]:
def process_csv_in_chunks(file_path, chunk_size=10000, operation_func=None):
    """Read a CSV file chunk by chunk, optionally applying a function to each chunk.

    Progress (including the process's resident memory) is printed every
    10 chunks, and a throughput summary is printed at the end.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows per chunk.
        operation_func: Optional callable invoked with each chunk DataFrame.

    Returns:
        The list of ``operation_func`` return values, one per chunk, in file
        order. Empty when ``operation_func`` is ``None``.
    """

    print(f"Processing {file_path} in chunks of {chunk_size:,} rows...")

    # perf_counter() is the clock intended for measuring elapsed time
    start_time = time.perf_counter()
    chunk_count = 0
    total_rows = 0
    results = []

    # The context manager closes the underlying file handle even if
    # operation_func raises part-way through the file.
    with pd.read_csv(file_path, chunksize=chunk_size) as chunk_reader:
        for chunk_count, chunk in enumerate(chunk_reader, start=1):
            total_rows += len(chunk)

            # Apply operation to chunk if provided
            if operation_func is not None:
                results.append(operation_func(chunk))

            # Memory monitoring
            if chunk_count % 10 == 0:
                memory_usage = psutil.Process().memory_info().rss / 1024**2
                print(f"  Processed {chunk_count} chunks ({total_rows:,} rows), Memory: {memory_usage:.1f} MB")

    duration = time.perf_counter() - start_time
    # Guard against a zero reading on very small files
    throughput = total_rows / duration if duration > 0 else float('inf')

    print(f"✅ Completed: {chunk_count} chunks, {total_rows:,} total rows in {duration:.2f}s")
    print(f"   Throughput: {throughput:,.0f} rows/second")

    return results

# Example: Calculate summary statistics using chunked processing
def chunk_summary_stats(chunk):
    """Summarise one chunk: row count, price mean and sum, and the sum of ``is_active``."""
    prices = chunk['price']

    stats = {'row_count': len(chunk)}
    stats['price_mean'] = prices.mean()
    stats['price_sum'] = prices.sum()
    stats['active_count'] = chunk['is_active'].sum()
    return stats

# Run the per-chunk summary over the 100,000-row file, 5,000 rows at a time
chunk_results = process_csv_in_chunks(
    test_files[100000],
    chunk_size=5000,
    operation_func=chunk_summary_stats,
)

# Combine the per-chunk summaries into whole-file figures
if chunk_results:
    row_counts = [stats['row_count'] for stats in chunk_results]
    total_rows = sum(row_counts)

    # Chunk means are weighted by chunk size before averaging
    weighted_price_total = sum(stats['price_mean'] * stats['row_count'] for stats in chunk_results)
    weighted_price_mean = weighted_price_total / total_rows

    total_price_sum = sum(stats['price_sum'] for stats in chunk_results)
    total_active = sum(stats['active_count'] for stats in chunk_results)

    print(f"\n📈 Aggregated Results:")
    print(f"Total rows processed: {total_rows:,}")
    print(f"Average price: ${weighted_price_mean:.2f}")
    print(f"Total price sum: ${total_price_sum:,.2f}")
    print(f"Active records: {total_active:,} ({total_active/total_rows*100:.1f}%)")

## CSV Processing with Plain pandas (CSV Connector Stand-in)

In [None]:
# Test basic CSV processing since advanced CSV connector may not be available
# NOTE: csv_config is only defined here; nothing in this cell reads it.
csv_config = {
    'file_paths': [test_files[50000]],
    'chunk_size': 10000,
    'optimize_dtypes': True,
    'validate_headers': True
}

print("Testing CSV processing with pandas...")

# Locate the file and report its size on disk
file_path = test_files[50000]
print(f"File path: {file_path}")

file_size = os.path.getsize(file_path) / 1024**2
print(f"File size: {file_size:.1f} MB")

# Time a plain, whole-file pandas load
start_time = time.time()
df_test = pd.read_csv(file_path)
load_time = time.time() - start_time

loaded_mb = df_test.memory_usage(deep=True).sum() / 1024**2

print(f"\n📊 Data loaded:")
print(f"  Rows: {len(df_test):,}")
print(f"  Columns: {len(df_test.columns)}")
print(f"  Memory: {loaded_mb:.2f} MB")
print(f"  Load time: {load_time:.3f}s")

# Read the same file in chunks: detail the first three, stop after ten
print("\nTesting chunked reading...")
chunk_count = 0
chunk_size = 5000

for chunk_count, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size), start=1):
    if chunk_count <= 3:  # Show first 3 chunks
        chunk_mb = chunk.memory_usage(deep=True).sum() / 1024**2
        print(f"  Chunk {chunk_count}: {len(chunk)} rows, {chunk_mb:.1f} MB")
    elif chunk_count == 4:
        print("  ...")
    if chunk_count >= 10:  # Limit for demo
        break

print(f"Processed {chunk_count} chunks")

## Scalability Analysis

In [None]:
# Analyze how pandas performance scales with data size
scalability_results = []

print("Analyzing pandas scalability...")

for size in [10000, 50000, 100000]:
    print(f"\nTesting with {size:,} rows...")

    # Load data. perf_counter() is the clock intended for measuring elapsed
    # time; time.time() can be coarse and can jump with clock adjustments.
    start_time = time.perf_counter()
    df = pd.read_csv(test_files[size])
    load_time = time.perf_counter() - start_time

    # Memory usage.
    # NOTE: per-row figures use the nominal `size`; this assumes the file
    # really holds that many rows.
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    memory_kb_per_row = memory_mb / size * 1024

    # Guard against a zero timer reading
    load_throughput = size / load_time if load_time > 0 else float('inf')

    # Simple operations timing
    ops_start = time.perf_counter()

    # Basic operations (results are only computed for timing purposes)
    summary = df.describe()
    filtered = df[df['price'] > df['price'].median()]
    grouped = df.groupby('category_high')['price'].mean()

    ops_time = time.perf_counter() - ops_start

    result = {
        'rows': size,
        'load_time': load_time,
        'memory_mb': memory_mb,
        'operations_time': ops_time,
        'memory_per_row': memory_kb_per_row,  # KB per row
        'load_throughput': load_throughput
    }

    scalability_results.append(result)

    print(f"  Load time: {load_time:.3f}s ({load_throughput:,.0f} rows/s)")
    print(f"  Memory usage: {memory_mb:.1f} MB ({memory_kb_per_row:.1f} KB/row)")
    print(f"  Operations time: {ops_time:.3f}s")

# Create scalability DataFrame
scalability_df = pd.DataFrame(scalability_results)

print("\n📈 Scalability Analysis:")
print(scalability_df.to_string(index=False, float_format='%.3f'))

## Performance Visualization

In [None]:
# Create performance visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Pandas Performance Analysis', fontsize=16)

# 1. Memory usage vs dataset size
axes[0, 0].plot(scalability_df['rows'], scalability_df['memory_mb'], 'o-', linewidth=2, markersize=8)
axes[0, 0].set_xlabel('Number of Rows')
axes[0, 0].set_ylabel('Memory Usage (MB)')
axes[0, 0].set_title('Memory Usage vs Dataset Size')
axes[0, 0].grid(True, alpha=0.3)

# 2. Load throughput vs dataset size
axes[0, 1].plot(scalability_df['rows'], scalability_df['load_throughput'], 's-',
                color='orange', linewidth=2, markersize=8)
axes[0, 1].set_xlabel('Number of Rows')
axes[0, 1].set_ylabel('Load Throughput (rows/second)')
axes[0, 1].set_title('CSV Load Throughput')
axes[0, 1].grid(True, alpha=0.3)

# 3. Memory efficiency (KB per row)
axes[1, 0].bar(range(len(scalability_df)), scalability_df['memory_per_row'],
               color='green', alpha=0.7)
axes[1, 0].set_xlabel('Dataset Size')
axes[1, 0].set_ylabel('Memory per Row (KB)')
axes[1, 0].set_title('Memory Efficiency')
axes[1, 0].set_xticks(range(len(scalability_df)))
axes[1, 0].set_xticklabels([f"{int(rows/1000)}K" for rows in scalability_df['rows']])

# 4. Operations performance comparison
perf_comparison = pd.DataFrame()
if benchmark_results:
    benchmark_summary = pd.DataFrame(benchmark_results)

    # Operation labels may carry an '_Original' / '_Optimized' suffix. Strip
    # it so both variants of an operation share one label and are drawn side
    # by side; the 'optimization' column already tells them apart.
    benchmark_summary['operation'] = benchmark_summary['operation'].str.replace(
        r'_(Original|Optimized)$', '', regex=True
    )

    # Rows: operation, columns: Original / Optimized, values: mean duration
    perf_comparison = (
        benchmark_summary
        .groupby(['operation', 'optimization'])['duration_seconds']
        .mean()
        .unstack()
    )

if not perf_comparison.empty:
    perf_comparison.plot(kind='bar', ax=axes[1, 1], alpha=0.8)
    axes[1, 1].set_xlabel('Operation')
    axes[1, 1].set_ylabel('Duration (seconds)')
    axes[1, 1].set_title('Performance: Original vs Optimized')
    axes[1, 1].tick_params(axis='x', rotation=45)
    axes[1, 1].legend(title='Data Types')
else:
    # Placeholder panel when there is nothing to compare
    axes[1, 1].text(0.5, 0.5, 'No benchmark data available', ha='center', va='center')
    axes[1, 1].set_title('Performance Comparison')

plt.tight_layout()
plt.show()

# Memory optimization visualization (only when both analyses were run earlier)
if 'memory_analysis' in locals() and 'optimized_memory' in locals():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Before optimization: the ten largest columns, index row excluded
    memory_analysis_viz = memory_analysis[memory_analysis['Column'] != 'Index'].head(10)
    ax1.barh(memory_analysis_viz['Column'], memory_analysis_viz['Memory_MB'])
    ax1.set_xlabel('Memory Usage (MB)')
    ax1.set_title('Memory Usage by Column (Original)')
    ax1.grid(True, alpha=0.3)

    # After optimization
    optimized_memory_viz = optimized_memory[optimized_memory['Column'] != 'Index'].head(10)
    ax2.barh(optimized_memory_viz['Column'], optimized_memory_viz['Memory_MB'], color='green')
    ax2.set_xlabel('Memory Usage (MB)')
    ax2.set_title('Memory Usage by Column (Optimized)')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Best Practices Summary

In [None]:
# Static checklist of pandas optimization advice
print("\n🎯 PANDAS OPTIMIZATION BEST PRACTICES")
print("=" * 50)

print("\n1. DATA TYPE OPTIMIZATION:")
print("   • Use smallest possible integer types (int8, int16, uint8, uint16)")
print("   • Convert float64 to float32 when precision allows")
print("   • Use categorical dtype for low-cardinality string columns")
print("   • Convert boolean-like strings to actual boolean type")

print("\n2. MEMORY MANAGEMENT:")
print("   • Monitor memory usage with df.memory_usage(deep=True)")
print("   • Use chunked processing for files > available memory")
print("   • Delete intermediate DataFrames when not needed")
print("   • Consider using copy=False for views when safe")

print("\n3. PERFORMANCE OPTIMIZATION:")
print("   • Use vectorized operations instead of loops")
print("   • Leverage pandas' built-in functions (groupby, merge, etc.)")
print("   • Set categorical columns before groupby operations")
print("   • Use appropriate chunk sizes (10K-100K rows typically optimal)")

print("\n4. CSV READING OPTIMIZATION:")
print("   • Specify dtypes explicitly to avoid inference overhead")
print("   • Use parse_dates for known date columns")
print("   • Consider usecols to read only needed columns")
print("   • Use compression (gzip, zip) for storage and I/O efficiency")

print("\n5. WHEN TO CONSIDER ALTERNATIVES:")
print("   • Data > 5GB: Consider Polars or DuckDB")
print("   • Complex SQL-like operations: Consider DuckDB")
print("   • Need lazy evaluation: Consider Polars")
print("   • Distributed processing: Consider PySpark")

# Performance summary based on our tests
if scalability_results:
    avg_load_speed = np.mean([r['load_throughput'] for r in scalability_results])
    avg_memory_per_row = np.mean([r['memory_per_row'] for r in scalability_results])

    print(f"\n📊 PERFORMANCE SUMMARY (This System):")
    print(f"   • Average CSV load speed: {avg_load_speed:,.0f} rows/second")
    print(f"   • Average memory per row: {avg_memory_per_row:.1f} KB")
    print(f"   • Recommended chunk size: {10000:,} - {50000:,} rows")

    # Only test for existence: reading `savings` before checking that it is
    # defined raises NameError when the optimization cell was not run, and a
    # truthiness test would hide a legitimate 0.0% result.
    if 'savings' in locals():
        print(f"   • Memory optimization savings: {savings:.1f}%")

print("\n✅ Pandas engine analysis complete!")