# Memory Optimization for Pediatric Appendicitis Model - Part 2

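This notebook continues the memory optimization work from Part 1. It compares model performance across optimization techniques, selects the approach with the best memory-performance trade-off, saves the resulting model, and summarizes best practices.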

In [None]:
# Import necessary libraries (from Part 1)
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from memory_profiler import profile, memory_usage
import psutil
import gc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# Add project root to path
sys.path.append('..')

# Import project modules
from src.data_processing.preprocess import load_data, handle_missing_values, optimize_memory

# Notebook-wide plot defaults: whitegrid style sheet, viridis palette, and
# default figure / title / label sizes (individual cells may override them).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

## 5. Model Performance Comparison

Let's compare model performance with and without memory optimization techniques.

In [None]:
# Load data (from memory_optimization_part1.ipynb)
# The CSV path is relative, so it resolves against the notebook's working
# directory. `df` is the unmodified frame that the later cells treat as the
# "original" (non-optimized) baseline.
data_path = '../DATA/synthetic_appendicitis_data.csv'
df = pd.read_csv(data_path)

# Basic optimization function
def optimize_datatypes(df, verbose=True):
    """Return a copy of ``df`` whose columns use smaller dtypes where possible.

    The input frame is never modified; all work happens on a copy.

    * Integer columns are cast to the narrowest of ``uint8/16/32`` (when the
      column minimum is non-negative) or ``int8/16/32`` (otherwise) whose
      range contains both the column minimum and maximum. Columns that fit
      none of these are left unchanged.
    * Float columns are passed through ``pd.to_numeric(..., downcast='float')``.
    * Object columns are converted to ``category`` when fewer than 50% of
      their values are distinct.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize.
    verbose : bool, default True
        When true, print the memory usage before and after plus the savings.

    Returns
    -------
    pandas.DataFrame
        The optimized copy.
    """
    result = df.copy()
    original_memory = result.memory_usage(deep=True).sum() / (1024 * 1024)

    # Optimize integers
    for col in result.select_dtypes(include=['int']).columns:
        col_min = result[col].min()
        col_max = result[col].max()

        # Unsigned types only when there are no negative values. For an empty
        # column min/max are NaN, every comparison below is False, and the
        # column is simply left as it is.
        if col_min >= 0:
            candidates = (np.uint8, np.uint16, np.uint32)
        else:
            candidates = (np.int8, np.int16, np.int32)

        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a minimum of exactly -128 still fits in int8.
            if limits.min <= col_min and col_max <= limits.max:
                result[col] = result[col].astype(candidate)
                break

    # Optimize floats
    for col in result.select_dtypes(include=['float']).columns:
        result[col] = pd.to_numeric(result[col], downcast='float')

    # Optimize objects (strings)
    categorical_threshold = 0.5  # Threshold for categorical conversion (50% unique values)
    total_count = len(result)
    if total_count > 0:  # a zero-row frame has no ratio to compute (avoids 0/0)
        for col in result.select_dtypes(include=['object']).columns:
            unique_count = len(result[col].unique())
            if unique_count / total_count < categorical_threshold:
                result[col] = result[col].astype('category')

    # Calculate memory savings
    optimized_memory = result.memory_usage(deep=True).sum() / (1024 * 1024)
    savings = original_memory - optimized_memory
    # Guard the percentage against a zero baseline.
    savings_percent = (savings / original_memory) * 100 if original_memory else 0.0

    if verbose:
        print(f"Original memory usage: {original_memory:.2f} MB")
        print(f"Optimized memory usage: {optimized_memory:.2f} MB")
        print(f"Memory savings: {savings:.2f} MB ({savings_percent:.1f}%)")

    return result

In [None]:
# Build the three train/test variants that the benchmark cells compare.
print("Preparing datasets for model comparison...")

feature_columns = ['Age', 'Temperature', 'WBC', 'CRP', 'Pain_Duration', 'Neutrophil_Percent']
target_column = 'Appendicitis'
# Same split settings for every variant so the partitions are comparable.
split_options = dict(test_size=0.3, random_state=42)

# Variant 1: dtypes exactly as read from the CSV
X_orig = df[feature_columns]
y_orig = df[target_column]
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_orig, y_orig, stratify=y_orig, **split_options
)

# Variant 2: same columns after data type optimization
df_opt = optimize_datatypes(df, verbose=False)
X_opt = df_opt[feature_columns]
y_opt = df_opt[target_column]
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(
    X_opt, y_opt, stratify=y_opt, **split_options
)

# Variant 3: top 4 features by ANOVA F-score. The selector is fitted on the
# training part only and then applied to the test part.
selector = SelectKBest(f_classif, k=4)
X_train_fs = selector.fit_transform(X_train_opt, y_train_opt)
X_test_fs = selector.transform(X_test_opt)
kept_positions = selector.get_support(indices=True)
selected_features = X_opt.columns[kept_positions]
print(f"Selected features: {', '.join(selected_features)}")

In [None]:
# Define function to benchmark model training and evaluation
def benchmark_model(X_train, X_test, y_train, y_test, description="",
                    n_estimators=100, random_state=42):
    """Train a RandomForest and report its time, memory, size and accuracy.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` and ``predict`` / ``predict_proba``.
    y_train, y_test : array-like
        Target values. Column 1 of ``predict_proba`` is fed to
        ``roc_auc_score``, which presumes a two-class target.
    description : str, default ""
        Label printed with the results and stored in the returned dict.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed passed to the classifier.

    Returns
    -------
    dict
        Keys: ``description``, ``model_size`` (MB), ``train_time`` (s),
        ``train_memory`` (MB), ``predict_time`` (s), ``predict_memory`` (MB),
        ``accuracy`` and ``auc``.

    Notes
    -----
    The memory figures are differences between process-wide RSS snapshots, so
    they include anything else the process allocates or frees in between and
    may be zero or negative.
    """
    # Collect garbage before benchmark so leftovers do not skew the baseline
    gc.collect()

    print(f"\nBenchmarking {description}")

    # One handle is enough; memory_info() is re-read on every call.
    process = psutil.Process()
    mem_before = process.memory_info().rss / (1024 * 1024)

    # Training time and memory. perf_counter is a monotonic, high-resolution
    # clock, so short runs do not collapse to 0.0 the way time.time() can.
    train_start = time.perf_counter()
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - train_start

    mem_after_train = process.memory_info().rss / (1024 * 1024)
    train_memory = mem_after_train - mem_before

    # Prediction time and performance
    predict_start = time.perf_counter()
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    predict_time = time.perf_counter() - predict_start

    mem_after_predict = process.memory_info().rss / (1024 * 1024)
    predict_memory = mem_after_predict - mem_after_train

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    # Get model size (get_size must be defined before this function is called)
    model_size_mb = get_size(model) / (1024 * 1024)

    # Print results
    print(f"Model Size: {model_size_mb:.2f} MB")
    print(f"Training Time: {train_time:.4f} seconds")
    print(f"Training Memory: {train_memory:.2f} MB")
    print(f"Prediction Time: {predict_time:.4f} seconds")
    print(f"Prediction Memory: {predict_memory:.2f} MB")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC: {auc:.4f}")

    return {
        'description': description,
        'model_size': model_size_mb,
        'train_time': train_time,
        'train_memory': train_memory,
        'predict_time': predict_time,
        'predict_memory': predict_memory,
        'accuracy': accuracy,
        'auc': auc
    }

In [None]:
# Define get_size function
def get_size(obj, seen=None):
    """Recursively estimate the size of ``obj`` in bytes.

    Starts from ``sys.getsizeof(obj)`` and adds the sizes of the objects it
    references: keys and values for dicts, the instance ``__dict__`` for
    objects that have one, and the items of any other iterable except
    ``str`` / ``bytes`` / ``bytearray``.

    NumPy arrays without object elements are measured from their buffer and
    are not iterated. ``sys.getsizeof`` already includes the data of an array
    that owns it; a view adds the array it is a view of (counted once), or its
    own ``nbytes`` when the buffer belongs to a non-array object.

    Parameters
    ----------
    obj : object
        Object to measure.
    seen : set of int, optional
        ``id()`` values already counted; used to count shared objects once.
        Callers normally leave this as ``None``.

    Returns
    -------
    int
        Estimated size in bytes; 0 if ``obj`` was already counted.

    Notes
    -----
    This is an estimate. Objects exposing neither ``__dict__`` nor
    ``__iter__`` contribute only ``sys.getsizeof``. Iterables that create
    their items on the fly are consumed, and those short-lived items may
    share ids.
    """
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    seen.add(obj_id)

    size = sys.getsizeof(obj)

    if isinstance(obj, np.ndarray) and not obj.dtype.hasobject:
        # Iterating a numeric array would create a temporary scalar or row
        # view per item (double counting data, recycling ids into `seen`) and
        # fails outright for 0-d arrays, so measure the buffer instead.
        if not obj.flags.owndata:
            owner = obj.base
            if isinstance(owner, np.ndarray):
                size += get_size(owner, seen)
            else:
                size += obj.nbytes
        return size

    if isinstance(obj, dict):
        size += sum(get_size(v, seen) for v in obj.values())
        size += sum(get_size(k, seen) for k in obj.keys())
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum(get_size(i, seen) for i in obj)
    return size

In [None]:
# Run the benchmark once per dataset variant and keep one result dict per run.
benchmark_cases = [
    # (X_train, X_test, y_train, y_test, description)
    (X_train_orig, X_test_orig, y_train_orig, y_test_orig, "Original Dataset"),
    (X_train_opt, X_test_opt, y_train_opt, y_test_opt, "Optimized Datatypes"),
    # The feature-selected matrices reuse the targets of the optimized split
    (X_train_fs, X_test_fs, y_train_opt, y_test_opt, "Feature Selection (4 features)"),
]
results = [benchmark_model(*case) for case in benchmark_cases]

In [None]:
# Tabulate the benchmark results: one row per benchmark run
results_df = pd.DataFrame(results)
print("\nComparison of optimization techniques:", results_df, sep="\n")

In [None]:
# Grouped bar chart: model size next to training-memory delta per technique
bar_width = 0.35
index = np.arange(len(results_df))

fig, ax = plt.subplots(figsize=(14, 7))
ax.bar(index, results_df['model_size'], bar_width, label='Model Size (MB)', color='#FF9999')
ax.bar(index + bar_width, results_df['train_memory'], bar_width, label='Training Memory (MB)', color='#66B2FF')

ax.set_xlabel('Optimization Technique')
ax.set_ylabel('Memory Usage (MB)')
ax.set_title('Memory Usage Comparison Across Optimization Techniques')
# Put each tick midway between the two bars of its group
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(results_df['description'])
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Plot performance comparison: training time (left) and AUC (right)
plt.figure(figsize=(14, 7))

plt.subplot(1, 2, 1)
plt.bar(results_df['description'], results_df['train_time'], color='#FF9999')
plt.title('Training Time Comparison')
plt.ylabel('Time (seconds)')
plt.xticks(rotation=45, ha='right')

plt.subplot(1, 2, 2)
plt.bar(results_df['description'], results_df['auc'], color='#66B2FF')
plt.title('Model Performance (AUC) Comparison')
plt.ylabel('AUC Score')
plt.xticks(rotation=45, ha='right')
# Zoom in on the 0.8-1.0 band when every AUC lies in it. Otherwise lower the
# floor to just below the smallest AUC so that no bar falls outside the axis.
auc_floor = min(0.8, max(0.0, float(results_df['auc'].min()) - 0.05))
plt.ylim(auc_floor, 1.0)

plt.tight_layout()
plt.show()

## 6. Optimal Model Selection

Let's analyze the trade-offs between memory usage and model performance to select the optimal approach.

In [None]:
# Efficiency ratios: AUC obtained per MB of model size and per second of training
results_df['memory_efficiency'] = results_df['auc'].div(results_df['model_size'])
results_df['time_efficiency'] = results_df['auc'].div(results_df['train_time'])

# Print one ranking per ratio, best technique first
efficiency_reports = (
    ("Memory-Performance Efficiency (AUC per MB):", 'memory_efficiency'),
    ("\nTime-Performance Efficiency (AUC per second):", 'time_efficiency'),
)
for heading, metric in efficiency_reports:
    print(heading)
    print(results_df[['description', metric]].sort_values(metric, ascending=False))

In [None]:
# Two stacked bar charts, one per efficiency ratio, each bar labelled with its value
plt.figure(figsize=(12, 8))

efficiency_panels = [
    # (results_df column, bar colour, panel title)
    ('memory_efficiency', '#4CAF50', 'Memory Efficiency (AUC per MB)'),
    ('time_efficiency', '#2196F3', 'Time Efficiency (AUC per second)'),
]
for panel_row, (metric, colour, panel_title) in enumerate(efficiency_panels, start=1):
    plt.subplot(2, 1, panel_row)
    bars = plt.bar(results_df['description'], results_df[metric], color=colour)
    plt.title(panel_title)
    plt.ylabel('Efficiency')
    # Write the numeric value just above each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.4f}', ha='center', va='bottom')
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 7. Saving and Loading Optimized Models

Let's retrain the most memory-efficient approach (the one with the highest AUC per MB of model size) and save it for future use.

In [None]:
# Pick the approach with the highest AUC-per-MB ratio
most_efficient_approach = results_df.loc[results_df['memory_efficiency'].idxmax()]
print(f"Most memory-efficient approach: {most_efficient_approach['description']}")

# Choose the training data and status messages that belong to that approach
chosen_description = most_efficient_approach['description']
uses_feature_selection = chosen_description == "Feature Selection (4 features)"
if uses_feature_selection:
    fit_features, fit_target = X_train_fs, y_train_opt
    saving_message = "Saving feature selector and optimized model..."
    saved_message = "Feature selector and model saved successfully."
elif chosen_description == "Optimized Datatypes":
    fit_features, fit_target = X_train_opt, y_train_opt
    saving_message = "Saving optimized model..."
    saved_message = "Model saved successfully."
else:
    # Anything else falls back to the original (non-optimized) data
    fit_features, fit_target = X_train_orig, y_train_orig
    saving_message = "Saving model..."
    saved_message = "Model saved successfully."

# Train the optimal model
optimal_model = RandomForestClassifier(n_estimators=100, random_state=42)
optimal_model.fit(fit_features, fit_target)

# Persist it; the fitted selector is written as well when the model needs it
print(saving_message)
os.makedirs('../models', exist_ok=True)
if uses_feature_selection:
    with open('../models/feature_selector.pkl', 'wb') as f:
        pickle.dump(selector, f)

with open('../models/memory_optimized_model.pkl', 'wb') as f:
    pickle.dump(optimal_model, f)

print(saved_message)

In [None]:
# Test loading the saved model and measure memory usage
# Collect garbage first so the "before" RSS snapshot is not inflated by
# objects that are already unreachable.
gc.collect()
mem_before = psutil.Process().memory_info().rss / (1024 * 1024)

# Load the model
# NOTE: pickle.load can execute arbitrary code; this file is the one written
# by the previous cell. Never unpickle files from an untrusted source.
with open('../models/memory_optimized_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# The difference between two process-wide RSS snapshots (in MB) is only a
# rough figure: it can be zero or negative.
mem_after = psutil.Process().memory_info().rss / (1024 * 1024)
loading_memory = mem_after - mem_before

print(f"Memory used to load the model: {loading_memory:.2f} MB")
print(f"Loaded model size: {get_size(loaded_model) / (1024 * 1024):.2f} MB")

## 8. Conclusion and Best Practices

Let's summarize our findings and recommend best practices for memory optimization in this project.

In [None]:
# Print the closing summary. Sections 2 and 4 interpolate values computed in
# earlier cells (`selected_features`, `most_efficient_approach`), so those
# cells must have been run first.
print("=== Memory Optimization Best Practices ===\n")
print("1. Datatype Optimization:")
print("   - Use the smallest possible datatypes for numeric columns")
print("   - Convert categorical string columns to category dtype")
print("   - Consider using sparse matrices for highly sparse data")

print("\n2. Feature Selection:")
print("   - Identify and remove redundant or low-importance features")
print(f"   - For this dataset, the top features are: {', '.join(selected_features)}")

print("\n3. Model Size Considerations:")
print("   - Certain models like Random Forests can be memory-intensive")
print("   - Consider limiting tree depth or number of estimators if memory is critical")
print("   - Simpler models may offer better memory-performance trade-offs")

print("\n4. Memory-Performance Trade-offs:")
print(f"   - The most memory-efficient approach is {most_efficient_approach['description']}")
print(f"   - This approach offers {most_efficient_approach['memory_efficiency']:.4f} AUC per MB, with an AUC of {most_efficient_approach['auc']:.4f}")

print("\n5. Implementation Recommendations:")
print("   - Implement data type optimization in the preprocessing pipeline")
print("   - Use feature selection when loading data for model training/inference")
print("   - Consider chunking or batch processing for very large datasets")
print("   - Monitor memory usage in production and adjust as needed")