# Memory Optimization for Pediatric Appendicitis Model

This notebook focuses on analyzing and optimizing memory usage in the pediatric appendicitis diagnosis model. We'll explore various techniques to improve memory efficiency while maintaining model performance.

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from memory_profiler import profile, memory_usage
import psutil
import gc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Add project root to path
sys.path.append('..')

# Import project modules
from src.data_processing.preprocess import load_data, handle_missing_values, optimize_memory

# Configure the shared look of every figure in the notebook: seaborn grid
# style, viridis colour cycle, and default figure/label sizes.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

## 1. Memory Profiling Functions

Let's define some functions to help us measure and profile memory usage throughout our analysis.

In [None]:
def get_size(obj, seen=None):
    """Return the approximate deep size of ``obj`` in bytes.

    The result is ``sys.getsizeof(obj)`` plus the sizes of everything reached
    from it: the keys and values of a dict, the ``__dict__`` of an instance,
    or the items of any other iterable (``str``, ``bytes`` and ``bytearray``
    are not descended into).

    Args:
        obj: Object to measure.
        seen: Set of ``id()`` values already counted. Callers normally leave
            this as ``None``; it is threaded through the recursion so shared
            or self-referencing objects are counted only once.

    Returns:
        Size in bytes, or 0 when ``obj`` has already been counted.
    """
    total = sys.getsizeof(obj)
    visited = set() if seen is None else seen

    marker = id(obj)
    if marker in visited:
        # Already accounted for elsewhere in this traversal.
        return 0
    visited.add(marker)

    # Work out which objects hang off this one, then add them in one place.
    if isinstance(obj, dict):
        children = [*obj.values(), *obj.keys()]
    elif hasattr(obj, '__dict__'):
        children = [obj.__dict__]
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        children = obj
    else:
        children = ()

    for child in children:
        total += get_size(child, visited)
    return total

def memory_usage_dataframe(df):
    """Return the total memory footprint of ``df`` in MB (1 MB = 1024 * 1024 bytes).

    Uses ``DataFrame.memory_usage(deep=True)`` so the contents of object
    columns are measured too, and sums the per-column (and index) byte counts.
    """
    bytes_per_mb = 1024 * 1024
    return df.memory_usage(deep=True).sum() / bytes_per_mb

def profile_function(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and report its memory and time cost.

    Memory is the change in this process's resident set size (RSS) across the
    call, so it can be negative if memory is released during the call.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Tuple ``(result, memory_used, execution_time)`` where ``result`` is
        whatever ``func`` returned, ``memory_used`` is the RSS delta in MB and
        ``execution_time`` is the elapsed time in seconds.
    """
    # Collect garbage first so leftovers from earlier work do not distort
    # the baseline reading.
    gc.collect()

    process = psutil.Process()
    mem_before = process.memory_info().rss / (1024 * 1024)

    # perf_counter() is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    execution_time = time.perf_counter() - start_time

    mem_after = process.memory_info().rss / (1024 * 1024)
    memory_used = mem_after - mem_before

    # functools.partial objects and callable instances have no __name__.
    func_name = getattr(func, '__name__', None) or repr(func)

    print(f"Function: {func_name}")
    print(f"Memory Usage: {memory_used:.2f} MB")
    print(f"Execution Time: {execution_time:.2f} seconds")

    return result, memory_used, execution_time

## 2. Data Loading and Profiling Memory Usage

Let's load our dataset and analyze its initial memory usage.

In [None]:
# Read the dataset from disk and report its dimensions
data_path = '../DATA/synthetic_appendicitis_data.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")

# Total footprint of the freshly loaded frame, before any optimization
initial_memory = memory_usage_dataframe(df)
print(f"Initial memory usage: {initial_memory:.2f} MB")

# Per-column breakdown (deep=True so object columns count their contents)
print("\nMemory usage by column:")
memory_by_column = df.memory_usage(deep=True) / (1024 * 1024)  # bytes -> MB
for col, col_mb in memory_by_column.items():
    print(f"{col}: {col_mb:.2f} MB")

In [None]:
# Show the dtype pandas inferred for every column
print("\nData types:")
print(df.dtypes)

# Display memory usage by data type
print("\nMemory usage by data type:")
# Sum the per-column byte counts by dtype name. A function passed to
# DataFrame.groupby is applied to the row labels, not the column names, so
# the grouping keys are built from df.dtypes instead. index=False keeps the
# byte counts and the dtype list the same length (one entry per column).
memory_by_dtype = (
    df.memory_usage(deep=True, index=False)
    .groupby(df.dtypes.astype(str).values)
    .sum()
    / (1024 * 1024)  # bytes -> MB
)
print(memory_by_dtype)

## 3. Basic Memory Optimization

Let's implement some basic memory optimization techniques: downcasting numeric data types and converting low-cardinality string columns to categoricals.

In [None]:
def _smallest_int_dtype(col_min, col_max):
    """Return the narrowest NumPy integer type (up to 32 bits) that holds
    ``[col_min, col_max]``, or ``None`` when no such type fits."""
    if col_min >= 0:
        candidates = (np.uint8, np.uint16, np.uint32)
    else:
        candidates = (np.int8, np.int16, np.int32)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= col_min and col_max <= limits.max:
            return candidate
    return None


def optimize_datatypes(df, verbose=True):
    """Return a copy of ``df`` with narrower dtypes plus the memory statistics.

    Three passes are applied to the copy; ``df`` itself is not modified:

    * integer columns are cast to the narrowest 8/16/32-bit type that holds
      their observed min/max (unsigned when the minimum is non-negative);
    * float columns go through ``pd.to_numeric(..., downcast='float')``;
    * object columns whose share of distinct values is below 50% are
      converted to ``category``.

    Args:
        df: DataFrame to optimize.
        verbose: When true, print the before/after memory summary.

    Returns:
        Tuple ``(result, original_memory, optimized_memory, savings,
        savings_percent)``; memory figures are in MB and ``savings_percent``
        is relative to ``original_memory``.
    """
    result = df.copy()
    original_memory = memory_usage_dataframe(result)

    # Optimize integers. np.integer matches every integer width, whereas the
    # string 'int' resolves to a single platform-dependent width.
    for col in result.select_dtypes(include=[np.integer]).columns:
        column = result[col]
        # min()/max() are undefined for empty columns, and missing values
        # (nullable integer dtypes) cannot be stored in a NumPy integer type.
        if column.empty or column.isna().any():
            continue
        target = _smallest_int_dtype(int(column.min()), int(column.max()))
        # Only cast when it actually shrinks the column.
        if target is not None and np.dtype(target).itemsize < column.dtype.itemsize:
            result[col] = column.astype(target)

    # Optimize floats
    for col in result.select_dtypes(include=['float']).columns:
        result[col] = pd.to_numeric(result[col], downcast='float')

    # Optimize objects (strings)
    categorical_threshold = 0.5  # Convert when fewer than 50% of values are distinct
    total_count = len(result)
    if total_count:  # the distinct-value ratio is undefined for a zero-row frame
        for col in result.select_dtypes(include=['object']).columns:
            # dropna=False counts missing values as a distinct value too.
            unique_count = result[col].nunique(dropna=False)
            if unique_count / total_count < categorical_threshold:
                result[col] = result[col].astype('category')

    # Calculate memory savings
    optimized_memory = memory_usage_dataframe(result)
    savings = original_memory - optimized_memory
    # Guard against a zero-byte original frame.
    savings_percent = (savings / original_memory) * 100 if original_memory else 0.0

    if verbose:
        print(f"Original memory usage: {original_memory:.2f} MB")
        print(f"Optimized memory usage: {optimized_memory:.2f} MB")
        print(f"Memory savings: {savings:.2f} MB ({savings_percent:.1f}%)")

    return result, original_memory, optimized_memory, savings, savings_percent

In [None]:
# Run the dtype optimizer and keep both the optimized frame and its statistics
df_optimized, orig_mem, opt_mem, savings, savings_pct = optimize_datatypes(df)

# Print the dtypes of the original and the optimized frame one after the other
for _heading, _frame in (
    ("\nOriginal data types:", df),
    ("\nOptimized data types:", df_optimized),
):
    print(_heading)
    print(_frame.dtypes)

In [None]:
# Plot memory usage comparison: one bar for the original frame and one for
# the optimized frame, using the MB figures returned by optimize_datatypes.
plt.figure(figsize=(10, 6))
memory_data = [orig_mem, opt_mem]
labels = ['Original', 'Optimized']
colors = ['#FF9999', '#66B2FF']

bars = plt.bar(labels, memory_data, color=colors)
plt.title('Memory Usage Comparison')
plt.ylabel('Memory Usage (MB)')

# Add data labels: write each bar's value, centred horizontally just above it
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.2f} MB', ha='center', va='bottom', fontweight='bold')

# Add savings annotation: the arrow points at (1, opt_mem) and the text sits
# to the right, halfway between the two memory values.
# NOTE(review): x=1 is presumably the position of the 'Optimized' bar - confirm.
plt.annotate(f'Savings: {savings:.2f} MB ({savings_pct:.1f}%)',
             xy=(1, opt_mem), xytext=(1.1, opt_mem + (orig_mem - opt_mem)/2),
             arrowprops=dict(facecolor='black', shrink=0.05, width=1.5, headwidth=8),
             fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Enhanced Memory Optimization

Let's implement additional techniques for memory optimization, focusing on feature selection and dimensionality reduction.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# Build the feature matrix and target vector from the optimized frame
X = df_optimized.loc[:, ['Age', 'Temperature', 'WBC', 'CRP', 'Pain_Duration', 'Neutrophil_Percent']]
y = df_optimized.loc[:, 'Appendicitis']

# 70/30 train/test split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Function to apply feature selection
def apply_feature_selection(X_train, X_test, y_train, k=4):
    """Keep the ``k`` highest-scoring features according to ``SelectKBest(f_classif)``.

    The selector is fitted on the training split only; the same columns are
    then taken from both splits.

    Args:
        X_train: Training features as a DataFrame (its ``columns`` are used).
        X_test: Test features as a DataFrame with the same columns.
        y_train: Training target passed to the selector.
        k: Number of features to keep, forwarded to ``SelectKBest``.

    Returns:
        Tuple ``(X_train_df, X_test_df, selected_features)``: the two reduced
        DataFrames and the names of the selected columns.
    """
    selector = SelectKBest(f_classif, k=k)
    selector.fit(X_train, y_train)

    # Get the names of selected features
    selected_indices = selector.get_support(indices=True)
    selected_features = X_train.columns[selected_indices]

    # Slice the input frames instead of rebuilding them from the NumPy array
    # returned by transform(): that array carries no index and one common
    # dtype, so the result would lose row alignment with y_train and undo
    # any per-column downcasting.
    X_train_df = X_train.loc[:, selected_features].copy()
    X_test_df = X_test.loc[:, selected_features].copy()

    print(f"Selected top {k} features: {', '.join(selected_features)}")
    return X_train_df, X_test_df, selected_features

# Reduce both splits to the four best-scoring features
X_train_selected, X_test_selected, selected_features = apply_feature_selection(
    X_train, X_test, y_train, k=4
)

# Measure the training matrix before and after selection (MB)
original_X_train_memory = memory_usage_dataframe(pd.DataFrame(X_train))
selected_X_train_memory = memory_usage_dataframe(X_train_selected)

# Absolute and relative savings
feature_selection_savings = original_X_train_memory - selected_X_train_memory
feature_selection_savings_pct = (feature_selection_savings / original_X_train_memory) * 100

# Emit the three-line summary in one call
print("\n".join([
    f"\nMemory usage with all features: {original_X_train_memory:.4f} MB",
    f"Memory usage with selected features: {selected_X_train_memory:.4f} MB",
    f"Memory savings: {feature_selection_savings:.4f} MB ({feature_selection_savings_pct:.1f}%)",
]))