# Model Evaluation and Hyperparameter Tuning for Pediatric Appendicitis Diagnosis (Part 1)

This notebook provides a comprehensive evaluation of different machine learning models for pediatric appendicitis diagnosis, focusing on:
- Model selection and comparison
- Performance evaluation with various metrics
- Hyperparameter tuning
- Cross-validation

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, roc_curve, precision_recall_curve, average_precision_score,
    classification_report
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import catboost as cb
import joblib
import warnings

# Silence every warning category so cell output stays compact
warnings.filterwarnings('ignore')

# Put the parent directory on the import path so the `src` package resolves
sys.path.append('..')

# Project-local preprocessing helpers
from src.data_processing.preprocess import load_data, handle_missing_values, optimize_memory

# Global plotting defaults: style sheet, colour palette, then figure and font sizes
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

## 1. Data Loading and Preparation

Let's start by loading and preparing our dataset for model training and evaluation.

In [None]:
print("Loading and preparing data...")

# Read the synthetic dataset from disk
data_path = '../DATA/synthetic_appendicitis_data.csv'
df = pd.read_csv(data_path)

# Select the six predictor columns and the target column
X = df.loc[:, ['Age', 'Temperature', 'WBC', 'CRP', 'Pain_Duration', 'Neutrophil_Percent']]
y = df.loc[:, 'Appendicitis']

# Hold out 30% of the rows for testing; stratify so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report split sizes and per-class counts
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
print(f"Class distribution in training set: {np.bincount(y_train)}")
print(f"Class distribution in test set: {np.bincount(y_test)}")

## 2. Model Selection and Initial Evaluation

Let's evaluate various classification models to determine which ones perform best for our appendicitis diagnosis task.

In [None]:
# Candidate classifiers keyed by display name; every estimator is seeded with 42
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['SVM'] = SVC(probability=True, random_state=42)  # probability=True enables predict_proba
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['LightGBM'] = lgb.LGBMClassifier(random_state=42)
models['CatBoost'] = cb.CatBoostClassifier(random_state=42, verbose=0)

# Function to evaluate and compare models
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit each model, score it on the test split and cross-validate it on the training split.

    Parameters
    ----------
    models : dict
        Mapping of display name -> estimator exposing ``fit``, ``predict`` and
        ``predict_proba``. Estimators are fitted in place, so the objects in
        the caller's dict are trained once this function returns.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the training and test splits.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy``,
        ``Precision``, ``Recall``, ``F1``, ``AUC``, ``CV AUC Mean``,
        ``CV AUC Std`` and ``Training Time`` (seconds spent in ``fit`` only).
    """
    results = []

    for name, model in models.items():
        print(f"\nEvaluating {name}...")

        # Time the fit call only. The timer used to be stopped after prediction
        # and 5-fold cross-validation, so "Training Time" mostly measured the
        # cross-validation refits rather than training.
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time

        # Hard predictions plus the score in column 1 of predict_proba
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # Test-split metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)

        # 5-fold cross-validated ROC AUC on the training split.
        # NOTE(review): if X_train was scaled before being passed in, the scaling
        # statistics were computed over all folds; wrap scaler + model in a
        # Pipeline if a strictly leak-free CV estimate is required.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
        cv_mean = np.mean(cv_scores)
        cv_std = np.std(cv_scores)

        results.append({
            'Model': name,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'AUC': auc,
            'CV AUC Mean': cv_mean,
            'CV AUC Std': cv_std,
            'Training Time': train_time
        })

        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  AUC: {auc:.4f}")
        print(f"  CV AUC: {cv_mean:.4f} ± {cv_std:.4f}")
        print(f"  Training Time: {train_time:.4f} seconds")

    return pd.DataFrame(results)

In [None]:
# Run the comparison on the standardized features and print the summary table
results_df = evaluate_models(
    models=models,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print("\nModel Comparison Results:", results_df, sep="\n")

In [None]:
# Grouped bar chart: one group per model, one bar per test-split metric
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC']

fig, ax = plt.subplots(figsize=(14, 8))
results_df.set_index('Model')[metrics].plot(kind='bar', ax=ax)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
plt.xticks(rotation=45)
ax.legend(title='Metric')
fig.tight_layout()

# savefig does not create missing directories; make sure the target exists first
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/model_performance_comparison.png')
plt.show()

In [None]:
# Bar chart of the "Training Time" column, one bar per model
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    results_df['Model'],
    results_df['Training Time'],
    color=sns.color_palette('viridis', len(results_df)),
)
ax.set_title('Model Training Time Comparison')
ax.set_ylabel('Training Time (seconds)')
plt.xticks(rotation=45)
fig.tight_layout()

# savefig does not create missing directories; make sure the target exists first
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/model_training_time_comparison.png')
plt.show()

## 3. Detailed Evaluation of the Best Model

Let's analyze the best-performing model (ranked by AUC) in more detail.

In [None]:
# Identify the best model by cross-validated AUC on the training data.
# Selecting on the test-set AUC and then reporting that same test-set score
# gives an optimistically biased estimate, so the test split is kept for
# evaluation only.
best_model_idx = results_df['CV AUC Mean'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'Model']
best_model = models[best_model_name]  # already fitted in place by evaluate_models

print(
    f"\nBest model: {best_model_name} "
    f"with CV AUC {results_df.loc[best_model_idx, 'CV AUC Mean']:.4f} "
    f"(test AUC {results_df.loc[best_model_idx, 'AUC']:.4f})"
)

# Generate detailed evaluation for the best model on the held-out test split
y_pred = best_model.predict(X_test_scaled)
y_prob = best_model.predict_proba(X_test_scaled)[:, 1]

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the best model's test-split predictions, drawn as a heatmap
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Negative', 'Positive'],
    yticklabels=['Negative', 'Positive'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()

# savefig does not create missing directories; make sure the target exists first
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/confusion_matrix_best_model.png')
plt.show()

In [None]:
# ROC curve of the best model on the test split, with the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, lw=2, label=f'{best_model_name} (AUC = {roc_auc_score(y_test, y_prob):.4f})')
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(True)

# savefig does not create missing directories; make sure the target exists first
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/roc_curve_best_model.png')
plt.show()

In [None]:
# Precision-recall curve of the best model on the test split, labelled with average precision
precision, recall, _ = precision_recall_curve(y_test, y_prob)
average_precision = average_precision_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(recall, precision, lw=2, label=f'{best_model_name} (AP = {average_precision:.4f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc='upper right')
ax.grid(True)

# savefig does not create missing directories; make sure the target exists first
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/precision_recall_curve_best_model.png')
plt.show()

## 4. Hyperparameter Tuning with Grid Search

Now let's perform hyperparameter tuning on the best model to optimize its performance.

In [None]:
print("\nPerforming hyperparameter tuning for the best model...")

# Search spaces keyed by model name. Where a parameter is only valid (or only
# has an effect) together with certain other settings, the space is a list of
# sub-grids so GridSearchCV never fits an invalid or redundant combination.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grids = {
    'Logistic Regression': [
        # Each penalty is paired only with the solvers that support it
        {'penalty': ['l2'], 'C': c_values,
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
        {'penalty': ['l1'], 'C': c_values,
         'solver': ['liblinear', 'saga']},
        # Elastic net is saga-only and requires an explicit l1_ratio
        {'penalty': ['elasticnet'], 'C': c_values,
         'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75]},
        # Unpenalized fit: C has no effect, so it is not searched. `None`
        # replaces the removed string 'none' (needs scikit-learn >= 1.2).
        {'penalty': [None],
         'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
    ],
    'SVM': [
        # The linear kernel ignores gamma, so gamma is searched only for the others
        {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
        {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10, 100],
         'gamma': ['scale', 'auto', 0.1, 0.01, 0.001]},
    ],
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'subsample': [0.8, 0.9, 1.0]
    },
    'LightGBM': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7, -1],
        'num_leaves': [31, 50, 100],
        'subsample': [0.8, 0.9, 1.0],
        # LightGBM only applies `subsample` when bagging is enabled via a
        # non-zero subsample_freq; with the default of 0 the three subsample
        # values would yield identical models.
        'subsample_freq': [1]
    },
    'CatBoost': {
        'iterations': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [4, 6, 8],
        'l2_leaf_reg': [1, 3, 5, 10],
        'border_count': [32, 64, 128]
    },
}

# Fail with a clear message instead of a NameError further down
if best_model_name not in param_grids:
    raise ValueError(f"No hyperparameter grid defined for model '{best_model_name}'")
param_grid = param_grids[best_model_name]

# Setup grid search with 5-fold cross-validation, optimizing ROC AUC
grid_search = GridSearchCV(
    best_model,
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

# Perform grid search on the training split only and time it
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
tuning_time = time.time() - start_time

print(f"\nHyperparameter tuning completed in {tuning_time:.2f} seconds")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

In [None]:
# Score the best estimator found by the grid search on the held-out test split
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test_scaled)
y_prob_tuned = tuned_model.predict_proba(X_test_scaled)[:, 1]

# Threshold-based metrics use the hard predictions; AUC uses the column-1 probabilities
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)
f1_tuned = f1_score(y_test, y_pred_tuned)
auc_tuned = roc_auc_score(y_test, y_prob_tuned)

# Emit the summary in a single call, one metric per line
print(
    "\nTuned Model Performance:",
    f"Accuracy: {accuracy_tuned:.4f}",
    f"Precision: {precision_tuned:.4f}",
    f"Recall: {recall_tuned:.4f}",
    f"F1 Score: {f1_tuned:.4f}",
    f"AUC: {auc_tuned:.4f}",
    sep="\n",
)

In [None]:
# Compare original vs tuned model on the same five test-split metrics
metrics_labels = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC']

# Pull the untuned scores from the best model's row of the results table
original_metrics = results_df.loc[best_model_idx, metrics_labels].tolist()
tuned_metrics = [accuracy_tuned, precision_tuned, recall_tuned, f1_tuned, auc_tuned]

# Side-by-side bars: original to the left of each tick, tuned to the right
x = np.arange(len(metrics_labels))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(x - width/2, original_metrics, width, label='Original Model')
ax.bar(x + width/2, tuned_metrics, width, label='Tuned Model')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title(f'Performance Comparison: Original vs Tuned {best_model_name}')
ax.set_xticks(x)
ax.set_xticklabels(metrics_labels)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()

# savefig does not create missing directories; make sure the target exists first
os.makedirs('../figures', exist_ok=True)
fig.savefig('../figures/tuned_model_comparison.png')
plt.show()

## 5. Save the Tuned Model

Let's save the best tuned model for future use.

In [None]:
# Output locations: the model file name is derived from the winning model's name
# (lower-cased, spaces replaced by underscores); the scaler goes alongside it
model_path = f'../models/tuned_{best_model_name.lower().replace(" ", "_")}_model.pkl'
scaler_path = '../models/standard_scaler.pkl'

# Create the target directory on first use
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the tuned estimator
joblib.dump(tuned_model, model_path)
print(f"\nTuned model saved to {model_path}")

# Persist the fitted scaler as well so new data can be transformed the same way
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to {scaler_path}")

print("\nPart 1 of model evaluation completed. See Part 2 for advanced evaluation techniques.")