# Machine Learning Model Training and Evaluation

This notebook focuses on training multiple machine learning models for BBB permeability prediction, performing hyperparameter tuning, cross-validation, and comprehensive model comparison.

## Objectives:
- Train multiple ML algorithms (Logistic Regression baseline, Random Forest, SVM, XGBoost, Neural Network)
- Perform hyperparameter tuning using grid search
- Conduct stratified cross-validation
- Compare model performance comprehensively
- Select best performing model
- Analyze feature importance

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import joblib
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path
sys.path.append('../src')

from data_handler import DataHandler
from feature_engineering import FeatureEngineering
from models import ModelTrainer

# Notebook-wide display configuration.

# matplotlib / seaborn: default style sheet, "husl" colour palette, 10x6 figures
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

# pandas: never truncate columns, show floats with 4 digits of precision
pd.options.display.max_columns = None
pd.options.display.precision = 4

print("Libraries imported successfully!")

## 1. Load Dataset with Descriptors

In [None]:
# Read the descriptor table written by the descriptor notebook. If the file is
# missing, print a hint and re-raise so the run stops at this cell.
try:
    df = pd.read_csv('../data/BBBP_with_descriptors.csv')
except FileNotFoundError:
    print("Dataset with descriptors not found. Please run 02_descriptor_summary.ipynb first.")
    raise
else:
    print(f"Dataset with descriptors loaded: {df.shape}")

# Every column that is not listed as metadata is treated as a descriptor
metadata_cols = ['num', 'name', 'smiles', 'p_np', 'mol_object', 'smiles_length']
descriptor_cols = df.columns[~df.columns.isin(metadata_cols)].tolist()

print(f"Descriptor columns: {len(descriptor_cols)}")
print(f"Descriptor names: {descriptor_cols}")

# Class balance of the target column (label 0 = non-permeable, 1 = permeable)
class_dist = df['p_np'].value_counts()
print(f"\nClass distribution:")
for label, caption in ((0, "Non-permeable (0)"), (1, "Permeable (1)")):
    count = class_dist[label]
    print(f"{caption}: {count} ({count/len(df)*100:.1f}%)")

df.head()

## 2. Feature Engineering and Data Preparation

In [None]:
# Project helper (src/feature_engineering.py) used below for imputation,
# correlation-based selection and scaling
feature_eng = FeatureEngineering()

# Work on copies so that df itself is left untouched by later steps
y = df['p_np'].copy()
X = df[descriptor_cols].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Per-column count of missing entries; impute only when at least one is present
missing_values = X.isnull().sum()
has_missing = missing_values.sum() > 0
if not has_missing:
    print("\nNo missing values found")
else:
    print(f"\nMissing values found:")
    print(missing_values[missing_values > 0])

    # Delegate imputation to the helper and keep the returned frame
    X = feature_eng.handle_missing_values(X)
    print("Missing values handled using median imputation")

# Drop one feature of each highly correlated pair (threshold 0.95)
print("\nPerforming feature selection...")
X_selected = feature_eng.select_features(X, correlation_threshold=0.95)
selected_features = X_selected.columns.tolist()
n_removed = len(descriptor_cols) - len(selected_features)

print(f"Features after selection: {len(selected_features)} (removed {n_removed})")
print(f"Selected features: {selected_features}")

# NOTE(review): selection and scaling run on the full data set, before the
# train/test split in the next section. If scale_features fits its scaler here
# (presumably yes - confirm in src/feature_engineering.py), test-set statistics
# leak into training and into the cross-validation folds.
X_scaled = feature_eng.scale_features(X_selected)
print(f"\nFeatures scaled using StandardScaler")
print(f"Final feature matrix shape: {X_scaled.shape}")

## 3. Train-Test Split

In [None]:
# Hold out 20% of the compounds, keeping the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Per-class counts of each split
train_dist = pd.Series(y_train).value_counts()
test_dist = pd.Series(y_test).value_counts()

# Report counts and percentages, training split first, then test split
split_reports = (
    ("Training set distribution", train_dist, y_train),
    ("Test set distribution", test_dist, y_test),
)
for heading, dist, split_labels in split_reports:
    print(f"\n{heading}:")
    print(f"Non-permeable: {dist[0]} ({dist[0]/len(split_labels)*100:.1f}%)")
    print(f"Permeable: {dist[1]} ({dist[1]/len(split_labels)*100:.1f}%)")

## 4. Model Training and Evaluation

In [None]:
# Project helper (src/models.py) that exposes one train_* method per algorithm
model_trainer = ModelTrainer()

# Registries filled by the following cells, keyed by display name:
#   models  -> fitted estimator
#   results -> metrics mapping for that estimator
models, results = {}, {}

print("Training multiple machine learning models...")
print("This may take several minutes...")

# Shared 5-fold stratified splitter; the fixed seed keeps folds identical across models
cv_strategy = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

### 4.1 Logistic Regression (Baseline)

In [None]:
print("\n=== Training Logistic Regression ===")

# Train via the project helper and register the model with its metrics mapping
lr_model, lr_metrics = model_trainer.train_logistic_regression(X_train, y_train, X_test, y_test)
models['Logistic Regression'], results['Logistic Regression'] = lr_model, lr_metrics

# ROC-AUC over the shared stratified folds of the training split
lr_cv_scores = cross_val_score(lr_model, X_train, y_train, scoring='roc_auc', cv=cv_strategy)
lr_cv_mean, lr_cv_std = lr_cv_scores.mean(), lr_cv_scores.std()

# lr_metrics is the same object stored in results, so this updates the registry entry
lr_metrics['cv_auc_mean'] = lr_cv_mean
lr_metrics['cv_auc_std'] = lr_cv_std

print(f"Test AUC: {lr_metrics['auc']:.4f}")
print(f"CV AUC: {lr_cv_mean:.4f} ± {lr_cv_std:.4f}")

### 4.2 Random Forest

In [None]:
print("\n=== Training Random Forest ===")

# Train via the project helper and register the model with its metrics mapping
rf_model, rf_metrics = model_trainer.train_random_forest(X_train, y_train, X_test, y_test)
models['Random Forest'], results['Random Forest'] = rf_model, rf_metrics

# ROC-AUC over the shared stratified folds of the training split
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, scoring='roc_auc', cv=cv_strategy)
rf_cv_mean, rf_cv_std = rf_cv_scores.mean(), rf_cv_scores.std()

# rf_metrics is the same object stored in results, so this updates the registry entry
rf_metrics['cv_auc_mean'] = rf_cv_mean
rf_metrics['cv_auc_std'] = rf_cv_std

print(f"Test AUC: {rf_metrics['auc']:.4f}")
print(f"CV AUC: {rf_cv_mean:.4f} ± {rf_cv_std:.4f}")

# Pair each selected feature with the forest's importance score, most important first
rf_feature_importance = pd.DataFrame(
    {'feature': selected_features, 'importance': rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)

print(f"\nTop 5 important features (Random Forest):")
print(rf_feature_importance.head())

### 4.3 Support Vector Machine

In [None]:
print("\n=== Training Support Vector Machine ===")

# Train via the project helper and register the model with its metrics mapping
svm_model, svm_metrics = model_trainer.train_svm(X_train, y_train, X_test, y_test)
models['SVM'], results['SVM'] = svm_model, svm_metrics

# ROC-AUC over the shared stratified folds of the training split
svm_cv_scores = cross_val_score(svm_model, X_train, y_train, scoring='roc_auc', cv=cv_strategy)
svm_cv_mean, svm_cv_std = svm_cv_scores.mean(), svm_cv_scores.std()

# svm_metrics is the same object stored in results, so this updates the registry entry
svm_metrics['cv_auc_mean'] = svm_cv_mean
svm_metrics['cv_auc_std'] = svm_cv_std

print(f"Test AUC: {svm_metrics['auc']:.4f}")
print(f"CV AUC: {svm_cv_mean:.4f} ± {svm_cv_std:.4f}")

### 4.4 XGBoost

In [None]:
print("\n=== Training XGBoost ===")

# Train via the project helper and register the model with its metrics mapping
xgb_model, xgb_metrics = model_trainer.train_xgboost(X_train, y_train, X_test, y_test)
models['XGBoost'], results['XGBoost'] = xgb_model, xgb_metrics

# ROC-AUC over the shared stratified folds of the training split
xgb_cv_scores = cross_val_score(xgb_model, X_train, y_train, scoring='roc_auc', cv=cv_strategy)
xgb_cv_mean, xgb_cv_std = xgb_cv_scores.mean(), xgb_cv_scores.std()

# xgb_metrics is the same object stored in results, so this updates the registry entry
xgb_metrics['cv_auc_mean'] = xgb_cv_mean
xgb_metrics['cv_auc_std'] = xgb_cv_std

print(f"Test AUC: {xgb_metrics['auc']:.4f}")
print(f"CV AUC: {xgb_cv_mean:.4f} ± {xgb_cv_std:.4f}")

# Pair each selected feature with the booster's importance score, most important first
xgb_feature_importance = pd.DataFrame(
    {'feature': selected_features, 'importance': xgb_model.feature_importances_}
).sort_values(by='importance', ascending=False)

print(f"\nTop 5 important features (XGBoost):")
print(xgb_feature_importance.head())

### 4.5 Neural Network

In [None]:
print("\n=== Training Neural Network ===")

# Train via the project helper and register the model with its metrics mapping
nn_model, nn_metrics = model_trainer.train_neural_network(X_train, y_train, X_test, y_test)
models['Neural Network'], results['Neural Network'] = nn_model, nn_metrics

# ROC-AUC over the shared stratified folds of the training split
nn_cv_scores = cross_val_score(nn_model, X_train, y_train, scoring='roc_auc', cv=cv_strategy)
nn_cv_mean, nn_cv_std = nn_cv_scores.mean(), nn_cv_scores.std()

# nn_metrics is the same object stored in results, so this updates the registry entry
nn_metrics['cv_auc_mean'] = nn_cv_mean
nn_metrics['cv_auc_std'] = nn_cv_std

print(f"Test AUC: {nn_metrics['auc']:.4f}")
print(f"CV AUC: {nn_cv_mean:.4f} ± {nn_cv_std:.4f}")

## 5. Model Comparison and Performance Analysis

In [None]:
# Output column name -> key in each model's metrics mapping (order = column order)
metric_columns = {
    'Test_Accuracy': 'accuracy',
    'Test_Precision': 'precision',
    'Test_Recall': 'recall',
    'Test_F1': 'f1',
    'Test_AUC': 'auc',
    'CV_AUC_Mean': 'cv_auc_mean',
    'CV_AUC_Std': 'cv_auc_std',
}

# One record per trained model, then a table ordered by test AUC (best first)
comparison_data = [
    {'Model': model_name, **{column: metrics[key] for column, key in metric_columns.items()}}
    for model_name, metrics in results.items()
]
comparison_df = pd.DataFrame(comparison_data).sort_values('Test_AUC', ascending=False)

print("=== MODEL PERFORMANCE COMPARISON ===")
print(comparison_df.round(4))

# NOTE(review): the winner is picked by test-set AUC, so the reported "best"
# score is optimistically biased; ranking by CV_AUC_Mean would avoid selecting
# on the held-out data.
best_row = comparison_df.iloc[0]
best_model_name = best_row['Model']
best_model = models[best_model_name]
print(f"\nBest performing model: {best_model_name}")
print(f"Best model AUC: {best_row['Test_AUC']:.4f}")

## 6. Performance Visualization

In [None]:
# 2x2 grid of bar charts, one panel per metric, models on the x axis
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (axis, column to plot, panel title, y label, extra bar() keyword arguments).
# The first panel uses the default colour; the last one adds CV std error bars.
bar_panels = [
    (axes[0, 0], 'Test_AUC', 'Test AUC Comparison', 'AUC Score', {}),
    (axes[0, 1], 'Test_Accuracy', 'Test Accuracy Comparison', 'Accuracy', {'color': 'orange'}),
    (axes[1, 0], 'Test_F1', 'Test F1 Score Comparison', 'F1 Score', {'color': 'green'}),
    (axes[1, 1], 'CV_AUC_Mean', 'Cross-Validation AUC', 'CV AUC (Mean ± Std)',
     {'color': 'red', 'yerr': comparison_df['CV_AUC_Std'], 'capsize': 5}),
]

for ax, column, title, ylabel, bar_kwargs in bar_panels:
    ax.bar(comparison_df['Model'], comparison_df[column], alpha=0.7, **bar_kwargs)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)  # model names are long; tilt them
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. ROC Curves Comparison

In [None]:
# Plot the test-set ROC curve of every registered model on one set of axes
fig, ax = plt.subplots(figsize=(10, 8))

colors = ['blue', 'red', 'green', 'orange', 'purple']
for i, (model_name, model) in enumerate(models.items()):
    # Prefer positive-class probabilities; estimators without predict_proba
    # fall back to their decision scores, which rank samples equally well for ROC
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = model.decision_function(X_test)

    # ROC points and the area under the curve for the legend
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    # Wrap around the palette so registering more models than colours
    # reuses colours instead of raising IndexError
    ax.plot(fpr, tpr, color=colors[i % len(colors)], lw=2,
            label=f'{model_name} (AUC = {auc_score:.3f})')

# Chance-level reference line
ax.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--', alpha=0.5)

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.show()

## 8. Confusion Matrices

In [None]:
# Plot one confusion-matrix heatmap per registered model, three per row.
# The grid is sized from the number of models so that none is silently dropped;
# with the five models trained above this is the same 2x3, 18x12 figure as before.
n_models = len(models)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.ravel()

class_names = ['Non-permeable', 'Permeable']
for ax, (model_name, model) in zip(axes, models.items()):
    # Hard class predictions on the held-out split
    y_pred = model.predict(X_test)

    # Rows are actual classes, columns are predicted classes
    cm = confusion_matrix(y_test, y_pred)

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_names,
                yticklabels=class_names)
    ax.set_title(f'{model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Hide every panel that did not receive a model (not only the last one)
for ax in axes[n_models:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 9. Feature Importance Analysis

In [None]:
# Side-by-side comparison of the importances reported by the two tree-based models.
# Runs only when both were registered; it reads the rf_/xgb_feature_importance
# tables built in their training cells.
if 'Random Forest' in models and 'XGBoost' in models:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    rf_top_features = rf_feature_importance.head(10)
    xgb_top_features = xgb_feature_importance.head(10)

    # (axis, table to draw, panel title, extra barh() keyword arguments)
    importance_panels = (
        (ax1, rf_top_features, 'Random Forest - Top 10 Features', {}),
        (ax2, xgb_top_features, 'XGBoost - Top 10 Features', {'color': 'orange'}),
    )
    for ax, top_table, title, bar_kwargs in importance_panels:
        positions = range(len(top_table))
        ax.barh(positions, top_table['importance'], alpha=0.7, **bar_kwargs)
        ax.set_yticks(positions)
        ax.set_yticklabels(top_table['feature'])
        ax.set_xlabel('Feature Importance')
        ax.set_title(title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Textual view of the five strongest features per model
    print("\n=== FEATURE IMPORTANCE COMPARISON ===")
    print("\nTop 5 features - Random Forest:")
    print(rf_feature_importance.head())
    print("\nTop 5 features - XGBoost:")
    print(xgb_feature_importance.head())

    # Features that appear in both top-5 lists
    rf_top5 = set(rf_feature_importance.head()['feature'])
    xgb_top5 = set(xgb_feature_importance.head()['feature'])
    common_features = rf_top5 & xgb_top5

    print(f"\nCommon top features: {common_features}")

## 10. Model Persistence

In [None]:
# Persist the winning model together with what is needed to reuse it:
# the scaler held by feature_eng, the selected feature names and the comparison table.
print(f"Saving best model: {best_model_name}")

# Output directory (created on demand) and the artifact paths inside it
models_dir = '../results/models'
os.makedirs(models_dir, exist_ok=True)

model_slug = best_model_name.lower().replace(" ", "_")
best_model_path = f'{models_dir}/best_model_{model_slug}.pkl'
scaler_path = f'{models_dir}/feature_scaler.pkl'
features_path = f'{models_dir}/selected_features.txt'
comparison_path = f'{models_dir}/model_comparison.csv'

# joblib writes pickle-based files; only load them back from trusted locations
joblib.dump(best_model, best_model_path)
joblib.dump(feature_eng.scaler, scaler_path)

# One feature name per line, in the column order used for training
with open(features_path, 'w') as f:
    f.writelines(f"{feature}\n" for feature in selected_features)

comparison_df.to_csv(comparison_path, index=False)

print("Model artifacts saved successfully!")
print(f"- Best model: {best_model_path}")
print(f"- Feature scaler: {scaler_path}")
print(f"- Selected features: {features_path}")
print(f"- Model comparison: {comparison_path}")

## 11. Model Training Summary

In [None]:
# Final text report: data set sizes, model ranking, details of the best model
# and simple threshold-based recommendations.
print("=== MODEL TRAINING SUMMARY ===")
print(f"\nDataset: {len(df)} compounds")
print(f"Features: {len(selected_features)} (after selection from {len(descriptor_cols)})")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

print(f"\n=== MODEL PERFORMANCE RANKING ===")
# comparison_df is sorted by Test_AUC but keeps its original index labels, so the
# rank must come from the iteration position, not from the index label.
for rank, (_, row) in enumerate(comparison_df.iterrows(), start=1):
    print(f"{rank}. {row['Model']:15s} - AUC: {row['Test_AUC']:.4f} (CV: {row['CV_AUC_Mean']:.4f}±{row['CV_AUC_Std']:.4f})")

print(f"\n=== BEST MODEL: {best_model_name} ===")
best_results = results[best_model_name]
print(f"Test Accuracy: {best_results['accuracy']:.4f}")
print(f"Test Precision: {best_results['precision']:.4f}")
print(f"Test Recall: {best_results['recall']:.4f}")
print(f"Test F1-Score: {best_results['f1']:.4f}")
print(f"Test AUC: {best_results['auc']:.4f}")
print(f"CV AUC: {best_results['cv_auc_mean']:.4f} ± {best_results['cv_auc_std']:.4f}")

# Only the tree-based models have an importance table built earlier in the notebook
if best_model_name in ['Random Forest', 'XGBoost']:
    if best_model_name == 'Random Forest':
        top_features = rf_feature_importance.head(3)
    else:
        top_features = xgb_feature_importance.head(3)

    print(f"\nTop 3 most important features:")
    for _, row in top_features.iterrows():
        print(f"- {row['feature']}: {row['importance']:.4f}")

print("\n=== RECOMMENDATIONS ===")
# Qualitative rating of the best model's test AUC
if best_results['auc'] > 0.90:
    print("- Excellent model performance (AUC > 0.90)")
elif best_results['auc'] > 0.80:
    print("- Good model performance (AUC > 0.80)")
else:
    print("- Model performance could be improved")
    print("- Consider feature engineering or ensemble methods")

# Spread of the fold scores; below 0.05 is reported as stable
cv_stability = best_results['cv_auc_std']
if cv_stability < 0.05:
    print("- Model shows good stability across CV folds")
else:
    print("- Model shows some variability across CV folds")
    print("- Consider regularization or more data")

print("\n- Proceed to notebook 04_interpretation.ipynb for model interpretability analysis")
print("- Use saved model for making predictions on new compounds")