# NeuroSmriti - Alzheimer's Detection Model Training

This notebook trains machine learning models on the 420K+ synthetic dataset to predict Alzheimer's disease stages.

## Models:
1. Random Forest Classifier
2. Gradient Boosting (scikit-learn `GradientBoostingClassifier`, plus XGBoost when it is installed)
3. Neural Network (MLP)
4. Ensemble Voting Classifier

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn xgboost matplotlib seaborn joblib tqdm

In [None]:
import json
import os
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# XGBoost is optional: record whether the import worked in HAS_XGB so that
# later cells can branch on it.
try:
    import xgboost as xgb
except ImportError:
    HAS_XGB = False
    print("XGBoost not available, using GradientBoosting")
else:
    HAS_XGB = True
    print("XGBoost available!")

print("Libraries loaded successfully!")

## 1. Load Dataset

In [None]:
# Load the dataset
data_path = '../data/alzheimers_420k_dataset.csv'

# Fail fast: every later cell needs `df`, so a missing file must stop the run
# here with a clear message instead of surfacing as a NameError at df.head().
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Dataset not found at {data_path}. "
        "Please run notebook 01_generate_dataset.ipynb first."
    )

df = pd.read_csv(data_path)
print(f"Loaded {len(df):,} records from {data_path}")

df.head()

In [None]:
# Structural overview: the frame's dimensions, then the dtype of every column.
print(
    f"Dataset shape: {df.shape}",
    "\nColumn types:",
    df.dtypes,
    sep="\n",
)

## 2. Prepare Features

In [None]:
# Define feature columns
feature_cols = [
    'age',
    'education_years',
    'mmse_total',
    'moca_total',
    'hippocampus_volume',
    'entorhinal_volume',
    'total_brain_volume',
    'csf_abeta42',
    'csf_total_tau',
    'csf_ptau181',
    'amyloid_pet_suvr',
    'tau_pet_suvr',
]

# Add binary features
binary_cols = [
    'has_apoe4',
    'family_history_ad',
    'amyloid_positive',
    'tau_positive',
    'hypertension',
    'diabetes',
    'depression',
]

# Cast each binary column that is present in df to int; absent ones are skipped.
for col in binary_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].astype(int)

# Gender becomes a 0/1 indicator that is 1 only where gender equals 'Female'.
df['gender_female'] = df['gender'].eq('Female').astype(int)

# Candidate features in a fixed order, reduced to the columns df really has.
all_features = [*feature_cols, *binary_cols, 'gender_female']
available_features = [feature for feature in all_features if feature in df.columns]

print(f"Using {len(available_features)} features:", available_features, sep="\n")

In [None]:
# Prepare X and y
# Missing feature values are replaced with 0 before the frame is turned into an array.
feature_frame = df[available_features].fillna(0)
X = feature_frame.values
y = df['diagnosis_stage'].values

# Encode labels as consecutive integers; label_encoder.classes_ maps them back.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Classes: {label_encoder.classes_}")
print(f"Encoded: {np.unique(y_encoded)}")
print(f"\nClass distribution:")

# One count per encoded class id, in the same order as label_encoder.classes_.
class_counts = np.bincount(y_encoded, minlength=len(label_encoder.classes_))
for cls, count in zip(label_encoder.classes_, class_counts):
    print(f"  {cls}: {count:,} ({count/len(y)*100:.1f}%)")

In [None]:
# Scale features
# Standardize every column of X using the mean and standard deviation of all rows in X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Sanity check on the first five columns only.
column_means = X_scaled.mean(axis=0)
column_stds = X_scaled.std(axis=0)

print(f"Feature matrix shape: {X_scaled.shape}")
print(f"Feature means (should be ~0): {column_means[:5]}")
print(f"Feature stds (should be ~1): {column_stds[:5]}")

In [None]:
# Split data
# Split the *unscaled* matrix: the scaling cell above fit its scaler on every
# row, so splitting X_scaled would leak test-set statistics into training.
# Same test_size / random_state / stratify as before, so row assignment to
# train and test is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Refit the scaler on training rows only and rebind `scaler`, so the
# preprocessor saved later is exactly the one the models were trained with.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

## 3. Train Models

In [None]:
# Define models
# Built entry by entry; insertion order is the order in which the training
# loop iterates over the models.
models = {}

models['Random Forest'] = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    n_jobs=-1,
    random_state=42,
)

models['Gradient Boosting'] = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=8,
    learning_rate=0.1,
    random_state=42,
)

models['Neural Network'] = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    max_iter=500,
    early_stopping=True,
    random_state=42,
)

# XGBoost is registered only when its import succeeded.
if HAS_XGB:
    xgb_params = dict(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
    )
    models['XGBoost'] = xgb.XGBClassifier(**xgb_params)

print(f"Training {len(models)} models...")

In [None]:
# Train and evaluate each model
# results maps model name -> dict with the fitted model, its test-set metrics,
# the fit duration in seconds, and the test-set predictions.
results = {}

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")
    print('='*50)

    # Time the fit alone so 'training_time' is not inflated by prediction
    # and metric computation.
    start_time = datetime.now()
    model.fit(X_train, y_train)
    training_time = (datetime.now() - start_time).total_seconds()

    # Predict
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

    # Calculate metrics (weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # ROC AUC needs class probabilities; it stays None when they are missing
    # or when the metric rejects the inputs.
    roc_auc = None
    if y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
        except ValueError as exc:
            # Only the metric's own input-validation error is tolerated;
            # anything else is a real bug and should surface.
            print(f"  ROC AUC unavailable: {exc}")

    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'training_time': training_time,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    # Explicit None check so a legitimate score of 0.0 would still be printed.
    if roc_auc is not None:
        print(f"  ROC AUC:   {roc_auc:.4f}")
    print(f"  Time:      {training_time:.2f}s")

In [None]:
# Create Ensemble Model
print("\n" + "="*50)
print("Training Ensemble Model...")
print("="*50)

# Soft voting averages the members' predicted class probabilities.
# NOTE: VotingClassifier.fit() refits a clone of every member, so this cell
# repeats the training cost of all individual models.
estimators = [(name, results[name]['model']) for name in models.keys()]
ensemble = VotingClassifier(estimators=estimators, voting='soft')

# Time the fit alone so 'training_time' reflects training only.
start_time = datetime.now()
ensemble.fit(X_train, y_train)
training_time = (datetime.now() - start_time).total_seconds()

y_pred_ensemble = ensemble.predict(X_test)
y_pred_proba_ensemble = ensemble.predict_proba(X_test)

accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
precision_ensemble = precision_score(y_test, y_pred_ensemble, average='weighted')
recall_ensemble = recall_score(y_test, y_pred_ensemble, average='weighted')
f1_ensemble = f1_score(y_test, y_pred_ensemble, average='weighted')

# ROC AUC stays None when the metric rejects the inputs.
roc_auc_ensemble = None
try:
    roc_auc_ensemble = roc_auc_score(
        y_test, y_pred_proba_ensemble, multi_class='ovr', average='weighted'
    )
except ValueError as exc:
    print(f"  ROC AUC unavailable: {exc}")

# Same keys as the per-model entries, so every entry of `results` can be
# consumed uniformly.
results['Ensemble'] = {
    'model': ensemble,
    'accuracy': accuracy_ensemble,
    'precision': precision_ensemble,
    'recall': recall_ensemble,
    'f1_score': f1_ensemble,
    'roc_auc': roc_auc_ensemble,
    'training_time': training_time,
    'y_pred': y_pred_ensemble,
    'y_pred_proba': y_pred_proba_ensemble
}

print(f"  Accuracy: {accuracy_ensemble:.4f}")
print(f"  F1 Score: {f1_ensemble:.4f}")

## 4. Model Comparison

In [None]:
# Compare all models
# One row per entry of `results`, ordered from highest to lowest F1 score.
comparison_rows = [
    {
        'Model': model_name,
        'Accuracy': entry['accuracy'],
        'F1 Score': entry['f1_score'],
        'Training Time (s)': entry['training_time'],
    }
    for model_name, entry in results.items()
]

comparison_df = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Accuracy', 'F1 Score', 'Training Time (s)'],
).sort_values('F1 Score', ascending=False)
comparison_df

In [None]:
# Visualize comparison
# Two side-by-side bar charts (accuracy and F1 score) sharing one colour per model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes  # kept for any later cell that refers to the individual axes
colors = plt.cm.viridis(np.linspace(0, 0.8, len(comparison_df)))

panels = [
    ('Accuracy', 'Model Accuracy Comparison'),
    ('F1 Score', 'Model F1 Score Comparison'),
]

for ax, (metric, title) in zip(axes, panels):
    values = comparison_df[metric]
    bars = ax.bar(comparison_df['Model'], values, color=colors)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel(metric)

    # Zoom in on 0.8-1.0 when every score fits there; otherwise lower the
    # floor so a weaker model's bar is not hidden below the axis.
    y_floor = min(0.8, max(0.0, float(values.min()) - 0.05))
    ax.set_ylim(y_floor, 1.0)
    ax.tick_params(axis='x', rotation=45)

    # Add value labels just above each bar
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                f'{val:.3f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig('../data/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Confusion Matrices

In [None]:
# Plot confusion matrices for top models
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
grid = axes.flatten()

# Rank by F1 so the grid really shows the best models; taking the first four
# entries of `results` would silently drop whichever model was added last.
top_names = sorted(
    results,
    key=lambda model_name: results[model_name]['f1_score'],
    reverse=True,
)[:len(grid)]

for ax, name in zip(grid, top_names):
    result = results[name]
    cm = confusion_matrix(y_test, result['y_pred'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Hide panels that have no model to show (fewer models than grid cells).
for ax in grid[len(top_names):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../data/confusion_matrices.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Feature Importance

In [None]:
# Get feature importance from Random Forest
rf_model = results['Random Forest']['model']
importance = pd.DataFrame({
    'Feature': available_features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot top features
# Clamp to the number of features actually used so the colour array and the
# title always agree with the number of bars drawn.
top_n = min(15, len(importance))
top_features = importance.head(top_n).iloc[::-1]  # reversed: largest bar ends up on top

plt.figure(figsize=(10, 8))
colors = plt.cm.viridis(np.linspace(0.2, 0.8, top_n))
plt.barh(top_features['Feature'], top_features['Importance'], color=colors)
plt.xlabel('Importance')
plt.title(f'Top {top_n} Most Important Features (Random Forest)')
plt.tight_layout()
plt.savefig('../data/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("Top 10 Features:")
print(importance.head(10).to_string(index=False))

## 7. Cross-Validation

In [None]:
# 5-fold cross-validation
print("5-Fold Cross Validation Results:")
print("="*50)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cv_results maps model name -> array of per-fold accuracy scores.
cv_results = {}
for name in ['Random Forest', 'Gradient Boosting', 'Neural Network']:
    model = results[name]['model']
    # Scale inside the pipeline and feed the unscaled X, so each fold's scaler
    # is fit on that fold's training rows only. cross_val_score clones the
    # pipeline, so the fitted model stored in `results` is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(cv_pipeline, X, y_encoded, cv=skf, scoring='accuracy', n_jobs=-1)
    cv_results[name] = scores
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

In [None]:
# Boxplot of CV scores
# One box per model, built from that model's per-fold accuracy scores.
cv_df = pd.DataFrame(cv_results)

cv_fig, cv_ax = plt.subplots(figsize=(10, 6))
cv_df.boxplot(ax=cv_ax)
cv_ax.set_title('5-Fold Cross-Validation Scores')
cv_ax.set_ylabel('Accuracy')

plt.tight_layout()
plt.savefig('../data/cv_scores.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Save Models

In [None]:
# Create models directory
# Single definition of the output directory shared by every artifact below.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Save models: one joblib file per entry of `results`, named after the model
# (lower-cased, spaces replaced by underscores).
for name, result in results.items():
    filename = f"{models_dir}/{name.lower().replace(' ', '_')}_model.pkl"
    joblib.dump(result['model'], filename)
    print(f"Saved: {filename}")

# Save scaler and label encoder
joblib.dump(scaler, f'{models_dir}/scaler.pkl')
joblib.dump(label_encoder, f'{models_dir}/label_encoder.pkl')

# Save feature list (json is imported with the other modules in the import cell)
with open(f'{models_dir}/feature_list.json', 'w') as f:
    json.dump(available_features, f)

print("\nAll models and preprocessors saved!")

In [None]:
# Save training report
# The report is assembled as a list of lines, joined with newlines, printed,
# and then written to a text file.
rule = "="*70

report = [
    rule,
    "NEUROSMRITI - ALZHEIMER'S DETECTION MODEL TRAINING REPORT",
    rule,
    f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Dataset Size: {len(df):,} records",
    f"Training Set: {len(X_train):,} samples",
    f"Test Set: {len(X_test):,} samples",
    f"Features: {len(available_features)}",
    "\n" + rule,
    "MODEL PERFORMANCE",
    rule,
]

# One four-line section per model, in the order the results were stored.
for name, result in results.items():
    report.extend([
        f"\n{name}:",
        f"  Accuracy:  {result['accuracy']:.4f}",
        f"  F1 Score:  {result['f1_score']:.4f}",
        f"  Time:      {result['training_time']:.2f}s",
    ])

# Best model: the (name, result) pair with the highest F1 score.
best_model = max(results.items(), key=lambda item: item[1]['f1_score'])
report += [
    "\n" + rule,
    f"BEST MODEL: {best_model[0]}",
    f"F1 Score: {best_model[1]['f1_score']:.4f}",
    rule,
]

report_text = "\n".join(report)
print(report_text)

with open('../models/training_report.txt', 'w') as f:
    f.write(report_text)

print("\nReport saved to ../models/training_report.txt")

## Summary

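Training complete! The trained models, the fitted scaler and label encoder, the feature list, and the training report are saved in the `../models/` directory.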