# 03 - Model Training with Feature Store

This notebook demonstrates:
- Creating training datasets from the feature store
- Training ML models for treatment response prediction
- Evaluating model performance
- Understanding feature importance
- Benefits of feature store for ML workflows

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from feature_store import FeatureStore
from utils import split_train_test, calculate_feature_importance

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, train_test_split

# Notebook-wide display settings: no cap on the number of DataFrame columns
# shown, and seaborn's white-grid theme for every figure below.
pd.options.display.max_columns = None
sns.set_style('whitegrid')

print("Libraries imported successfully!")

## 1. Connect to Feature Store

In [None]:
# Locations (relative paths) of the feature store database and its config directory
FEATURE_STORE_DB = '../data/feature_store.duckdb'
FEATURE_STORE_CONFIG_DIR = '../config'

# Open the feature store; `fs` is reused by every later section of the notebook
fs = FeatureStore(db_path=FEATURE_STORE_DB, config_dir=FEATURE_STORE_CONFIG_DIR)

print("Connected to feature store!")

## 2. Create Training Dataset

Select features for predicting treatment response.

In [None]:
# Feature columns requested from the feature store for the response model
feature_list = [
    'age_scaled', 'tmb_score_scaled', 'mutation_burden', 'clinical_risk_score',
    'wbc_imputed', 'hemoglobin_imputed', 'platelet_imputed',
]

# Pull the features together with the label in one call.
# Target: response_status (1 = responder, 0 = non-responder)
training_data = fs.create_training_dataset(
    feature_list=feature_list, target='response_status', include_metadata=False
)

print(f"Training dataset shape: {training_data.shape}")
print(f"Features: {feature_list}")
print(f"Target: response_status")

# Fraction of rows per class (normalize=True gives proportions, not counts)
class_balance = training_data['response_status'].value_counts(normalize=True)
print(f"\nClass balance: {class_balance}")

# Last expression of the cell: preview of the first rows
training_data.head()

## 3. Split Data

Split into train and test sets with stratification to maintain class balance.

In [None]:
# Separate the feature matrix from the label column
X = training_data[feature_list]
y = training_data['response_status']

# Stratified 80/20 hold-out split. `stratify=y` keeps the responder /
# non-responder ratio the same in both parts, and the fixed random_state makes
# the split reproducible on re-run.
# `train_test_split` comes from the notebook's top import cell, so every cell
# that needs it (this one and the genomics experiment) works on a fresh kernel
# without relying on a mid-notebook import.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTrain class balance: {y_train.value_counts(normalize=True)}")
print(f"Test class balance: {y_test.value_counts(normalize=True)}")

## 4. Train Models

Train multiple models and compare performance.

In [None]:
# Candidate classifiers, each seeded so repeated runs give the same fit
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# results[<model name>] -> fitted estimator, test-set predictions and AUC scores
results = {}

for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")

    # Fit on the training split, then predict the held-out test split
    estimator.fit(X_train, y_train)
    test_labels = estimator.predict(X_test)
    # Second predict_proba column = probability of the second class
    # (class 1, the responders, given the 0/1 target encoding)
    test_scores = estimator.predict_proba(X_test)[:, 1]

    # 5-fold cross-validated ROC AUC, computed on the training split only
    fold_aucs = cross_val_score(estimator, X_train, y_train, cv=5, scoring='roc_auc')
    mean_cv_auc = fold_aucs.mean()

    # ROC AUC on the hold-out split
    holdout_auc = roc_auc_score(y_test, test_scores)

    results[model_name] = {
        'model': estimator,
        'y_pred': test_labels,
        'y_pred_proba': test_scores,
        'cv_auc': mean_cv_auc,
        'test_auc': holdout_auc,
    }

    print(f"  CV AUC: {mean_cv_auc:.3f} (+/- {fold_aucs.std():.3f})")
    print(f"  Test AUC: {holdout_auc:.3f}")

## 5. Compare Model Performance

In [None]:
# Overlay every model's test-set ROC curve on a single pair of axes
fig, ax = plt.subplots(figsize=(10, 8))

for model_label, outcome in results.items():
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, outcome['y_pred_proba'])
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        label=f"{model_label} (AUC={outcome['test_auc']:.3f})",
    )

# Dashed diagonal: the curve of a classifier that guesses at random
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves - Treatment Response Prediction',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Select the best model by cross-validated AUC, which was computed on the
# training split only. Picking the winner by test AUC would use the hold-out
# set for model selection, so the test metrics reported below would be
# optimistically biased.
best_model_name = max(results, key=lambda model_name: results[model_name]['cv_auc'])
best_model = results[best_model_name]['model']
best_predictions = results[best_model_name]['y_pred']

print(f"Best Model: {best_model_name}")
print(f"CV AUC: {results[best_model_name]['cv_auc']:.3f}")
# The test AUC is now an untouched hold-out estimate for the selected model
print(f"Test AUC: {results[best_model_name]['test_auc']:.3f}")
print(f"\nClassification Report:")
# target_names follow the sorted label order: 0 = non-responder, 1 = responder
print(classification_report(y_test, best_predictions, target_names=['Non-Responder', 'Responder']))

In [None]:
# Confusion matrix of the selected model on the test split
# (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, best_predictions)
class_names = ['Non-Responder', 'Responder']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cells hold integer counts
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,  # draw on the axes created above
)
ax.set(
    ylabel='True Label',
    xlabel='Predicted Label',
    title=f'Confusion Matrix - {best_model_name}',
)
fig.tight_layout()
plt.show()

## 6. Feature Importance

Understand which features are most predictive of treatment response.

In [None]:
# Importances are read from the fitted Random Forest (chosen for interpretability),
# regardless of which model scored best above.
rf_model = results['Random Forest']['model']
# The helper's result is used below as a frame with 'feature' and 'importance' columns
importance_df = calculate_feature_importance(rf_model, feature_list)

print("Feature Importance (Random Forest):")
print(importance_df)

# One horizontal bar per feature; the y-axis is inverted so the first row of
# importance_df is drawn at the top of the chart.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(y=importance_df['feature'], width=importance_df['importance'])
ax.set(
    xlabel='Importance',
    title='Feature Importance for Treatment Response Prediction',
)
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 7. Feature Store Benefits

### Key Benefits Demonstrated:

1. **Centralized Features**: All features computed once and stored for reuse
2. **Consistent Transformations**: Same features used in training and future predictions
3. **Version Control**: Can track which feature version was used for each model
4. **Lineage Tracking**: Know which raw data created which features
5. **Quality Assurance**: Validation ensures only clean features are used
6. **Easy Experimentation**: Can quickly try different feature combinations

### Production ML Workflow:

```
Raw Data → Validation → Feature Store → Training Dataset → Model
    ↓                         ↓              ↓               ↓
Quality Checks         Feature Versioning  Reproducibility  Deployment
```

## 8. Create Another Model with Different Features

Show how easy it is to experiment with the feature store.

In [None]:
# Genomics-focused experiment: a three-feature subset of the full feature list
genomic_features = ['mutation_burden', 'tmb_score_scaled', 'age_scaled']

# Create dataset (instantly, features already computed!)
genomic_data = fs.create_training_dataset(
    feature_list=genomic_features, target='response_status', include_metadata=False
)

X_genomic, y_genomic = genomic_data[genomic_features], genomic_data['response_status']

# Same split settings as the full-feature model (80/20, seed 42, stratified).
# NOTE(review): the AUC comparison below is only like-for-like if this dataset
# holds the same rows in the same order as the full-feature dataset, so that the
# seeded split selects the same test patients — confirm against the feature store.
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
    X_genomic, y_genomic, test_size=0.2, random_state=42, stratify=y_genomic
)

# Random Forest with the same hyperparameters as the full-feature one
rf_genomic = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_g, y_train_g)

# Predicted probability of the second class (class 1 given the 0/1 target encoding)
y_pred_g = rf_genomic.predict_proba(X_test_g)[:, 1]

auc_genomic = roc_auc_score(y_test_g, y_pred_g)
print(f"Genomics-only model AUC: {auc_genomic:.3f}")

# Test AUC of the full-feature Random Forest trained earlier in the notebook
auc_full = results['Random Forest']['test_auc']
print(f"Full feature model AUC: {auc_full:.3f}")
print(f"\nImprovement from adding clinical features: {(auc_full - auc_genomic):.3f}")

## Summary

In this notebook we:
1. Created training datasets from the feature store
2. Trained and compared multiple ML models
3. Evaluated predictive performance for treatment response (cross-validated and test ROC AUC, classification report, confusion matrix)
4. Understood feature importance
5. Demonstrated feature store benefits for ML workflows

Next: `04_monitoring_report.ipynb` - Monitor data quality over time!

In [None]:
# Close feature store
# Nothing after this cell uses `fs`; this closes the handle created in
# section 1 ("Connect to Feature Store").
fs.close()

In [None]:
## End of Notebook ##