# Model Training Pipeline Component

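This notebook is the model-training component of an Elyra pipeline.
It reads the processed train/test CSV files produced by the previous component, tunes and trains several classifiers, selects the best one by F1 score, and saves the models and their metrics for the next component.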

## Parameters and Environment Setup

In [None]:
# Parameters - configurable in Elyra pipeline editor
# CSV inputs; load_training_data expects a 'target' column in each file.
train_data_path = "/tmp/train_data.csv"
test_data_path = "/tmp/test_data.csv"
# Directory that receives the pickled models and the JSON reports.
model_output_path = "/tmp/models"
# Free-form label; it is only recorded in model_metadata.json.
experiment_name = "elyra_demo_experiment"
# Passed as `cv` to GridSearchCV.
cross_validation_folds = 5
# Used as `random_state` for every estimator and recorded in the metadata.
random_seed = 42

In [None]:
import pandas as pd
import numpy as np
import joblib
import json
import os
from datetime import datetime
import logging

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, classification_report, confusion_matrix
)

# Configure logging
# INFO level so the progress messages logged by the functions below are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create output directory
# exist_ok=True keeps a re-run from failing when the directory already exists.
os.makedirs(model_output_path, exist_ok=True)

## Data Loading

In [None]:
def load_training_data(train_path, test_path, target_column='target'):
    """Load the train/test CSV files and split each into features and target.

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.
        target_column: Name of the label column that must be present in
            both files. Defaults to ``'target'``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``: the two feature
        DataFrames (target column removed) and the two target Series.

    Raises:
        KeyError: If ``target_column`` is missing from either file; the
            message names the offending file and its columns.
        Exception: Any error raised by ``pd.read_csv`` is logged with its
            traceback and re-raised unchanged.
    """
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        logger.info(f"Training data shape: {train_df.shape}")
        logger.info(f"Test data shape: {test_df.shape}")

        # Fail early with a message that says which file is at fault,
        # instead of pandas' generic "not found in axis" error.
        for label, path, frame in (
            ("training", train_path, train_df),
            ("test", test_path, test_df),
        ):
            if target_column not in frame.columns:
                raise KeyError(
                    f"Column '{target_column}' not found in {label} data "
                    f"({path}); available columns: {list(frame.columns)}"
                )

        # Separate features and targets
        X_train = train_df.drop(columns=[target_column])
        y_train = train_df[target_column]
        X_test = test_df.drop(columns=[target_column])
        y_test = test_df[target_column]

        return X_train, X_test, y_train, y_test

    except Exception as e:
        # logger.exception keeps the traceback that logger.error would drop.
        logger.exception(f"Error loading training data: {e}")
        raise

# Load data
X_train, X_test, y_train, y_test = load_training_data(
    train_path=train_data_path,
    test_path=test_data_path,
)

# Show the feature names and the relative class frequencies of the training labels
print("Features: " + str(list(X_train.columns)))
print("Target distribution in training set:")
print(y_train.value_counts(normalize=True))

## Model Configuration and Training

In [None]:
def define_models():
    """Return the candidate estimators together with their tuning grids.

    The result maps a model name to a dict with two keys: 'model', an
    unfitted classifier seeded with the notebook-level ``random_seed``,
    and 'params', the hyper-parameter grid to search for that model.
    """
    logistic_regression = {
        'model': LogisticRegression(random_state=random_seed, max_iter=1000),
        'params': {'C': [0.1, 1.0, 10.0]},
    }

    random_forest = {
        'model': RandomForestClassifier(random_state=random_seed),
        'params': {'n_estimators': [50, 100], 'max_depth': [None, 10]},
    }

    gradient_boosting = {
        'model': GradientBoostingClassifier(random_state=random_seed),
        'params': {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.2]},
    }

    # Insertion order is the order in which the models are later trained.
    return {
        'logistic_regression': logistic_regression,
        'random_forest': random_forest,
        'gradient_boosting': gradient_boosting,
    }

# Build the name -> {'model', 'params'} mapping that the training cell iterates over.
models_config = define_models()
print(f"Configured {len(models_config)} models for training")

In [None]:
def train_and_evaluate_models(models_config, X_train, y_train, X_test, y_test):
    """Tune, fit and score every configured model.

    For each entry of ``models_config`` a ``GridSearchCV`` (``scoring='f1'``,
    ``cv=cross_validation_folds`` from the notebook parameters,
    ``n_jobs=-1``) is fitted on the training data, and its best estimator
    is scored on the test data.

    Training is best-effort: a model that raises is logged with its full
    traceback and skipped, so the returned dicts can hold fewer entries
    than ``models_config``. A warning is logged when they end up empty.

    NOTE(review): the metrics use scikit-learn's default averaging and the
    second ``predict_proba`` column, which looks like it assumes a binary
    target - confirm against the upstream data.

    Args:
        models_config: Mapping of model name to a dict with the keys
            'model' (estimator) and 'params' (parameter grid).
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        Tuple ``(results, trained_models)``. ``results`` maps model name to
        a dict with 'accuracy', 'precision', 'recall', 'f1_score',
        'roc_auc', 'best_params' and 'cv_score_mean'; ``trained_models``
        maps model name to the best fitted estimator.
    """
    results = {}
    trained_models = {}

    for model_name, config in models_config.items():
        logger.info(f"Training {model_name}...")

        try:
            # Perform grid search with cross-validation
            grid_search = GridSearchCV(
                config['model'],
                config['params'],
                cv=cross_validation_folds,
                scoring='f1',
                n_jobs=-1
            )

            # Fit the model
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_

            # Make predictions
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]

            # Calculate metrics; float() keeps the values JSON-serializable
            metrics = {
                'accuracy': float(accuracy_score(y_test, y_pred)),
                'precision': float(precision_score(y_test, y_pred)),
                'recall': float(recall_score(y_test, y_pred)),
                'f1_score': float(f1_score(y_test, y_pred)),
                'roc_auc': float(roc_auc_score(y_test, y_pred_proba)),
                'best_params': grid_search.best_params_,
                'cv_score_mean': float(grid_search.best_score_)
            }
        except Exception as e:
            # Deliberately best-effort: one failing model must not abort the
            # others, but keep the traceback so the failure can be diagnosed.
            logger.exception(f"Error training {model_name}: {e}")
            continue

        results[model_name] = metrics
        trained_models[model_name] = best_model

        logger.info(f"{model_name} - F1 Score: {metrics['f1_score']:.4f}")

    if not results:
        # Make the "nothing trained" outcome visible instead of silently
        # handing empty dicts to the model-selection step.
        logger.warning("No model was trained successfully; see the errors logged above")

    return results, trained_models

# Train all models
model_results, trained_models = train_and_evaluate_models(
    models_config, X_train, y_train, X_test, y_test
)

print("\nModel Training Completed!")
print("\nModel Performance Summary:")

# One heading per model, followed by its headline test-set metrics
for model_name, metrics in model_results.items():
    print(f"\n{model_name.upper()}:")
    for label, key in (
        ("F1 Score", "f1_score"),
        ("Accuracy", "accuracy"),
        ("ROC AUC", "roc_auc"),
    ):
        print(f"  {label}: {metrics[key]:.4f}")

## Model Selection and Saving

In [None]:
def select_best_model(model_results, trained_models, metric='f1_score'):
    """Select the model with the highest value of ``metric``.

    Args:
        model_results: Mapping of model name to its metrics dict.
        trained_models: Mapping of model name to the fitted model.
        metric: Key of the metric to maximise. Defaults to ``'f1_score'``.

    Returns:
        Tuple ``(best_model_name, best_model, best_score)``. On ties the
        first model in ``model_results`` order wins.

    Raises:
        ValueError: If ``model_results`` is empty, e.g. because every model
            failed during training.
        KeyError: If ``metric`` is missing from a metrics dict or the
            winning name is missing from ``trained_models``.
    """
    if not model_results:
        # max() on an empty mapping would raise the same exception type with
        # the unhelpful message "max() arg is an empty sequence".
        raise ValueError(
            "No trained models to select from: model_results is empty "
            "(did every model fail during training?)"
        )

    best_model_name = max(model_results, key=lambda name: model_results[name][metric])
    best_model = trained_models[best_model_name]
    best_score = model_results[best_model_name][metric]

    logger.info(f"Best model: {best_model_name} ({metric}: {best_score:.4f})")

    return best_model_name, best_model, best_score

# Select best model
# `metric` is left at its default ('f1_score'), hence the "F1" label printed below.
best_model_name, best_model, best_score = select_best_model(model_results, trained_models)

print(f"Selected Best Model: {best_model_name}")
print(f"Best F1 Score: {best_score:.4f}")

In [None]:
def save_models_and_results(trained_models, model_results, best_model_name, output_path):
    """Persist every trained model plus JSON metadata and metrics.

    Writes, in this order: one ``<name>.pkl`` per model into a
    timestamped ``models_<timestamp>`` sub-directory, ``best_model.pkl``,
    ``model_metadata.json`` and ``model_results.json`` (the last three
    directly under ``output_path``).

    The metadata also records the notebook-level ``experiment_name``,
    ``random_seed`` and the shapes/columns of ``X_train`` / ``X_test``.

    Args:
        trained_models: Mapping of model name to fitted model.
        model_results: Mapping of model name to its metrics dict.
        best_model_name: Key in ``trained_models`` of the selected model.
        output_path: Directory that receives the artifacts.

    Returns:
        Dict with the keys 'best_model_path', 'metadata_path',
        'results_path' and 'models_dir'.
    """
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Per-run directory holding one pickle per trained model
    run_dir = f"{output_path}/models_{run_stamp}"
    os.makedirs(run_dir, exist_ok=True)

    for name in trained_models:
        pickle_path = f"{run_dir}/{name}.pkl"
        joblib.dump(trained_models[name], pickle_path)
        logger.info(f"Saved {name} to {pickle_path}")

    # Timestamp-free copy of the winner so later steps can find it by a fixed name
    winner_path = f"{output_path}/best_model.pkl"
    joblib.dump(trained_models[best_model_name], winner_path)

    # Description of this run; shapes are cast to plain ints for JSON
    run_info = {
        'timestamp': run_stamp,
        'best_model': best_model_name,
        'best_model_path': winner_path,
        'experiment_name': experiment_name,
        'training_data_shape': list(map(int, X_train.shape)),
        'test_data_shape': list(map(int, X_test.shape)),
        'features': list(X_train.columns),
        'random_seed': random_seed
    }

    info_path = f"{output_path}/model_metadata.json"
    with open(info_path, 'w') as info_file:
        json.dump(run_info, info_file, indent=2)

    # Full per-model metrics
    metrics_path = f"{output_path}/model_results.json"
    with open(metrics_path, 'w') as metrics_file:
        json.dump(model_results, metrics_file, indent=2)

    artifacts = {
        'best_model_path': winner_path,
        'metadata_path': info_path,
        'results_path': metrics_path,
        'models_dir': run_dir
    }
    return artifacts

# Save models and results
# saved_artifacts maps 'best_model_path', 'metadata_path', 'results_path'
# and 'models_dir' to the locations that were written.
saved_artifacts = save_models_and_results(
    trained_models, model_results, best_model_name, model_output_path
)

print("\nModel artifacts saved:")
for key, path in saved_artifacts.items():
    print(f"  {key}: {path}")

## Pipeline Output Summary

In [None]:
# Pipeline outputs for next components
# Artifact paths first (renamed to the keys downstream steps use), then the
# name and score of the selected model.
pipeline_outputs = {
    output_key: saved_artifacts[artifact_key]
    for output_key, artifact_key in (
        ('best_model', 'best_model_path'),
        ('model_metadata', 'metadata_path'),
        ('model_results', 'results_path'),
        ('all_models_dir', 'models_dir'),
    )
}
pipeline_outputs['best_model_name'] = best_model_name
pipeline_outputs['best_model_score'] = float(best_score)

print("Model Training Pipeline Component Completed Successfully!")
print(f"\nBest Model: {best_model_name}")
print(f"Best F1 Score: {best_score:.4f}")

print("\nOutputs available for next components:")
for key, value in pipeline_outputs.items():
    # String values that point at an existing path get a check mark
    marker = "✓ " if isinstance(value, str) and os.path.exists(value) else ""
    print(f"  {key}: {marker}{value}")

# Model readiness check for deployment
print("\nModel Deployment Readiness:")
for line in (
    "Model trained and validated",
    "Performance metrics calculated",
    "Model artifacts saved",
    "Ready for evaluation stage",
):
    print(f"✓ {line}")