# Model Evaluation and Validation Pipeline Component

This notebook demonstrates comprehensive model evaluation and validation as part of the Elyra MLOps pipeline.
It performs advanced model analysis and prepares models for production deployment.

## Parameters and Setup

In [None]:
# Parameters
# These names are read as module-level globals by the cells and functions below.
best_model_path = "/tmp/models/best_model.pkl"  # serialized model read with joblib.load in load_model_and_data
test_data_path = "/tmp/test_data.csv"  # CSV holding the feature columns plus a 'target' column
model_metadata_path = "/tmp/models/model_metadata.json"  # JSON metadata loaded alongside the model
evaluation_output_path = "/tmp/evaluation"  # directory that receives evaluation_report.json
performance_threshold = 0.8  # Minimum F1 score for production deployment
model_registry_path = "/tmp/model_registry"  # root for versioned model directories and registry_index.json

In [None]:
import pandas as pd
import numpy as np
import joblib
import json
import os
from datetime import datetime
import logging

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, precision_recall_curve, confusion_matrix, classification_report
)
import warnings
# Suppress every warning category for the remainder of the run.
warnings.filterwarnings(action='ignore')

# Emit INFO-and-above records and bind the logger used by the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Both output locations must exist before any later cell writes to them.
for _output_dir in (evaluation_output_path, model_registry_path):
    os.makedirs(_output_dir, exist_ok=True)

## Load Model and Data

In [None]:
def load_model_and_data():
    """Read the persisted model, the test set, and the model metadata.

    The locations come from the notebook parameters ``best_model_path``,
    ``test_data_path`` and ``model_metadata_path``.

    Returns:
        A ``(model, X_test, y_test, metadata)`` tuple. ``X_test`` is the test
        frame without its ``'target'`` column, ``y_test`` is that column, and
        ``metadata`` is the parsed JSON metadata.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        estimator = joblib.load(best_model_path)
        logger.info(f"Model loaded: {type(estimator).__name__}")

        # Split the CSV into features and the 'target' label column.
        frame = pd.read_csv(test_data_path)
        features = frame.drop(columns='target')
        labels = frame['target']

        with open(model_metadata_path, 'r') as handle:
            model_metadata = json.load(handle)

        logger.info(f"Test data shape: {features.shape}")
    except Exception as exc:
        logger.error(f"Error loading model and data: {str(exc)}")
        raise
    else:
        return estimator, features, labels, model_metadata

# Load everything once; the names bound here are read as globals by later cells.
model, X_test, y_test, metadata = load_model_and_data()

print(f"Model Type: {type(model).__name__}")
print(f"Features: {list(X_test.columns)}")
print(f"Test samples: {len(X_test)}")
# 'best_model' is only displayed here, so a metadata file that lacks it should
# not abort the pipeline; register_model_version reads metadata with .get too.
print(f"Model metadata: {metadata.get('best_model', 'unknown')}")

## Comprehensive Model Evaluation

In [None]:
def comprehensive_evaluation(model, X_test, y_test):
    """Score ``model`` on the test set and gather metrics plus curve data.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: features passed to the model.
        y_test: true labels for ``X_test``.

    Returns:
        ``(evaluation_results, y_pred, y_pred_proba)``. ``evaluation_results``
        is a dict with the keys ``'metrics'``, ``'classification_report'``,
        ``'confusion_matrix'``, ``'roc_curve'`` and
        ``'precision_recall_curve'``; ``y_pred`` is the output of
        ``model.predict`` and ``y_pred_proba`` is column 1 of
        ``model.predict_proba``.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is the score used for the ROC and PR analysis.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Label-based scorers, applied in the order the metrics are reported.
    label_scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }
    scalar_metrics = {
        name: float(scorer(y_test, predicted_labels))
        for name, scorer in label_scorers.items()
    }
    scalar_metrics['roc_auc'] = float(roc_auc_score(y_test, positive_scores))

    report = classification_report(y_test, predicted_labels, output_dict=True)
    matrix = confusion_matrix(y_test, predicted_labels)

    false_pos_rate, true_pos_rate, roc_cutoffs = roc_curve(y_test, positive_scores)
    pr_precision, pr_recall, pr_cutoffs = precision_recall_curve(y_test, positive_scores)

    # Arrays are converted to lists so the result holds no ndarray objects.
    roc_payload = {
        'fpr': false_pos_rate.tolist(),
        'tpr': true_pos_rate.tolist(),
        'thresholds': roc_cutoffs.tolist(),
    }
    pr_payload = {
        'precision': pr_precision.tolist(),
        'recall': pr_recall.tolist(),
        'thresholds': pr_cutoffs.tolist(),
    }

    evaluation_results = {
        'metrics': scalar_metrics,
        'classification_report': report,
        'confusion_matrix': matrix.tolist(),
        'roc_curve': roc_payload,
        'precision_recall_curve': pr_payload,
    }

    return evaluation_results, predicted_labels, positive_scores

# Run the evaluation; the predictions stay available to later cells.
eval_results, y_pred, y_pred_proba = comprehensive_evaluation(model, X_test, y_test)

print("Comprehensive Evaluation Results:")
print("=" * 40)
metric_values = eval_results['metrics']
for metric in metric_values:
    value = metric_values[metric]
    print(f"{metric.upper()}: {value:.4f}")

# The matrix is stored as nested lists; convert back for the array-style print.
print("\nConfusion Matrix:")
print(np.asarray(eval_results['confusion_matrix']))

## Model Performance Validation

In [None]:
def validate_model_performance(eval_results, threshold=0.8, *,
                               accuracy_threshold=0.75, auc_threshold=0.75):
    """Decide whether the evaluation metrics clear the production gates.

    Args:
        eval_results: dict with a ``'metrics'`` mapping that contains
            ``'f1_score'``, ``'accuracy'`` and ``'roc_auc'``.
        threshold: minimum F1 score (primary gate).
        accuracy_threshold: minimum accuracy (secondary gate).
        auc_threshold: minimum ROC AUC (secondary gate).

    Returns:
        dict with the keys ``performance_threshold``, ``meets_f1_threshold``,
        ``meets_accuracy_threshold``, ``meets_auc_threshold``,
        ``overall_validation`` and ``deployment_status``. The status is
        ``"APPROVED_FOR_PRODUCTION"`` when every gate passes, otherwise
        ``"REQUIRES_IMPROVEMENT"``.

    Raises:
        KeyError: if ``eval_results`` lacks one of the required metrics.
    """
    metrics = eval_results['metrics']
    # Local names avoid ``f1_score`` so the sklearn function of the same name
    # imported by this notebook is not shadowed inside the function.
    f1_value = metrics['f1_score']
    accuracy_value = metrics['accuracy']
    auc_value = metrics['roc_auc']

    # bool() keeps the flags JSON-serializable even when the metrics arrive as
    # NumPy scalars (comparisons would otherwise yield np.bool_).
    meets_f1 = bool(f1_value >= threshold)
    meets_accuracy = bool(accuracy_value >= accuracy_threshold)
    meets_auc = bool(auc_value >= auc_threshold)

    # The model is approved only when every gate passes.
    approved = meets_f1 and meets_accuracy and meets_auc

    if approved:
        status = "APPROVED_FOR_PRODUCTION"
        logger.info("Model validation PASSED - approved for production")
    else:
        status = "REQUIRES_IMPROVEMENT"
        logger.warning("Model validation FAILED - requires improvement")

    return {
        'performance_threshold': threshold,
        'meets_f1_threshold': meets_f1,
        'meets_accuracy_threshold': meets_accuracy,
        'meets_auc_threshold': meets_auc,
        'overall_validation': approved,
        'deployment_status': status,
    }

# Gate the model against the configured F1 threshold.
validation = validate_model_performance(eval_results, performance_threshold)

print("\nModel Performance Validation:")
print("=" * 40)
print(f"Performance Threshold: {validation['performance_threshold']}")

# One tick/cross line per gate, in the order F1, accuracy, ROC AUC.
_gate_labels = (
    ("F1 Score", 'meets_f1_threshold'),
    ("Accuracy", 'meets_accuracy_threshold'),
    ("ROC AUC", 'meets_auc_threshold'),
)
for _label, _flag_key in _gate_labels:
    _mark = '✓' if validation[_flag_key] else '✗'
    print(f"{_label} Check: {_mark}")

print(f"\nDeployment Status: {validation['deployment_status']}")

## Model Registry and Versioning

In [None]:
def _json_fallback(value):
    """Return a JSON-encodable stand-in for a value ``json`` cannot encode."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    # Anything else (e.g. a nested estimator among the model parameters) is
    # recorded by its repr so the model card can still be written.
    return repr(value)


def register_model_version(model, eval_results, validation, registry_path):
    """Persist the model and its model card, then update the registry index.

    Besides its arguments, this function reads the notebook globals
    ``metadata`` (training timestamp) and ``X_test`` (feature names and
    sample count) when it builds the model card.

    Args:
        model: estimator to store with ``joblib.dump``.
        eval_results: dict whose ``'metrics'`` mapping includes ``'f1_score'``.
        validation: dict with ``'overall_validation'`` and
            ``'deployment_status'``.
        registry_path: root directory of the model registry.

    Returns:
        dict with ``version``, ``version_dir``, ``model_path``,
        ``model_card_path`` and ``registry_index_path``.

    Note:
        Version names have one-second resolution, so two registrations within
        the same second write into the same version directory.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_version = f"v_{timestamp}"

    # Create version directory
    version_dir = os.path.join(registry_path, model_version)
    os.makedirs(version_dir, exist_ok=True)

    # Save model
    model_path = os.path.join(version_dir, "model.pkl")
    joblib.dump(model, model_path)

    # Create comprehensive model card
    model_card = {
        'model_info': {
            'version': model_version,
            'model_type': type(model).__name__,
            'training_timestamp': metadata.get('timestamp', 'unknown'),
            'evaluation_timestamp': timestamp,
            'model_parameters': model.get_params() if hasattr(model, 'get_params') else {},
        },
        'performance': eval_results['metrics'],
        'validation': validation,
        'data_info': {
            'features': list(X_test.columns),
            'n_features': len(X_test.columns),
            'test_samples': len(X_test),
        },
        'deployment_info': {
            'ready_for_deployment': validation['overall_validation'],
            'deployment_status': validation['deployment_status'],
            'recommended_for_production': validation['overall_validation'],
        },
    }

    # Model parameters may hold values json cannot encode natively; the
    # fallback stops a TypeError here from leaving a saved model with no card
    # and no index entry.
    model_card_path = os.path.join(version_dir, "model_card.json")
    with open(model_card_path, 'w') as f:
        json.dump(model_card, f, indent=2, default=_json_fallback)

    # Update registry index
    registry_index_path = os.path.join(registry_path, "registry_index.json")

    if os.path.exists(registry_index_path):
        with open(registry_index_path, 'r') as f:
            registry_index = json.load(f)
    else:
        registry_index = {}
    # Tolerate an existing index file that has no 'versions' list yet.
    versions = registry_index.setdefault('versions', [])

    version_entry = {
        'version': model_version,
        'timestamp': timestamp,
        'f1_score': eval_results['metrics']['f1_score'],
        'deployment_status': validation['deployment_status'],
        'model_path': model_path,
        'model_card_path': model_card_path,
    }
    versions.append(version_entry)

    # Newest first; the fixed-width timestamp format sorts chronologically.
    versions.sort(key=lambda entry: entry['timestamp'], reverse=True)

    # Only an approved model replaces the latest production pointer.
    if validation['overall_validation']:
        registry_index['latest_production'] = version_entry

    with open(registry_index_path, 'w') as f:
        json.dump(registry_index, f, indent=2, default=_json_fallback)

    logger.info(f"Model version {model_version} registered successfully")

    return {
        'version': model_version,
        'version_dir': version_dir,
        'model_path': model_path,
        'model_card_path': model_card_path,
        'registry_index_path': registry_index_path,
    }

# Store this model as a new registry version.
registry_info = register_model_version(
    model, eval_results, validation, registry_path=model_registry_path
)

print("Model Registration Completed:")
print("=" * 40)
# Each summary line pairs a label with the registry_info key it reports.
for _label, _info_key in (
    ("Model Version", 'version'),
    ("Registry Path", 'version_dir'),
    ("Model Card", 'model_card_path'),
):
    print(f"{_label}: {registry_info[_info_key]}")

## Generate Evaluation Report

In [None]:
def generate_evaluation_report(eval_results, validation, registry_info):
    """Write the evaluation report as JSON and return its path.

    Besides its arguments, this function reads the notebook globals ``model``
    (for the model type name) and ``evaluation_output_path`` (target
    directory).

    Args:
        eval_results: dict with a ``'metrics'`` entry.
        validation: dict with ``'overall_validation'`` and
            ``'deployment_status'``.
        registry_info: dict with ``'version'``, ``'model_path'`` and
            ``'model_card_path'``.

    Returns:
        Path of the written ``evaluation_report.json`` file.
    """
    approved = validation['overall_validation']

    # The original expression nested a second conditional inside the approved
    # list; its "Improve model performance" branch could never be selected.
    if approved:
        next_steps = [
            "Deploy to KServe",
            "Set up monitoring",
            "Configure auto-scaling",
        ]
    else:
        next_steps = [
            "Retrain with more data",
            "Feature engineering",
            "Hyperparameter tuning",
        ]

    report = {
        'evaluation_summary': {
            'timestamp': datetime.now().isoformat(),
            'model_version': registry_info['version'],
            'model_type': type(model).__name__,
            'overall_status': validation['deployment_status'],
        },
        'performance_metrics': eval_results['metrics'],
        'validation_results': validation,
        'deployment_recommendation': {
            'ready_for_production': approved,
            'confidence_level': 'HIGH' if approved else 'LOW',
            'next_steps': next_steps,
        },
        'model_registry': {
            'version': registry_info['version'],
            'model_path': registry_info['model_path'],
            'model_card': registry_info['model_card_path'],
        },
    }

    # Save report
    report_path = os.path.join(evaluation_output_path, "evaluation_report.json")
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)

    return report_path

# Write the JSON report and remember where it was saved.
report_path = generate_evaluation_report(eval_results, validation, registry_info)

print("Evaluation Report Generated:")
print("=" * 40)
print(f"Report saved to: {report_path}")

# The recommendation follows the overall validation verdict.
if validation['overall_validation']:
    _recommendation = 'PROCEED'
else:
    _recommendation = 'IMPROVE MODEL'
print(f"\nDeployment Recommendation: {_recommendation}")

## Pipeline Output Summary

In [None]:
# Collect the artifacts and verdicts this notebook produced.
pipeline_outputs = dict(
    evaluation_report=report_path,
    model_registry_version=registry_info['version'],
    production_ready_model=registry_info['model_path'],
    model_card=registry_info['model_card_path'],
    deployment_approved=validation['overall_validation'],
    deployment_status=validation['deployment_status'],
    model_performance=eval_results['metrics'],
    registry_index=registry_info['registry_index_path'],
)

print("Model Evaluation Pipeline Component Completed Successfully!")
print("=" * 60)

print(f"\nModel Version: {registry_info['version']}")
print(f"Deployment Status: {validation['deployment_status']}")
if validation['overall_validation']:
    _production_ready = 'Yes'
else:
    _production_ready = 'No'
print(f"Production Ready: {_production_ready}")

print("\nKey Performance Metrics:")
for metric, value in eval_results['metrics'].items():
    print(f"  {metric}: {value:.4f}")

print("\nPipeline Outputs:")
for key, value in pipeline_outputs.items():
    # Only string values that start with '/' are treated as paths and checked on disk.
    if not isinstance(value, str) or not value.startswith('/'):
        print(f"  {key}: {value}")
        continue
    exists = os.path.exists(value)
    print(f"  {key}: {'✓' if exists else '✗'} {value}")

print("\nNext Steps:")
if validation['overall_validation']:
    _next_step_lines = (
        "  ✓ Model ready for KServe deployment",
        "  ✓ Proceed to deployment pipeline",
        "  ✓ Set up monitoring and alerting",
    )
else:
    _next_step_lines = (
        "  ⚠ Model requires improvement before deployment",
        "  ⚠ Consider retraining or feature engineering",
        "  ⚠ Review performance thresholds",
    )
for _line in _next_step_lines:
    print(_line)