In [0]:
# Databricks notebook source

# MAGIC %md
# MAGIC # 03. Model Training & Evaluation
# MAGIC 
# MAGIC **Production-Grade ML Pipeline - House Price Prediction**
# MAGIC 
# MAGIC This notebook:
# MAGIC - Loads processed training data
# MAGIC - Trains multiple regression models
# MAGIC - Performs hyperparameter tuning with GridSearchCV
# MAGIC - Tracks all experiments with MLflow
# MAGIC - Evaluates and compares model performance
# MAGIC - Registers the best model to MLflow Model Registry
# MAGIC 
# MAGIC ---
# MAGIC **Author:** Satish  
# MAGIC **Date:** 2026-01-17  
# MAGIC **MLflow Experiment:** House Price Prediction


In [0]:

# MAGIC %md
# MAGIC ## 1. Setup and Imports


In [0]:
%restart_python

In [0]:

import sys
import os
from pathlib import Path


In [0]:
# Add project root to path so the project-local `src` package can be imported.
# The location can be overridden per environment (another user's workspace
# folder, a Repos checkout, a job cluster) via the MLFLOW_SAMPLE_PROJECT_ROOT
# environment variable; when it is unset the original workspace path is used,
# so existing runs behave exactly as before.
project_root = os.environ.get(
    'MLFLOW_SAMPLE_PROJECT_ROOT',
    '/Workspace/COMM - Commercial Analytics (CMAN)/MMM Quattro 2025/Satish/MLFLOW_sample',
)
if project_root not in sys.path:
    # Position 0 so this directory is searched before the other sys.path entries.
    sys.path.insert(0, project_root)

print(f"‚úÖ Project root: {project_root}")


In [0]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from datetime import datetime

# Scikit-learn imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Custom imports
from src.utils import (
    ConfigLoader, 
    DataLoader, 
    MLflowLogger,
    safe_display
)
from src.model import ModelTrainer, ModelEvaluator

# Notebook-wide plotting defaults: seaborn "whitegrid" style and a 12x6 default canvas.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reaching this line means every import in this cell resolved.
print("‚úÖ All imports successful")


In [0]:

# MAGIC %md
# MAGIC ## 2. Load Configuration


In [0]:

# Section banner
print("=" * 60, "LOADING CONFIGURATION", "=" * 60, sep="\n")

# Read config/config.yaml from the project root; `config` is consumed by the later cells.
config_loader = ConfigLoader()
config = config_loader.load_config(f'{project_root}/config/config.yaml')

# Echo the key settings so the cell output records which configuration was used.
print("\n‚úÖ Configuration loaded")
for label, value in (
    ("Project", config['project']['name']),
    ("Version", config['project']['version']),
    ("MLflow Experiment", config['mlflow']['experiment_name']),
):
    print(f"  ‚Ä¢ {label}: {value}")


In [0]:

# MAGIC %md
# MAGIC ## 3. Setup MLflow


In [0]:

# Section banner
print("=" * 60, "SETTING UP MLFLOW", "=" * 60, sep="\n")

# Activate the experiment named in the configuration.
experiment_name = config['mlflow']['experiment_name']
mlflow.set_experiment(experiment_name)

print(f"\n‚úÖ MLflow experiment set: {experiment_name}")

# Look the experiment up by name to report its ID and artifact location.
experiment = mlflow.get_experiment_by_name(experiment_name)
print(
    f"  ‚Ä¢ Experiment ID: {experiment.experiment_id}",
    f"  ‚Ä¢ Artifact Location: {experiment.artifact_location}",
    sep="\n",
)


In [0]:

# MAGIC %md
# MAGIC ## 4. Load Processed Data


In [0]:

print("="*60)
print("LOADING PROCESSED DATA")
print("="*60)

processed_path = config['data']['processed_path']


def _processed_file(filename):
    """Return the path of `filename` inside the processed-data directory.

    os.path.join adds a separator only when `processed_path` does not already
    end with one, so the configured value works with or without a trailing '/'.
    """
    return os.path.join(processed_path, filename)


# Load datasets.
# NOTE(review): DataLoader.load_csv is presumed to return a pandas DataFrame
# (the code below relies on .squeeze, .shape and .columns) - confirm.
X_train = DataLoader.load_csv(_processed_file('X_train.csv'))
X_test = DataLoader.load_csv(_processed_file('X_test.csv'))
# squeeze("columns") turns a single-column frame into a Series but, unlike a
# bare squeeze(), never collapses a one-row target down to a scalar.
y_train = DataLoader.load_csv(_processed_file('y_train.csv')).squeeze("columns")
y_test = DataLoader.load_csv(_processed_file('y_test.csv')).squeeze("columns")

# Fail early, with a readable message, if the splits are not aligned.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)}"
)
assert list(X_train.columns) == list(X_test.columns), (
    "X_train and X_test must have the same columns in the same order"
)

print(f"\n‚úÖ Data loaded successfully")
print(f"  ‚Ä¢ X_train: {X_train.shape}")
print(f"  ‚Ä¢ X_test: {X_test.shape}")
print(f"  ‚Ä¢ y_train: {y_train.shape}")
print(f"  ‚Ä¢ y_test: {y_test.shape}")

# Show feature names
print(f"\nüìã Features ({len(X_train.columns)}):")
for i, col in enumerate(X_train.columns, 1):
    print(f"  {i:2d}. {col}")

# Show target statistics for both splits (same layout for each)
print(f"\nüìä Target Variable Statistics:")
for heading, target in (("  Training Set:", y_train), ("\n  Test Set:", y_test)):
    print(heading)
    print(f"    Mean:   ${target.mean():,.2f}")
    print(f"    Median: ${target.median():,.2f}")
    print(f"    Std:    ${target.std():,.2f}")
    print(f"    Range:  [${target.min():,.2f}, ${target.max():,.2f}]")


In [0]:
# MAGIC %md
# MAGIC ## 5. Initialize Model Trainer


In [0]:

# Section banner
print("=" * 60, "INITIALIZING MODEL TRAINER", "=" * 60, sep="\n")

# Build the trainer from the loaded configuration.
trainer = ModelTrainer(config)

# Report the training settings read from the configuration.
print("\n‚úÖ ModelTrainer initialized")
print(
    f"  ‚Ä¢ Available models: {len(config['models'])}",
    f"  ‚Ä¢ CV folds: {config['training']['cv_folds']}",
    f"  ‚Ä¢ Scoring metric: {config['training']['scoring']}",
    sep="\n",
)

# Numbered listing of the configured model names
print("\nüìã Configured Models:")
for i, model_name in enumerate(config['models'], start=1):
    print(f"  {i}. {model_name}")


In [0]:

# MAGIC %md
# MAGIC ## 6. Train Models


In [0]:

import traceback

print("="*60)
print("TRAINING MODELS")
print("="*60)

# Results of every model that trained and scored without error, keyed by model name
results = {}

# Names of models whose training or scoring raised; used for the final status banner
failed_models = []

# Models to train
models_to_train = ['linear_regression', 'ridge', 'lasso', 'random_forest', 'gradient_boosting']


def _regression_metrics(y_true, y_pred):
    """Return a dict with 'rmse', 'mae' and 'r2' for the given targets and predictions."""
    return {
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred)
    }


print(f"\nüöÄ Training {len(models_to_train)} models with hyperparameter tuning...\n")

for model_name in models_to_train:
    print(f"\n{'='*60}")
    print(f"Training: {model_name.upper()}")
    print(f"{'='*60}")

    try:
        # Train model with hyperparameter tuning
        best_model, best_params, cv_results = trainer.train_model(
            model_name=model_name,
            X_train=X_train,
            y_train=y_train,
            tune_hyperparameters=True
        )

        # Make predictions on both splits
        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Calculate metrics
        train_metrics = _regression_metrics(y_train, y_train_pred)
        test_metrics = _regression_metrics(y_test, y_test_pred)

        # Store results (only reached when every step above succeeded)
        results[model_name] = {
            'model': best_model,
            'params': best_params,
            'cv_results': cv_results,
            'train_metrics': train_metrics,
            'test_metrics': test_metrics,
            'y_train_pred': y_train_pred,
            'y_test_pred': y_test_pred
        }

        # Print results
        print(f"\n‚úÖ {model_name.upper()} Training Complete")
        print(f"\nüìä Best Parameters:")
        for param, value in best_params.items():
            print(f"  ‚Ä¢ {param}: {value}")

        for heading, metrics in (
            (f"\nüìà Training Metrics:", train_metrics),
            (f"\nüìâ Test Metrics:", test_metrics),
        ):
            print(heading)
            print(f"  ‚Ä¢ RMSE: ${metrics['rmse']:,.2f}")
            print(f"  ‚Ä¢ MAE:  ${metrics['mae']:,.2f}")
            print(f"  ‚Ä¢ R¬≤:   {metrics['r2']:.4f}")

        print(f"\n‚úÖ Model trained successfully!")

    except Exception as e:
        # Best effort per model: report the failure, remember it, and move on
        # to the next candidate instead of aborting the whole sweep.
        failed_models.append(model_name)
        print(f"\n‚ùå Error training {model_name}: {e}")
        traceback.print_exc()

# With no trained model the later cells cannot run; stop here with a clear message.
if not results:
    raise RuntimeError(
        f"None of the {len(models_to_train)} models trained successfully; "
        "see the tracebacks above."
    )

# Final status: only claim full success when no model failed.
print(f"\n{'='*60}")
if failed_models:
    print(
        f"WARNING: {len(results)} of {len(models_to_train)} models trained; "
        f"failed: {', '.join(failed_models)}"
    )
else:
    print(f"‚úÖ ALL MODELS TRAINED SUCCESSFULLY!")
print(f"{'='*60}")


In [0]:

# MAGIC %md
# MAGIC ## 7. Compare Model Performance


In [0]:

print("="*60)
print("MODEL PERFORMANCE COMPARISON")
print("="*60)

# An empty `results` would otherwise surface as an opaque KeyError in sort_values.
if not results:
    raise RuntimeError("No trained models available to compare; run the training cell first.")

# Keys of `results` in row order. Row i of the unsorted frame belongs to
# model_keys[i]; sort_values keeps the original integer index labels, so the
# winner can be mapped back to its exact key instead of reverse-engineering
# it from the prettified display name.
model_keys = list(results)

# Create comparison DataFrame (one row per trained model)
comparison_data = []

for model_name, result in results.items():
    comparison_data.append({
        'Model': model_name.replace('_', ' ').title(),
        'Train RMSE': result['train_metrics']['rmse'],
        'Test RMSE': result['test_metrics']['rmse'],
        'Train MAE': result['train_metrics']['mae'],
        'Test MAE': result['test_metrics']['mae'],
        'Train R¬≤': result['train_metrics']['r2'],
        'Test R¬≤': result['test_metrics']['r2'],
        # Negative when the test error is larger than the training error.
        'Overfit (RMSE)': result['train_metrics']['rmse'] - result['test_metrics']['rmse']
    })

comparison_df = pd.DataFrame(comparison_data)
# Lowest test RMSE first; the first row is the best model.
comparison_df = comparison_df.sort_values('Test RMSE')

print("\nüìä Model Performance Comparison:")
safe_display(comparison_df)

# Find best model: the index label of the top row is its position in model_keys
best_row = comparison_df.iloc[0]
best_model_name = model_keys[comparison_df.index[0]]
print(f"\nüèÜ Best Model: {best_row['Model']}")
print(f"  ‚Ä¢ Test RMSE: ${best_row['Test RMSE']:,.2f}")
print(f"  ‚Ä¢ Test R¬≤: {best_row['Test R¬≤']:.4f}")


In [0]:
# MAGIC %md
# MAGIC ## 8. Visualize Model Comparison


In [0]:

print("="*60)
print("VISUALIZING MODEL COMPARISON")
print("="*60)

# Create comparison plots: 2x2 grid, three grouped-bar panels plus one gap panel
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Shared bar geometry: one slot per model, train/test bars side by side
x = np.arange(len(comparison_df))
width = 0.35

# (axis, metric name used in the 'Train <metric>' / 'Test <metric>' columns,
#  y-axis label, panel title)
grouped_bar_panels = [
    (axes[0, 0], 'RMSE', 'RMSE ($)', 'RMSE Comparison (Lower is Better)'),
    (axes[0, 1], 'R¬≤', 'R¬≤ Score', 'R¬≤ Score Comparison (Higher is Better)'),
    (axes[1, 0], 'MAE', 'MAE ($)', 'MAE Comparison (Lower is Better)'),
]

for ax, metric, y_label, title in grouped_bar_panels:
    ax.bar(x - width/2, comparison_df[f'Train {metric}'], width, label=f'Train {metric}', alpha=0.8)
    ax.bar(x + width/2, comparison_df[f'Test {metric}'], width, label=f'Test {metric}', alpha=0.8)
    ax.set_xlabel('Model', fontsize=11, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=11, fontweight='bold')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

# 4. Overfitting Analysis
ax = axes[1, 1]
# The plotted value is Train RMSE - Test RMSE. A negative gap means the model
# does worse on unseen data than on the data it was fit to (the overfitting
# direction), so negative bars are red and the rest green.
colors = ['red' if gap < 0 else 'green' for gap in comparison_df['Overfit (RMSE)']]
ax.barh(comparison_df['Model'], comparison_df['Overfit (RMSE)'], color=colors, alpha=0.7)
ax.set_xlabel('Train RMSE - Test RMSE ($)', fontsize=11, fontweight='bold')
ax.set_ylabel('Model', fontsize=11, fontweight='bold')
ax.set_title('Overfitting Analysis (Closer to 0 is Better)', fontsize=12, fontweight='bold')
ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

print("\n‚úÖ Comparison visualizations complete")


In [0]:

# MAGIC %md
# MAGIC ## 9. Detailed Evaluation of Best Model


In [0]:

# Section banner, naming the model selected in the comparison cell
print("=" * 60, f"DETAILED EVALUATION: {best_model_name.upper()}", "=" * 60, sep="\n")

# Pull the winning entry and its fitted estimator out of the training results.
best_result = results[best_model_name]
best_model = best_result['model']

# Run the project evaluator on the test split for the selected model.
evaluator = ModelEvaluator(config)
evaluation_results = evaluator.evaluate_model(
    model=best_model, X_test=X_test, y_test=y_test, model_name=best_model_name,
)

print("\n‚úÖ Detailed evaluation complete")


In [0]:

# MAGIC %md
# MAGIC ## 10. Feature Importance Analysis


In [0]:

print("="*60)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*60)

# Three cases, chosen by which attribute the fitted estimator exposes:
#   feature_importances_ -> importance ranking
#   coef_                -> signed coefficients ranked by magnitude
#   neither              -> nothing to rank; say so explicitly
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    print(f"\nüìä Top 10 Most Important Features:")
    safe_display(feature_importance.head(10))

    # Plot feature importance (at most 15 bars, largest at the top)
    plt.figure(figsize=(12, 8))
    top_n = min(15, len(feature_importance))
    plt.barh(range(top_n), feature_importance['Importance'].head(top_n))
    plt.yticks(range(top_n), feature_importance['Feature'].head(top_n))
    plt.xlabel('Importance', fontsize=11, fontweight='bold')
    plt.ylabel('Feature', fontsize=11, fontweight='bold')
    plt.title(f'Top {top_n} Feature Importances - {best_model_name.title()}',
              fontsize=12, fontweight='bold')
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

elif hasattr(best_model, 'coef_'):
    # For linear models, show coefficients.
    # np.ravel flattens a (1, n_features) coef_ (produced when the target was
    # fit as a 2-D column) so the DataFrame constructor gets a 1-D array.
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Coefficient': np.ravel(best_model.coef_)
    }).sort_values('Coefficient', key=abs, ascending=False)

    print(f"\nüìä Top 10 Features by Coefficient Magnitude:")
    safe_display(feature_importance.head(10))

    # Plot coefficients: green for positive, red for zero or negative
    plt.figure(figsize=(12, 8))
    top_n = min(15, len(feature_importance))
    colors = ['green' if x > 0 else 'red' for x in feature_importance['Coefficient'].head(top_n)]
    plt.barh(range(top_n), feature_importance['Coefficient'].head(top_n), color=colors, alpha=0.7)
    plt.yticks(range(top_n), feature_importance['Feature'].head(top_n))
    plt.xlabel('Coefficient', fontsize=11, fontweight='bold')
    plt.ylabel('Feature', fontsize=11, fontweight='bold')
    plt.title(f'Top {top_n} Feature Coefficients - {best_model_name.title()}',
              fontsize=12, fontweight='bold')
    plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.show()

else:
    # Neither attribute is present on this object, so there is nothing to rank.
    # Previously this case produced no output at all before the "complete" line.
    print(
        f"\nNo feature_importances_ or coef_ found on {type(best_model).__name__}; "
        "skipping feature importance analysis."
    )

print("\n‚úÖ Feature importance analysis complete")

In [0]:

# MAGIC %md
# MAGIC ## 11. Register Best Model


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 11. Register Best Model

# COMMAND ----------

import traceback

from mlflow.models import infer_signature

print("="*60)
print("REGISTERING BEST MODEL TO MLFLOW")
print("="*60)


model_name_registry = config['mlflow']['model_registry_name']

print(f"\nüìã Registration Details:")
print(f"  ‚Ä¢ Registry Name: {model_name_registry}")
print(f"  ‚Ä¢ Best Model: {best_model_name}")

try:
    # Capture the input/output schema from the training features and the
    # model's own training predictions. A signature makes the logged model
    # self-describing, and Unity Catalog rejects registration of models that
    # were logged without one.
    signature = infer_signature(X_train, best_result['y_train_pred'])

    # Log and register model inside a single dedicated run
    with mlflow.start_run(run_name=f"best_model_{best_model_name}") as run:
        # Log parameters
        MLflowLogger.log_params_from_dict(best_result['params'])

        # Log metrics, prefixed by the split they were computed on
        MLflowLogger.log_metrics_from_dict({
            f"train_{k}": v for k, v in best_result['train_metrics'].items()
        })
        MLflowLogger.log_metrics_from_dict({
            f"test_{k}": v for k, v in best_result['test_metrics'].items()
        })

        # Log comparison DataFrame before the registration step so the
        # artifact is kept even if the registry call below fails.
        MLflowLogger.log_dataframe_as_artifact(comparison_df, "model_comparison.csv")

        # Log model and register it under the configured registry name
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="model",
            signature=signature,
            registered_model_name=model_name_registry
        )

        # Take the id from the run object yielded by the context manager
        run_id = run.info.run_id

    print(f"\n‚úÖ Model registered successfully!")
    print(f"  ‚Ä¢ Model Name: {model_name_registry}")
    print(f"  ‚Ä¢ Run ID: {run_id}")
    print(f"  ‚Ä¢ Best Model: {best_model_name}")

except Exception as e:
    # Best effort, as in the original: report the failure and let the notebook
    # continue. NOTE(review): a scheduled job will still finish "successfully"
    # when registration fails - consider re-raising if that is not intended.
    print(f"\n‚ùå Error registering model: {e}")
    traceback.print_exc()

print("="*60)

In [0]:
#%restart_python

In [0]:

# MAGIC %md
# MAGIC ## 12. Summary


In [0]:

# Section banner
print("=" * 60, "TRAINING SUMMARY", "=" * 60, sep="\n")

# How many models produced results, and which one was selected
print(f"\nüìä Models Trained: {len(results)}")
print(f"üèÜ Best Model: {best_model_name.title()}")

# Test-split metrics of the selected model
best_test_metrics = best_result['test_metrics']
print("\nüìà Best Model Performance:")
print(
    f"  ‚Ä¢ Test RMSE: ${best_test_metrics['rmse']:,.2f}",
    f"  ‚Ä¢ Test MAE:  ${best_test_metrics['mae']:,.2f}",
    f"  ‚Ä¢ Test R¬≤:   {best_test_metrics['r2']:.4f}",
    sep="\n",
)

# MLflow experiment and registry names used by this run
print("\nüì¶ MLflow:")
print(
    f"  ‚Ä¢ Experiment: {experiment_name}",
    f"  ‚Ä¢ Registered Model: {model_name_registry}",
    sep="\n",
)

print("\n‚úÖ Model training complete!")
print("=" * 60)


In [0]:

# MAGIC %md
# MAGIC ---
# MAGIC ## ✅ Next Steps
# MAGIC 
# MAGIC 1. **Model Deployment**: Deploy the best model for inference
# MAGIC 2. **Monitoring**: Set up model monitoring and drift detection
# MAGIC 3. **Retraining**: Schedule periodic model retraining
# MAGIC 4. **A/B Testing**: Compare new models against production model