In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # House Price Prediction - Feature Engineering
# MAGIC ## MLflow Production Pipeline - Step 2
# MAGIC 
# MAGIC **Objective**: Transform raw data into ML-ready features
# MAGIC 
# MAGIC **Author**: Satish  
# MAGIC **Date**: 2026-01-17
# MAGIC 
# MAGIC ---
# MAGIC 
# MAGIC ### What This Notebook Does:
# MAGIC - ✅ Load and validate raw data
# MAGIC - ✅ Create derived features
# MAGIC - ✅ Encode categorical variables
# MAGIC - ✅ Scale numerical features
# MAGIC - ✅ Split train/test datasets
# MAGIC - ✅ Save processed data and transformers
# MAGIC - ✅ Log everything to MLflow


In [0]:
# MAGIC %md
# MAGIC ## 1. Setup and Imports


In [0]:
#%restart_python

In [0]:
# Make the project importable and drop any previously imported copies of the
# project's `src` package so the imports below pick up the latest code.
import sys
import importlib

# Add project path
project_path = '/Workspace/COMM - Commercial Analytics (CMAN)/MMM Quattro 2025/Satish/MLFLOW_sample'
if project_path not in sys.path:
    sys.path.append(project_path)

# Remove cached src modules.
# Match the package itself ('src') and its submodules ('src.*') only; a bare
# startswith('src') would also evict unrelated modules whose names merely
# begin with those three letters.
modules_to_clear = [
    module_name
    for module_name in list(sys.modules)
    if module_name == 'src' or module_name.startswith('src.')
]
for module_name in modules_to_clear:
    # pop() tolerates an entry that disappeared between listing and removal.
    sys.modules.pop(module_name, None)

print(f"üîÑ Cleared {len(modules_to_clear)} cached modules")

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import warnings
warnings.filterwarnings('ignore')

# Custom imports
from src.utils import (
    ConfigLoader, 
    DataLoader, 
    DataValidator,
    safe_display,
    setup_mlflow_databricks,
    MLflowLogger,
    log_dataset_summary
)
from src.data_processing import DataProcessor
from src.feature_engineering import FeatureEngineer

# Apply the plotting defaults shared by the figures in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

# Echo the versions of the core libraries as a quick environment check.
print("‚úÖ All imports successful!")
for library_label, library in (("Pandas", pd), ("NumPy", np), ("MLflow", mlflow)):
    print(f"üì¶ {library_label} version: {library.__version__}")


In [0]:

# MAGIC %md
# MAGIC ## 2. Load Configuration


In [0]:

# Load configuration.
# The YAML file lives inside the project directory, so derive its location
# from `project_path` (set in the setup cell) instead of repeating the
# absolute workspace path a second time.
config_path = f'{project_path}/config/config.yaml'
config = ConfigLoader.load_config(config_path)

# Echo the settings that drive the rest of the notebook.
print("="*60)
print("CONFIGURATION LOADED")
print("="*60)
print(f"Project: {config['project']['name']}")
print(f"Version: {config['project']['version']}")
print(f"Target: {config['preprocessing']['target']}")
print(f"Test Size: {config['preprocessing']['test_size']}")
print(f"Random State: {config['preprocessing']['random_state']}")
print("="*60)

In [0]:

# MAGIC %md
# MAGIC ## 3. Setup MLflow


In [0]:

# Setup MLflow for Databricks
setup_mlflow_databricks(config)

# Start MLflow run for feature engineering.
# mlflow.start_run() raises if a run is already active, which happens when
# this cell is re-executed without reaching the final end_run() cell. Close
# any such leftover run first so the cell can be re-run safely.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run(run_name="feature_engineering")

print("‚úÖ MLflow tracking started")
print(f"üìä Experiment: {config['mlflow']['experiment_name']}")
print(f"üîó Run ID: {mlflow.active_run().info.run_id}")


In [0]:

# MAGIC %md
# MAGIC ## 4. Load Raw Data


In [0]:

# Read the raw dataset from the location named in the configuration.
data_path = config['data']['raw_path']
df = DataLoader.load_csv(data_path)

# Headline size figures for the loaded frame.
print("="*60, "RAW DATA LOADED", "="*60, sep="\n")
raw_memory_kb = df.memory_usage(deep=True).sum() / 1024
print(f"Shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
print(f"Memory: {raw_memory_kb:.2f} KB")
print("="*60)

# Preview of the first rows.
print("\nüìã First 5 rows:")
safe_display(df.head())

# Column dtypes as plain text.
print("\nüìä Data types:")
print(df.dtypes.to_string())


In [0]:

# MAGIC %md
# MAGIC ## 5. Initialize Processors


In [0]:

# Build the two project helpers used by the remaining cells; both are
# constructed from the loaded configuration.
processor = DataProcessor(config)
feature_engineer = FeatureEngineer(config)

print("‚úÖ DataProcessor initialized", "‚úÖ FeatureEngineer initialized", sep="\n")


In [0]:

# MAGIC %md
# MAGIC ## 6. Data Validation


In [0]:

# Validate the raw frame before any transformation is applied.
print("="*60, "DATA VALIDATION", "="*60, sep="\n")

validator = DataValidator()

# Every configured feature column plus the target must be present.
# (The configured feature entries are assumed to be lists, as the original
# list concatenation required.)
preprocessing_cfg = config['preprocessing']
all_required_cols = [
    *preprocessing_cfg['numerical_features'],
    *preprocessing_cfg['categorical_features'],
    preprocessing_cfg['target'],
]

try:
    validator.validate_dataframe(df, all_required_cols)
except ValueError as e:
    # Report the problem, then let the error stop the notebook.
    print(f"‚ùå Validation failed: {e}")
    raise
else:
    print("‚úÖ All required columns present")

# Missing-value total, recorded on the MLflow run.
missing = validator.check_missing_values(df)
mlflow.log_metric("raw_missing_values", int(missing.sum()))

# Duplicate total, recorded on the MLflow run.
duplicates = validator.check_duplicates(df)
mlflow.log_metric("raw_duplicates", int(duplicates))

print("\n" + "="*60, "VALIDATION COMPLETE", "="*60, sep="\n")


In [0]:

# MAGIC %md
# MAGIC ## 7. Feature Engineering


In [0]:

# MAGIC %md
# MAGIC ### 7.1 Create New Features


In [0]:

print("="*60)
print("CREATING NEW FEATURES")
print("="*60)

# Create features
df_engineered = feature_engineer.create_features(df)

# Identify the columns that were added.
# A list in DataFrame column order is used instead of a set difference: set
# iteration order is not stable between runs, and a list can be written to
# JSON directly by the later save/log cells. Membership tests, len() and
# list() on `new_features` keep working unchanged.
# NOTE(review): if create_features adds columns to `df` in place, this list
# will be empty — confirm it returns a new frame.
original_columns = set(df.columns)
new_features = [col for col in df_engineered.columns if col not in original_columns]

print(f"\n‚úÖ Created {len(new_features)} new features:")
for i, feat in enumerate(new_features, 1):
    print(f"  {i}. {feat}")

print(f"\nüìä New shape: {df_engineered.shape[0]} rows √ó {df_engineered.shape[1]} columns")

# Display sample
print("\nüìã Sample with new features:")
safe_display(df_engineered.head())

# Log to MLflow (total_features counts every column of the engineered frame)
mlflow.log_metric("features_created", len(new_features))
mlflow.log_metric("total_features", df_engineered.shape[1])

In [0]:

# MAGIC %md
# MAGIC ### 7.2 Analyze New Features


In [0]:

# Analyze new features: correlation of each engineered numeric column with
# the target, one scatter plot per feature, and the strongest correlation
# logged to MLflow.
print("="*60)
print("NEW FEATURES ANALYSIS")
print("="*60)

target = config['preprocessing']['target']

# Later cells reuse `new_feature_list`, so it is kept as a plain list.
new_feature_list = list(new_features)
if new_feature_list:
    correlations = {}
    for feat in new_feature_list:
        if pd.api.types.is_numeric_dtype(df_engineered[feat]):
            corr = df_engineered[feat].corr(df_engineered[target])
            # A NaN correlation (e.g. a constant column) cannot be ranked or
            # plotted with a trend line, so it is reported and skipped.
            if pd.notna(corr):
                correlations[feat] = corr
            else:
                print(f"  (skipping {feat}: correlation with {target} is undefined)")

    # Sort by absolute correlation
    sorted_corr = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)

    print("\nüîó Correlation with target:")
    for feat, corr in sorted_corr:
        print(f"  {feat}: {corr:.4f}")

    if sorted_corr:
        # squeeze=False always returns a 2-D axes array, so a single feature
        # needs no special case.
        fig, axes = plt.subplots(1, len(sorted_corr),
                                 figsize=(6*len(sorted_corr), 5), squeeze=False)

        for ax, (feat, corr) in zip(axes[0], sorted_corr):
            ax.scatter(df_engineered[feat], df_engineered[target],
                       alpha=0.6, color='purple', edgecolors='black', linewidth=0.5)
            ax.set_xlabel(feat, fontsize=11)
            ax.set_ylabel(target, fontsize=11)
            ax.set_title(f'{feat} vs {target}\n(r = {corr:.3f})',
                         fontsize=12, fontweight='bold')
            ax.grid(True, alpha=0.3)

            # Add trend line. np.polyfit cannot handle missing values, so the
            # fit uses only rows where both the feature and the target exist.
            pairs = df_engineered[[feat, target]].dropna()
            z = np.polyfit(pairs[feat].astype(float), pairs[target].astype(float), 1)
            p = np.poly1d(z)
            # Two end points are enough to draw a straight line.
            x_ends = np.array([pairs[feat].min(), pairs[feat].max()], dtype=float)
            ax.plot(x_ends, p(x_ends), "r--", alpha=0.8, linewidth=2, label='Trend')
            ax.legend()

        plt.tight_layout()
        plt.show()

        # Log top correlation
        mlflow.log_metric("top_new_feature_corr", float(abs(sorted_corr[0][1])))


In [0]:

# MAGIC %md
# MAGIC ### 7.3 Feature Statistics


In [0]:

# Descriptive statistics for every numeric engineered feature.
print("="*60, "NEW FEATURES STATISTICS", "="*60, sep="\n")

# (label, Series method name, format spec) for each printed statistic.
stat_rows = (
    ("Mean", "mean", ".2f"),
    ("Median", "median", ".2f"),
    ("Std", "std", ".2f"),
    ("Min", "min", ".2f"),
    ("Max", "max", ".2f"),
    ("Skewness", "skew", ".4f"),
)

for feat in new_feature_list:
    feature_values = df_engineered[feat]
    if not pd.api.types.is_numeric_dtype(feature_values):
        continue
    print(f"\n{feat}:")
    for stat_label, method_name, spec in stat_rows:
        heading = f"{stat_label}:"
        stat_value = getattr(feature_values, method_name)()
        # The heading is padded to 10 characters so the values line up.
        print(f"  {heading:<10}{stat_value:{spec}}")


In [0]:

# MAGIC %md
# MAGIC ## 8. Encode Categorical Features


In [0]:
# Section 8: encode the categorical columns of the engineered frame.
print("="*60, "ENCODING CATEGORICAL FEATURES", "="*60, sep="\n")

# Encoding is done by the feature engineer, not by the data processor.
df_encoded, label_encoders = feature_engineer.encode_categorical(df_engineered)

# List every encoder with its class -> code mapping.
print(f"\n‚úÖ Encoded {len(label_encoders)} categorical features:")
for col, encoder in label_encoders.items():
    print(f"\n  üìå {col}:")
    print(f"    Classes: {list(encoder.classes_)}")
    class_codes = encoder.transform(encoder.classes_)
    # assumes classes_ holds unique values (as for a label encoder) — TODO confirm
    for original, encoded in zip(encoder.classes_, class_codes):
        print(f"      {original} ‚Üí {encoded}")

# Show the categorical columns before and after encoding.
categorical_cols = config['preprocessing']['categorical_features']

print("\nüìã Before encoding:")
safe_display(df_engineered[categorical_cols].head())

print("\nüìã After encoding:")
safe_display(df_encoded[categorical_cols].head())

# Record the number of encoded columns on the MLflow run.
mlflow.log_metric("categorical_features_encoded", len(label_encoders))

In [0]:

# MAGIC %md
# MAGIC ## 9. Train-Test Split


In [0]:
# Section 9: split the encoded frame into train and test partitions.
print("="*60, "TRAIN-TEST SPLIT", "="*60, sep="\n")

X_train, X_test, y_train, y_test = processor.split_data(df_encoded)

# Shape and target range for each partition.
partitions = (
    ("Training set", "train", X_train, y_train),
    ("Test set", "test", X_test, y_test),
)
for partition_title, tag, features_part, target_part in partitions:
    print(f"\nüìä {partition_title}:")
    print(f"  X_{tag}: {features_part.shape}")
    print(f"  y_{tag}: {target_part.shape}")
    print(f"  y_{tag} range: [{target_part.min():.2f}, {target_part.max():.2f}]")

# Share of rows that landed in each partition.
total_rows = len(df_encoded)
print(f"\nüìà Split ratio:")
print(f"  Train: {len(X_train) / total_rows * 100:.1f}%")
print(f"  Test:  {len(X_test) / total_rows * 100:.1f}%")

# Side-by-side histograms of the target in both partitions.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
target_label = config['preprocessing']['target']
histogram_panels = (
    (y_train, 'blue', 'Training Set - Target Distribution'),
    (y_test, 'green', 'Test Set - Target Distribution'),
)
for ax, (target_values, bar_color, panel_title) in zip(axes, histogram_panels):
    ax.hist(target_values, bins=30, alpha=0.7, color=bar_color, edgecolor='black')
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xlabel(target_label, fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Record the split settings and the target statistics on the MLflow run.
split_params = {
    "train_size": len(X_train),
    "test_size": len(X_test),
    "test_ratio": config['preprocessing']['test_size'],
    "random_state": config['preprocessing']['random_state']
}
mlflow.log_params(split_params)

target_metrics = {
    "y_train_mean": float(y_train.mean()),
    "y_train_std": float(y_train.std()),
    "y_test_mean": float(y_test.mean()),
    "y_test_std": float(y_test.std())
}
mlflow.log_metrics(target_metrics)

In [0]:

# MAGIC %md
# MAGIC ## 10. Feature Scaling


In [0]:

# Scale the train/test features and show what the scaler did.
print("="*60, "FEATURE SCALING", "="*60, sep="\n")

X_train_scaled, X_test_scaled, scaler = processor.scale_features(X_train, X_test)

print(f"\n‚úÖ Scaled {X_train_scaled.shape[1]} features using StandardScaler")

# Fitted parameters for the first five columns.
# NOTE(review): indexing scaler.mean_/scale_ by X_train column position
# assumes the scaler was fitted on all X_train columns in this order — confirm.
print("\nüìä Scaling parameters (first 5 features):")
for position, col in enumerate(X_train.columns[:5]):
    print(f"  {col}:")
    print(f"    Mean:  {scaler.mean_[position]:>10.4f}")
    print(f"    Scale: {scaler.scale_[position]:>10.4f}")

# Mean/std before and after scaling for the first three columns.
print("\nüìà Scaling effect (first 3 features):")
for col in X_train.columns[:3]:
    raw_values, scaled_values = X_train[col], X_train_scaled[col]
    print(f"\n  {col}:")
    print(f"    Before: mean={raw_values.mean():>8.2f}, std={raw_values.std():>8.2f}")
    print(f"    After:  mean={scaled_values.mean():>8.2f}, std={scaled_values.std():>8.2f}")

# Box plots of the first five columns, unscaled (left) and scaled (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
boxplot_panels = (
    (X_train, 'Before Scaling (First 5 Features)', 'Value'),
    (X_train_scaled, 'After Scaling (First 5 Features)', 'Scaled Value'),
)
for ax, (frame, panel_title, y_label) in zip(axes, boxplot_panels):
    shown_columns = frame.columns[:5]
    ax.boxplot([frame[name] for name in shown_columns], labels=shown_columns)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_ylabel(y_label, fontsize=11)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Record the scaling method on the MLflow run.
mlflow.log_param("scaling_method", "StandardScaler")


In [0]:

# MAGIC %md
# MAGIC ## 11. Final Feature Set


In [0]:

# Summarize the final, scaled feature matrix.
print("="*60, "FINAL FEATURE SET", "="*60, sep="\n")

feature_total = X_train_scaled.shape[1]
print(f"\nüìä Total features: {feature_total}")

# Numbered column list; engineered columns carry a marker.
print(f"\nüìã Feature list:")
for i, col in enumerate(X_train_scaled.columns, 1):
    is_new = "  " if col not in new_features else "üÜï"
    print(f"  {i:2d}. {is_new} {col}")

# The configured feature groups; later cells reuse these two names.
numerical_features = config['preprocessing']['numerical_features']
categorical_features = config['preprocessing']['categorical_features']

print(f"\nüìà Feature breakdown:")
print(f"  Original numerical:   {len(numerical_features)}")
print(f"  Original categorical: {len(categorical_features)}")
print(f"  Engineered features:  {len(new_features)}")
print(f"  {'‚îÄ' * 40}")
print(f"  Total:                {feature_total}")

# Previews of the scaled training features and the training target.
print("\nüìã Training data sample (scaled):")
safe_display(X_train_scaled.head())

print("\nüìã Target variable sample:")
safe_display(y_train.head().to_frame())


In [0]:

# MAGIC %md
# MAGIC ## 12. Data Quality Report


In [0]:

# Quality report for the scaled train/test matrices.
print("="*60, "PROCESSED DATA QUALITY REPORT", "="*60, sep="\n")

print("\nüîç Quality checks:")

# Count of missing cells in each matrix.
missing_train = X_train_scaled.isna().sum().sum()
missing_test = X_test_scaled.isna().sum().sum()
print(f"  Missing values (train): {missing_train}")
print(f"  Missing values (test):  {missing_test}")

# Count of +/-inf cells in each matrix.
inf_train = np.isinf(X_train_scaled.to_numpy()).sum()
inf_test = np.isinf(X_test_scaled.to_numpy()).sum()
print(f"  Infinite values (train): {inf_train}")
print(f"  Infinite values (test):  {inf_test}")

# Number of columns per dtype.
print(f"\nüìã Data types:")
print(X_train_scaled.dtypes.value_counts().to_string())

# describe() table for the training matrix.
print(f"\nüìä Summary statistics (train):")
summary_stats = X_train_scaled.describe()
safe_display(summary_stats)

# Record the four counts on the MLflow run as plain ints.
quality_counts = {
    "processed_missing_train": missing_train,
    "processed_missing_test": missing_test,
    "processed_inf_train": inf_train,
    "processed_inf_test": inf_test,
}
mlflow.log_metrics({metric: int(count) for metric, count in quality_counts.items()})

# Any non-zero count marks the data as needing review.
if any(quality_counts.values()):
    print("\n‚ö†Ô∏è Data quality issues detected - review before modeling")
    mlflow.log_param("data_quality", "ISSUES_DETECTED")
else:
    print("\n‚úÖ Data quality: EXCELLENT - Ready for modeling!")
    mlflow.log_param("data_quality", "EXCELLENT")


In [0]:
# MAGIC %md
# MAGIC ## 13. Save Processed Data


In [0]:
# Section 13: save the processed datasets, the fitted transformers and the
# feature metadata, then attach the small files to the active MLflow run.
import json
import os
import pickle

print("="*60)
print("SAVING PROCESSED DATA")
print("="*60)

processed_path = config['data']['processed_path']

# Ensure processed directory exists.
# NOTE(review): file names are appended directly to processed_path below, so
# the configured value is expected to end with a path separator — confirm.
os.makedirs(processed_path, exist_ok=True)

# Save datasets (DataLoader is already imported in the setup cell)
try:
    DataLoader.save_csv(X_train_scaled, f'{processed_path}X_train.csv')
    DataLoader.save_csv(X_test_scaled, f'{processed_path}X_test.csv')
    DataLoader.save_csv(y_train.to_frame(), f'{processed_path}y_train.csv')
    DataLoader.save_csv(y_test.to_frame(), f'{processed_path}y_test.csv')

    print("\n‚úÖ Datasets saved:")
    print(f"  ‚Ä¢ X_train.csv: {X_train_scaled.shape}")
    print(f"  ‚Ä¢ X_test.csv:  {X_test_scaled.shape}")
    print(f"  ‚Ä¢ y_train.csv: {y_train.shape}")
    print(f"  ‚Ä¢ y_test.csv:  {y_test.shape}")
except Exception as e:
    # Report which stage failed, then stop the notebook.
    print(f"‚ùå Error saving datasets: {e}")
    raise

# Save preprocessors
try:
    # Save label encoders
    with open(f'{processed_path}label_encoders.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)
    print("\n‚úÖ Label encoders saved")

    # Save scaler
    with open(f'{processed_path}scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    print("‚úÖ Scaler saved")
except Exception as e:
    print(f"‚ùå Error saving preprocessors: {e}")
    raise


def to_list(obj):
    """Return `obj` as a JSON-serializable list.

    Sets are sorted so the saved file is identical between runs (set
    iteration order is not stable); tuples are copied; lists are returned
    as-is; any other value is wrapped in a one-element list.
    """
    if isinstance(obj, (set, frozenset)):
        return sorted(obj)
    if isinstance(obj, tuple):
        return list(obj)
    if isinstance(obj, list):
        return obj
    return [obj]


# Save feature names (ensure all are JSON-serializable)
try:
    feature_names = {
        'all_features': list(X_train_scaled.columns),
        'created_features': to_list(new_features),
        'numerical_features': to_list(numerical_features),
        'categorical_features': to_list(categorical_features),
        'target': config['preprocessing']['target'],
        'total_features': len(X_train_scaled.columns),
        'train_samples': len(X_train_scaled),
        'test_samples': len(X_test_scaled)
    }

    with open(f'{processed_path}feature_names.json', 'w') as f:
        json.dump(feature_names, f, indent=2)
    print("‚úÖ Feature names saved")

    # Display saved feature info
    print(f"\nüìã Feature Information:")
    print(f"  ‚Ä¢ Total features: {feature_names['total_features']}")
    print(f"  ‚Ä¢ Created features: {len(feature_names['created_features'])}")
    print(f"  ‚Ä¢ Training samples: {feature_names['train_samples']}")
    print(f"  ‚Ä¢ Test samples: {feature_names['test_samples']}")

except Exception as e:
    print(f"‚ùå Error saving feature names: {e}")
    raise

print(f"\nüìÅ All files saved to: {processed_path}")

# Log artifacts to MLflow.
# This step is best-effort: a failure is reported as a warning and does not
# stop the notebook, because the files are already on disk.
try:
    mlflow.log_artifact(f'{processed_path}feature_names.json')
    mlflow.log_artifact(f'{processed_path}label_encoders.pkl')
    mlflow.log_artifact(f'{processed_path}scaler.pkl')

    # Log summary metrics
    mlflow.log_params({
        'total_features': feature_names['total_features'],
        'created_features_count': len(feature_names['created_features']),
        'train_samples': feature_names['train_samples'],
        'test_samples': feature_names['test_samples']
    })

    print("\n‚úÖ Artifacts logged to MLflow")
except Exception as e:
    print(f"‚ö†Ô∏è Warning: Could not log to MLflow: {e}")

print("\n" + "="*60)
print("‚úÖ FEATURE ENGINEERING COMPLETE!")
print("="*60)

In [0]:
# MAGIC %md
# MAGIC ## 12. Data Quality Report


In [0]:

# Quality report for the scaled train/test matrices.
# This cell repeats the quality checks that also appear earlier in the
# notebook and logs the same metric and parameter names again.
print("="*60, "PROCESSED DATA QUALITY REPORT", "="*60, sep="\n")

print("\nüîç Quality checks:")

# Missing cells per matrix.
missing_train = X_train_scaled.isna().sum().sum()
missing_test = X_test_scaled.isna().sum().sum()
print(f"  Missing values (train): {missing_train}")
print(f"  Missing values (test):  {missing_test}")

# Infinite cells per matrix.
inf_train = np.isinf(X_train_scaled.to_numpy()).sum()
inf_test = np.isinf(X_test_scaled.to_numpy()).sum()
print(f"  Infinite values (train): {inf_train}")
print(f"  Infinite values (test):  {inf_test}")

# Column count per dtype.
print(f"\nüìã Data types:")
print(X_train_scaled.dtypes.value_counts().to_string())

# describe() table for the training matrix.
print(f"\nüìä Summary statistics (train):")
summary_stats = X_train_scaled.describe()
safe_display(summary_stats)

# Record the four counts on the MLflow run as plain ints.
mlflow.log_metrics(dict(
    processed_missing_train=int(missing_train),
    processed_missing_test=int(missing_test),
    processed_inf_train=int(inf_train),
    processed_inf_test=int(inf_test),
))

# The data is clean only when all four counts are zero.
data_is_clean = not (missing_train or missing_test or inf_train or inf_test)
if data_is_clean:
    print("\n‚úÖ Data quality: EXCELLENT - Ready for modeling!")
    mlflow.log_param("data_quality", "EXCELLENT")
else:
    print("\n‚ö†Ô∏è Data quality issues detected - review before modeling")
    mlflow.log_param("data_quality", "ISSUES_DETECTED")


In [0]:

# MAGIC %md
# MAGIC ## 13. Save Processed Data


In [0]:

# Save the processed datasets, fitted transformers and feature metadata.
# This cell writes the same files as the earlier "Save Processed Data" cell,
# so it must produce the same feature_names.json content.
import json
import pickle

print("="*60)
print("SAVING PROCESSED DATA")
print("="*60)

processed_path = config['data']['processed_path']

# Save datasets
DataLoader.save_csv(X_train_scaled, f'{processed_path}X_train.csv')
DataLoader.save_csv(X_test_scaled, f'{processed_path}X_test.csv')
DataLoader.save_csv(y_train.to_frame(), f'{processed_path}y_train.csv')
DataLoader.save_csv(y_test.to_frame(), f'{processed_path}y_test.csv')

print("\n‚úÖ Datasets saved:")
print(f"  ‚Ä¢ X_train.csv: {X_train_scaled.shape}")
print(f"  ‚Ä¢ X_test.csv:  {X_test_scaled.shape}")
print(f"  ‚Ä¢ y_train.csv: {y_train.shape}")
print(f"  ‚Ä¢ y_test.csv:  {y_test.shape}")

# Save label encoders
with open(f'{processed_path}label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)
print("\n‚úÖ Label encoders saved")

# Save scaler
with open(f'{processed_path}scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("‚úÖ Scaler saved")

# Save feature names.
# The previous version dumped list(feature_names), i.e. only the dictionary
# keys, which replaced the real feature lists on disk with four key names.
# The full dictionary is written instead; a set is converted to a sorted
# list because json cannot serialize sets and set order is not stable.
created_features = (
    sorted(new_features)
    if isinstance(new_features, (set, frozenset))
    else list(new_features)
)
feature_names = {
    'all_features': list(X_train_scaled.columns),
    'created_features': created_features,
    'numerical_features': list(numerical_features),
    'categorical_features': list(categorical_features),
    'target': config['preprocessing']['target'],
    'total_features': len(X_train_scaled.columns),
    'train_samples': len(X_train_scaled),
    'test_samples': len(X_test_scaled)
}
with open(f'{processed_path}feature_names.json', 'w') as f:
    json.dump(feature_names, f, indent=2)
print("‚úÖ Feature names saved")

print(f"\nüìÅ All files saved to: {processed_path}")

In [0]:

# MAGIC %md
# MAGIC ## 14. Log Artifacts to MLflow


In [0]:
# Section 14: log the processed datasets and the feature/preprocessing
# metadata to the active MLflow run.
# The lowercase alias is kept because the following cell calls
# `mlflow_logger` as well.
from src.utils import MLflowLogger as mlflow_logger

print("="*60)
print("LOGGING TO MLFLOW")
print("="*60)

# Log datasets as artifacts
mlflow_logger.log_dataframe_as_artifact(X_train_scaled, "X_train.csv")
mlflow_logger.log_dataframe_as_artifact(X_test_scaled, "X_test.csv")
mlflow_logger.log_dataframe_as_artifact(y_train.to_frame(), "y_train.csv")
mlflow_logger.log_dataframe_as_artifact(y_test.to_frame(), "y_test.csv")

print("‚úÖ Datasets logged to MLflow")

# Log feature names.
# Every collection is turned into a list so the dictionary can be written as
# JSON; a set is sorted so the artifact is identical between runs.
engineered_features = (
    sorted(new_features)
    if isinstance(new_features, (set, frozenset))
    else list(new_features)
)
feature_info = {
    "total_features": int(X_train_scaled.shape[1]),
    "feature_names": list(X_train_scaled.columns),
    "numerical_features": list(numerical_features),
    "categorical_features": list(categorical_features),
    "engineered_features": engineered_features
}
mlflow_logger.log_artifact_from_dict(feature_info, "feature_info.json")

print("‚úÖ Feature info logged to MLflow")

# Log preprocessing info
preprocessing_info = {
    "label_encoders": {k: list(v.classes_) for k, v in label_encoders.items()},
    "scaler_mean": scaler.mean_.tolist(),
    "scaler_scale": scaler.scale_.tolist(),
    "train_size": int(len(X_train)),
    "test_size": int(len(X_test))
}
mlflow_logger.log_artifact_from_dict(preprocessing_info, "preprocessing_info.json")

print("‚úÖ Preprocessing info logged to MLflow")

# TODO: feature statistics are not logged. The earlier code for it was
# commented out and referred to `feature_stats`, which is not defined in the
# visible notebook, so the "Feature statistics logged" message was removed
# rather than reporting a step that never ran.

print("\nüìä All artifacts logged successfully!")

In [0]:
# Log feature names (repeats the feature/preprocessing logging of the
# previous cell and relies on its `mlflow_logger` alias).
# `new_features` may be a set, which cannot be written as JSON, so every
# collection is converted to a list first; a set is sorted so the artifact
# is identical between runs.
engineered_features = (
    sorted(new_features)
    if isinstance(new_features, (set, frozenset))
    else list(new_features)
)
feature_info = {
    "total_features": int(X_train_scaled.shape[1]),
    "feature_names": list(X_train_scaled.columns),
    "numerical_features": list(numerical_features),
    "categorical_features": list(categorical_features),
    "engineered_features": engineered_features
}
mlflow_logger.log_artifact_from_dict(feature_info, "feature_info.json")

print("‚úÖ Feature info logged to MLflow")

# Log preprocessing info
preprocessing_info = {
    "label_encoders": {k: list(v.classes_) for k, v in label_encoders.items()},
    "scaler_mean": scaler.mean_.tolist(),
    "scaler_scale": scaler.scale_.tolist(),
    "train_size": int(len(X_train)),
    "test_size": int(len(X_test))
}
mlflow_logger.log_artifact_from_dict(preprocessing_info, "preprocessing_info.json")

print("‚úÖ Preprocessing info logged to MLflow")

# No feature statistics are logged by this cell, so the former
# "Feature statistics logged" message was removed.

print("\nüìä All artifacts logged successfully!")


In [0]:

# MAGIC %md
# MAGIC ## 15. Feature Engineering Summary


In [0]:

# Print a recap of the whole notebook run.
# The values interpolated below (df, numerical_features, categorical_features,
# new_features, label_encoders, X_train_scaled, X_test_scaled, target,
# missing_*/inf_*, processed_path) all come from earlier cells, and the run ID
# is read from the currently active MLflow run, so this cell must execute
# before the run is ended.
# NOTE(review): the check-marked lines such as "Data types: All numeric" and
# "All artifacts logged" are fixed text, not results computed in this cell.
print("="*60)
print("FEATURE ENGINEERING SUMMARY")
print("="*60)

print(f"""
üìä DATA TRANSFORMATION COMPLETE

INPUT:
  ‚Ä¢ Raw data: {df.shape[0]} rows √ó {df.shape[1]} columns
  ‚Ä¢ Numerical features: {len(numerical_features)}
  ‚Ä¢ Categorical features: {len(categorical_features)}

TRANSFORMATIONS:
  ‚úì Created {len(new_features)} engineered features
  ‚úì Encoded {len(label_encoders)} categorical variables
  ‚úì Scaled all numerical features
  ‚úì Split into train/test sets

OUTPUT:
  ‚Ä¢ Training set: {X_train_scaled.shape[0]} samples √ó {X_train_scaled.shape[1]} features
  ‚Ä¢ Test set: {X_test_scaled.shape[0]} samples √ó {X_test_scaled.shape[1]} features
  ‚Ä¢ Target variable: {target}

DATA QUALITY:
  ‚úì Missing values: {missing_train + missing_test}
  ‚úì Infinite values: {inf_train + inf_test}
  ‚úì Data types: All numeric
  ‚úì Scaling: StandardScaler applied

SAVED FILES:
  üìÅ {processed_path}
    ‚Ä¢ X_train.csv
    ‚Ä¢ X_test.csv
    ‚Ä¢ y_train.csv
    ‚Ä¢ y_test.csv
    ‚Ä¢ label_encoders.pkl
    ‚Ä¢ scaler.pkl
    ‚Ä¢ feature_names.json

MLFLOW:
  ‚úì All metrics logged
  ‚úì All parameters logged
  ‚úì All artifacts logged
  üîó Run ID: {mlflow.active_run().info.run_id}

STATUS: ‚úÖ READY FOR MODEL TRAINING
""")

print("="*60)



In [0]:
# MAGIC %md
# MAGIC ## 16. Next Steps



In [0]:

# Print static guidance for the follow-up model-training notebook.
# Nothing is computed here; the text below is a fixed message.
print("="*60)
print("NEXT STEPS")
print("="*60)

print("""
üéØ READY FOR MODEL TRAINING (Notebook 03)

The processed data is now ready for:
  1. ‚úÖ Model training with multiple algorithms
  2. ‚úÖ Hyperparameter tuning
  3. ‚úÖ Model evaluation and comparison
  4. ‚úÖ Model selection and registration
  5. ‚úÖ Model deployment

Recommended models to try:
  ‚Ä¢ Linear Regression (baseline)
  ‚Ä¢ Ridge Regression
  ‚Ä¢ Lasso Regression
  ‚Ä¢ Random Forest Regressor
  ‚Ä¢ Gradient Boosting Regressor
  ‚Ä¢ XGBoost (if available)

All data and transformers are saved and logged to MLflow!
""")

print("="*60)


In [0]:

# MAGIC %md
# MAGIC ## 17. End MLflow Run


In [0]:

# Close the MLflow run that the notebook has been logging to.
mlflow.end_run()

# Closing banner and hand-off message.
print("="*60, "‚úÖ FEATURE ENGINEERING COMPLETE!", "="*60, sep="\n")
print("\nüìä MLflow run ended successfully")
print("üéØ Ready for Model Training (Notebook 03)")
print("\n" + "="*60)