# Model Comparison: Comparing Tree-Based Models

In this notebook, we'll create a model comparison engine — a way to compare different decision-tree-based models and feature combinations.

**We'll focus on understanding:**

1. How to rigorously compare model performance
2. The impact of different feature engineering strategies
3. Trade-offs between model complexity and performance
4. Best practices for model evaluation and selection

## Table of Contents

1. [Introduction to Model Comparison](#introduction)
2. [Understanding the Beast Pipeline](#understanding-beast)
3. [Feature Engineering Deep Dive](#feature-engineering)
4. [Model Comparison Framework](#comparison-framework)
5. [Results Analysis & Visualization](#results-analysis)
6. [Model Selection Guidelines](#model-selection)
7. [Production Considerations](#production)

## Introduction to Model Comparison

When developing machine learning solutions, we often need to compare different:

- Model types (Decision Trees, Random Forests, XGBoost)
- Feature engineering strategies
- Hyperparameter configurations
- Training approaches

Since we have LLMs these days, we could run all of these comparisons systematically in one go, but we need to make sure we do so while ensuring:

- Fair evaluation conditions
- No data leakage
- Proper cross-validation
- Comprehensive metrics
- Statistical significance

Let's start by importing the necessary libraries and loading our data.

In [None]:
# Core data manipulation and analysis
import numpy as np
import pandas as pd

# Machine Learning - Core
from sklearn.model_selection import (
    train_test_split, 
    KFold, 
)

# Preprocessing and Encoding
from sklearn.preprocessing import (
    OneHotEncoder
)

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error, 
    r2_score,
)

# Advanced ML models
from xgboost import XGBRegressor

# System utilities
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import pickle

# Configure display options: show every column and render floats with
# thousands separators and two decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

# Set random seeds; RANDOM_STATE is also the default seed of the classes below
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Load the preprocessed data
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles from a trusted source
with open('../data/df_with_outcode.pkl', 'rb') as f:
    df_with_outcode = pickle.load(f)

print("Data loaded successfully!")
print(f"Shape: {df_with_outcode.shape}")
print("\nFirst few rows:")
# `display` is not imported here; presumably supplied by the Jupyter runtime
display(df_with_outcode.head())

In [2]:
@dataclass
class FeatureSet:
    """
    A named pair of train/validation feature matrices with their targets.

    Attributes:
        X_train: Feature matrix the model is fitted on
        X_val: Feature matrix the fitted model is scored on
        y_train: Target values aligned with X_train
        y_val: Target values aligned with X_val
        name: Short identifier of the feature combination
        description: Human-readable summary of the included features
    """
    X_train: pd.DataFrame
    X_val: pd.DataFrame
    y_train: pd.Series
    y_val: pd.Series
    name: str
    description: str

In [None]:
class PreProcessor:
    """
    Handles initial data transformations and train/test splitting

    The split is stratified on quantile bands of the log price so that the
    train and test sets cover the price range in similar proportions.
    """

    def __init__(self,
                 random_state: int = RANDOM_STATE,
                 test_size: float = 0.2,
                 n_price_bands: int = 10):
        """
        Args:
            random_state: Seed passed to train_test_split
            test_size: Share of rows held out as the test set
            n_price_bands: Number of quantile bands used for stratification
        """
        self.random_state = random_state
        self.test_size = test_size
        self.n_price_bands = n_price_bands

    def prepare_pre_split_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Creates features that must be calculated before train/test split

        Args:
            df: Raw data containing a 'Price' column

        Returns:
            Copy of df with 'log_price' and 'price_band' columns added
        """
        df_processed = df.copy()

        # Log transform price
        df_processed['log_price'] = np.log(df_processed['Price'])

        # Create price bands for stratification. duplicates='drop' merges bands
        # whose quantile edges coincide (heavily tied prices) instead of raising.
        df_processed['price_band'] = pd.qcut(
            df_processed['log_price'],
            q=self.n_price_bands,
            labels=False,
            duplicates='drop'
        )

        return df_processed

    def create_train_test_split(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Performs stratified train/test split using price bands

        Args:
            df: Output of prepare_pre_split_features (needs 'price_band')

        Returns:
            Tuple of (train_data, test_data)
        """
        train_data, test_data = train_test_split(
            df,
            test_size=self.test_size,
            stratify=df['price_band'],
            random_state=self.random_state
        )

        return train_data, test_data

print("PreProcessor class loaded successfully!")

In [4]:
class FeatureEncoder:
    """Handles all feature engineering and encoding with fold awareness"""
    
    def __init__(self, smoothing_factor: int = 10, min_location_freq: int = 5, random_state: int = RANDOM_STATE):
        """
        Store the encoding hyperparameters.

        Args:
            smoothing_factor: Added to the category count in the denominator of
                the weight used by the postcode and location target encodings
            min_location_freq: Locations seen fewer times than this in the
                training data fall back to the postcode encoding
            random_state: Seed for the inner KFold splits of the out-of-fold encodings
        """
        self.random_state = random_state
        self.min_location_freq = min_location_freq
        self.smoothing_factor = smoothing_factor

    def _calculate_outcode_price_per_sqft(self,
                                        fold_train: pd.DataFrame,
                                        fold_val: pd.DataFrame) -> Dict[str, pd.Series]:
        """
        Encode each row by the mean price per square foot of its outcode.

        Training rows receive out-of-fold values from an inner 5-fold split, so
        a row's own price never contributes to its feature. Validation rows use
        means computed on the whole of fold_train. Outcodes missing from the
        fitting rows fall back to the overall mean of those rows.

        Args:
            fold_train: Training data for current fold
            fold_val: Validation data for current fold

        Returns:
            Dictionary with 'train' and 'val' series of outcode mean price per sqft
        """
        def outcode_rates(frame: pd.DataFrame) -> Tuple[pd.Series, float]:
            """Return (mean per outcode, overall mean) of price per sq ft in frame."""
            rate = frame['Price'] / frame['Area in sq ft']
            return rate.groupby(frame['Outcode']).mean(), rate.mean()

        # Out-of-fold values for the training rows
        train_rates = pd.Series(index=fold_train.index, dtype='float64')
        splitter = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        for fit_pos, hold_pos in splitter.split(fold_train):
            per_outcode, overall = outcode_rates(fold_train.iloc[fit_pos])
            held_out_codes = fold_train.iloc[hold_pos]['Outcode']
            train_rates.iloc[hold_pos] = held_out_codes.map(per_outcode).fillna(overall)

        # Validation rows are encoded from the full training fold
        per_outcode, overall = outcode_rates(fold_train)
        val_rates = fold_val['Outcode'].map(per_outcode).fillna(overall)

        return {'train': train_rates, 'val': val_rates}

    def _one_hot_encode(self,
                        fold_train: pd.DataFrame,
                        fold_val: pd.DataFrame,
                        column: str) -> Dict[str, pd.DataFrame]:
        """
        One-hot encode a single categorical column with fold awareness

        The encoder is fitted on fold_train only. Categories that appear only
        in fold_val are encoded as all-zero rows (handle_unknown='ignore').

        Args:
            fold_train: Training data for current fold
            fold_val: Validation data for current fold
            column: Name of the categorical column to encode

        Returns:
            Dictionary with 'train' and 'val' dense indicator DataFrames that
            keep the index of their source frame
        """
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
        # the old keyword; try the new name first and fall back for older
        # releases, which reject the unknown keyword with a TypeError.
        try:
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

        # Fit on fold's training data, then reuse the fitted categories
        train_matrix = encoder.fit_transform(fold_train[[column]])
        val_matrix = encoder.transform(fold_val[[column]])
        encoded_columns = encoder.get_feature_names_out([column])

        return {
            'train': pd.DataFrame(train_matrix, columns=encoded_columns, index=fold_train.index),
            'val': pd.DataFrame(val_matrix, columns=encoded_columns, index=fold_val.index)
        }

    def _encode_house_type(self,
                          fold_train: pd.DataFrame,
                          fold_val: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Create one-hot encoding for house type"""
        return self._one_hot_encode(fold_train, fold_val, 'House Type')

    def _encode_city_country(self,
                           fold_train: pd.DataFrame,
                           fold_val: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Create one-hot encoding for city/county"""
        return self._one_hot_encode(fold_train, fold_val, 'City/County')

    def _encode_outcode_onehot(self,
                              fold_train: pd.DataFrame,
                              fold_val: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Create one-hot encoding for outcodes"""
        return self._one_hot_encode(fold_train, fold_val, 'Outcode')

    def _encode_outcode_postcode_location_target_hierarchical(self,
                                                            fold_train: pd.DataFrame,
                                                            fold_val: pd.DataFrame
                                                            ) -> Tuple[Dict[str, pd.Series],
                                                                     Dict[str, pd.Series],
                                                                     Dict[str, pd.Series]]:
        """
        Build the chain of target encodings for the geographic columns.

        Each level is the prior of the next, finer one:
        outcode -> postcode -> location.

        Args:
            fold_train: Training data for current fold
            fold_val: Validation data for current fold

        Returns:
            Tuple of (outcode_encoding, postcode_encoding, location_encoding),
            each a dictionary with 'train' and 'val' series
        """
        # Coarsest level first; every later level needs the previous result
        by_outcode = self._encode_outcode_target(fold_train, fold_val)
        by_postcode = self._encode_postcode_target(fold_train, fold_val, by_outcode)
        by_location = self._encode_location_target(fold_train, fold_val, by_postcode)

        return by_outcode, by_postcode, by_location

    def _encode_outcode_target(self,
                             train_data: pd.DataFrame,
                             eval_data: pd.DataFrame) -> Dict[str, pd.Series]:
        """
        Target-encode outcodes with the mean log price per outcode.

        eval_data is always encoded from means over all of train_data. How the
        training rows are encoded depends on whether train_data carries a
        'cv_fold' column:
        - present (cross-validation): out-of-fold means from an inner 5-fold split
        - absent (final test-set run): the same full-data means as eval_data

        Outcodes missing from the fitting rows get the mean log price of those rows.

        NOTE(review): in the final run each training row's own price is part of
        the mean it is encoded with - confirm this in-sample encoding is intended.

        Args:
            train_data: Rows the means are computed from
            eval_data: Rows to encode with those means

        Returns:
            Dictionary with 'train' and 'val' encoded series
        """
        def outcode_lookup(frame: pd.DataFrame) -> Tuple[pd.Series, float]:
            """Return (mean log price per outcode, overall mean log price)."""
            return frame.groupby('Outcode')['log_price'].mean(), frame['log_price'].mean()

        def apply_lookup(codes: pd.Series, lookup: Tuple[pd.Series, float]) -> pd.Series:
            """Map outcodes to their mean, using the overall mean for unseen codes."""
            per_outcode, overall = lookup
            return codes.map(per_outcode).fillna(overall)

        # Evaluation rows are encoded identically in both modes
        full_lookup = outcode_lookup(train_data)
        eval_encoded = apply_lookup(eval_data['Outcode'], full_lookup)

        if 'cv_fold' not in train_data.columns:  # We're encoding for the test set
            return {
                'train': apply_lookup(train_data['Outcode'], full_lookup),
                'val': eval_encoded
            }

        # We're in cross-validation: out-of-fold encoding for the training rows
        train_encoded = pd.Series(index=train_data.index, dtype='float64')
        splitter = KFold(n_splits=5, shuffle=True, random_state=self.random_state)
        for fit_pos, hold_pos in splitter.split(train_data):
            inner_lookup = outcode_lookup(train_data.iloc[fit_pos])
            train_encoded.iloc[hold_pos] = apply_lookup(
                train_data.iloc[hold_pos]['Outcode'], inner_lookup
            )

        return {
            'train': train_encoded,
            'val': eval_encoded
        }
    
    def _encode_postcode_target(self,
                              fold_train: pd.DataFrame,
                              fold_val: pd.DataFrame,
                              outcode_encoding: Dict[str, pd.Series]) -> Dict[str, pd.Series]:
        """
        Target-encode postcodes, shrinking each postcode mean towards its outcode prior.

        The weight on the postcode mean is count / (count + smoothing_factor),
        where count is the number of fold_train rows with that postcode.
        Postcodes absent from fold_train get count 0 and so the prior itself.

        Args:
            fold_train: Training data for current fold (source of the statistics)
            fold_val: Validation data for current fold
            outcode_encoding: 'train'/'val' outcode encodings used as the prior

        Returns:
            Dictionary with 'train' and 'val' encoded series
        """
        mean_by_postcode = fold_train.groupby('Postal Code')['log_price'].mean()
        rows_by_postcode = fold_train['Postal Code'].value_counts()

        def shrink_towards_outcode(frame: pd.DataFrame, prior: pd.Series) -> pd.Series:
            postcodes = frame['Postal Code']

            # Unseen postcodes: no observations, so the mean is the prior itself
            n_seen = postcodes.map(rows_by_postcode).fillna(0)
            observed = postcodes.map(mean_by_postcode).fillna(prior)

            trust = n_seen / (n_seen + self.smoothing_factor)
            return trust * observed + (1 - trust) * prior

        frames = {'train': fold_train, 'val': fold_val}
        return {
            split: shrink_towards_outcode(frame, outcode_encoding[split])
            for split, frame in frames.items()
        }
    
    def _encode_location_target(self,
                              fold_train: pd.DataFrame,
                              fold_val: pd.DataFrame,
                              postcode_encoding: Dict[str, pd.Series]) -> Dict[str, pd.Series]:
        """
        Target-encode locations, shrinking each location mean towards its postcode prior.

        Locations with at least min_location_freq rows in fold_train get a
        blend weighted by count / (count + smoothing_factor). Rarer, missing or
        unseen locations receive the postcode encoding unchanged.

        Args:
            fold_train: Training data for current fold (source of the statistics)
            fold_val: Validation data for current fold
            postcode_encoding: 'train'/'val' postcode encodings used as the prior

        Returns:
            Dictionary with 'train' and 'val' encoded series
        """
        mean_by_location = fold_train.groupby('Location')['log_price'].mean()
        rows_by_location = fold_train['Location'].value_counts()

        def shrink_towards_postcode(frame: pd.DataFrame, prior: pd.Series) -> pd.Series:
            places = frame['Location']

            # Missing and unseen locations count as zero observations
            n_seen = places.map(rows_by_location).fillna(0)
            observed = places.map(mean_by_location).fillna(prior)

            trust = n_seen / (n_seen + self.smoothing_factor)
            blended = trust * observed + (1 - trust) * prior

            # Keep the blend only where the location is frequent enough
            too_rare = n_seen < self.min_location_freq
            return blended.where(~too_rare, prior)

        frames = {'train': fold_train, 'val': fold_val}
        return {
            split: shrink_towards_postcode(frame, postcode_encoding[split])
            for split, frame in frames.items()
        }

    def create_fold_features(self, fold_train: pd.DataFrame, fold_val: pd.DataFrame) -> List[FeatureSet]:
        """
        Build every feature set variation for one train/validation pair.

        All encodings are computed once and then mixed according to a recipe
        table. Every recipe starts from the two numeric base columns.

        Args:
            fold_train: Training data for current fold
            fold_val: Validation data for current fold

        Returns:
            One FeatureSet per recipe, in recipe order
        """
        house = self._encode_house_type(fold_train, fold_val)
        city = self._encode_city_country(fold_train, fold_val)

        # Hierarchical target encodings for the geographic columns
        outcode_te, postcode_te, location_te = (
            self._encode_outcode_postcode_location_target_hierarchical(fold_train, fold_val)
        )

        outcode_ohe = self._encode_outcode_onehot(fold_train, fold_val)
        price_sqft = self._calculate_outcode_price_per_sqft(fold_train, fold_val)

        numeric_columns = ['Area in sq ft', 'No. of Bedrooms']
        geo_levels = {'outcode': outcode_te, 'postcode': postcode_te, 'location': location_te}

        def geo(*levels: str) -> Dict[str, Dict[str, pd.Series]]:
            """Pick target-encoded geographic levels, keeping the given order."""
            return {level: geo_levels[level] for level in levels}

        # (name, description, optional feature groups); groups left out are not added
        recipes = [
            # Base features
            ('area_bedrooms',
             'Area in sq ft, No. of Bedrooms',
             {}),
            # Single feature additions
            ('area_bedrooms_house',
             'Area in sq ft, No. of Bedrooms, House Type',
             {'house': house}),
            ('area_bedrooms_city',
             'Area in sq ft, No. of Bedrooms, City/County',
             {'city': city}),
            # Individual geographic features - Target encoded
            ('area_bedrooms_outcode_target',
             'Area in sq ft, No. of Bedrooms, Outcode (Target)',
             {'geo_target': geo('outcode')}),
            ('area_bedrooms_postcode_target',
             'Area in sq ft, No. of Bedrooms, Postcode (Target)',
             {'geo_target': geo('postcode')}),
            ('area_bedrooms_location_target',
             'Area in sq ft, No. of Bedrooms, Location (Target)',
             {'geo_target': geo('location')}),
            # One-hot encoded outcode
            ('area_bedrooms_outcode_onehot',
             'Area in sq ft, No. of Bedrooms, Outcode (One-hot)',
             {'geo_onehot': {'outcode': outcode_ohe}}),
            # Price per square foot
            ('area_bedrooms_pricesqft',
             'Area in sq ft, No. of Bedrooms, Price/sqft',
             {'price_sqft': price_sqft}),
            # Two feature combinations
            ('area_bedrooms_house_city',
             'Area in sq ft, No. of Bedrooms, House Type, City/County',
             {'house': house, 'city': city}),
            ('area_bedrooms_outcode_postcode_target',
             'Area in sq ft, No. of Bedrooms, Outcode & Postcode (Target)',
             {'geo_target': geo('outcode', 'postcode')}),
            ('area_bedrooms_postcode_location_target',
             'Area in sq ft, No. of Bedrooms, Postcode & Location (Target)',
             {'geo_target': geo('postcode', 'location')}),
            # Three feature combinations
            ('area_bedrooms_house_city_outcode_target',
             'Area in sq ft, No. of Bedrooms, House Type, City/County, Outcode (Target)',
             {'house': house, 'city': city, 'geo_target': geo('outcode')}),
            # All geographic features
            ('area_bedrooms_all_geo_target',
             'Area in sq ft, No. of Bedrooms, All Geographic Features (Target)',
             {'geo_target': geo('outcode', 'postcode', 'location')}),
            # Complex combinations
            ('area_bedrooms_house_outcode_target_pricesqft',
             'Area in sq ft, No. of Bedrooms, House Type, Outcode (Target), Price/sqft',
             {'house': house, 'geo_target': geo('outcode'), 'price_sqft': price_sqft}),
            # All features
            ('all_features',
             'All Features Combined',
             {'house': house, 'city': city,
              'geo_target': geo('outcode', 'postcode', 'location'),
              'price_sqft': price_sqft}),
        ]

        feature_sets = []
        for name, description, parts in recipes:
            feature_sets.append(self._combine_features(
                fold_train,
                fold_val,
                list(numeric_columns),
                parts.get('house'),
                parts.get('city'),
                parts.get('geo_target'),
                parts.get('geo_onehot'),
                parts.get('price_sqft'),
                name,
                description
            ))
        return feature_sets
    
    def _combine_features(self,
                         fold_train: pd.DataFrame,
                         fold_val: pd.DataFrame,
                         base_numeric: List[str],
                         house_features: Optional[Dict[str, pd.DataFrame]],
                         city_country_features: Optional[Dict[str, pd.DataFrame]],
                         geo_target_features: Optional[Dict[str, Dict[str, pd.Series]]],
                         geo_onehot_features: Optional[Dict[str, Dict[str, pd.DataFrame]]],
                         price_sqft_features: Optional[Dict[str, pd.Series]],
                         name: str,
                         description: str) -> FeatureSet:
        """
        Assemble one FeatureSet from the base columns and the optional feature groups.

        Columns are appended in a fixed order: base numeric, house type,
        city/county, target-encoded geography, one-hot geography, price per
        square foot. Groups passed as None (or empty) are skipped. The target
        is always the 'log_price' column of the source frames.

        Args:
            fold_train: Training data for current fold
            fold_val: Validation data for current fold
            base_numeric: Columns copied as-is from the source frames
            house_features: 'train'/'val' one-hot frames for house type
            city_country_features: 'train'/'val' one-hot frames for city/county
            geo_target_features: Column name -> 'train'/'val' target-encoded series
            geo_onehot_features: Feature name -> 'train'/'val' one-hot frames
            price_sqft_features: 'train'/'val' outcode price-per-sqft series
            name: Identifier stored on the FeatureSet
            description: Human-readable summary stored on the FeatureSet

        Returns:
            FeatureSet holding the combined train and validation matrices
        """
        def assemble(source: pd.DataFrame, split: str) -> pd.DataFrame:
            """Build the feature matrix for one split ('train' or 'val')."""
            matrix = source[base_numeric].copy()

            # One-hot blocks for house type, then city/county
            for block in (house_features, city_country_features):
                if block:
                    matrix = pd.concat([matrix, block[split]], axis=1)

            # Target-encoded geography: one column per level, aligned on the index
            if geo_target_features:
                for column, encoded in geo_target_features.items():
                    matrix[column] = encoded[split]

            # One-hot geography
            if geo_onehot_features:
                for encoded in geo_onehot_features.values():
                    matrix = pd.concat([matrix, encoded[split]], axis=1)

            if price_sqft_features:
                matrix['outcode_price_per_sqft'] = price_sqft_features[split]

            return matrix

        return FeatureSet(
            X_train=assemble(fold_train, 'train'),
            X_val=assemble(fold_val, 'val'),
            y_train=fold_train['log_price'],
            y_val=fold_val['log_price'],
            name=name,
            description=description
        )

In [None]:
class CrossValidator:
    """
    Handles cross-validation and model evaluation

    Every feature set produced by FeatureEncoder is paired with every model in
    self.models, scored on each cross-validation fold and finally on the
    held-out test set.
    """

    def __init__(self, n_folds: int = 5, random_state: int = RANDOM_STATE):
        """
        Args:
            n_folds: Number of cross-validation folds
            random_state: Seed for the fold split and for every model
        """
        self.n_folds = n_folds
        self.random_state = random_state
        self.models = {
            'decision_tree': DecisionTreeRegressor(random_state=random_state),
            'random_forest': RandomForestRegressor(
                n_estimators=100,
                random_state=random_state
            ),
            'xgboost': XGBRegressor(
                n_estimators=100,
                random_state=random_state
            )
        }

    def evaluate_all_combinations(self, train_data: pd.DataFrame, test_data: pd.DataFrame) -> pd.DataFrame:
        """
        Evaluate all feature set and model combinations

        Args:
            train_data: Training split; cross-validated, then used in full to
                fit the models scored on test_data
            test_data: Held-out split, only used for the final evaluation

        Returns:
            DataFrame with one row per (fold, feature set, model). Rows with
            split_type 'cv_fold' carry the fold number; rows with split_type
            'test' carry fold 'final'.
        """
        results = []
        encoder = FeatureEncoder()

        # K-fold cross-validation
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)

        for fold_idx, (fold_train_idx, fold_val_idx) in enumerate(kf.split(train_data)):
            # Get fold data
            fold_train = train_data.iloc[fold_train_idx].copy()
            fold_val = train_data.iloc[fold_val_idx].copy()

            # Mark as CV fold: the encoder checks for this column to decide
            # whether to use out-of-fold target encoding
            fold_train['cv_fold'] = fold_idx
            fold_val['cv_fold'] = fold_idx

            feature_sets = encoder.create_fold_features(fold_train, fold_val)
            results.extend(self._evaluate_feature_sets(feature_sets, fold_idx, 'cv_fold'))

        # Final evaluation on test set. The marker is only ever added to the
        # per-fold copies, so this drop matters only if the caller's frame
        # already carried a 'cv_fold' column.
        train_data = train_data.drop('cv_fold', axis=1, errors='ignore')
        final_feature_sets = encoder.create_fold_features(train_data, test_data)
        results.extend(self._evaluate_feature_sets(final_feature_sets, 'final', 'test'))

        return pd.DataFrame(results)

    def _evaluate_feature_sets(self,
                               feature_sets: "List[FeatureSet]",
                               fold_label,
                               split_type: str) -> List[dict]:
        """
        Fit and score every model on every feature set

        Args:
            feature_sets: Feature sets sharing one train/validation pair
            fold_label: Value stored in the 'fold' column (fold number or 'final')
            split_type: Value stored in the 'split_type' column

        Returns:
            One result row per (feature set, model) pair
        """
        rows = []
        for feature_set in feature_sets:
            for model_name, model in self.models.items():
                # The same estimator objects are refitted for every combination
                # NOTE(review): relies on fit() discarding any earlier fit - confirm
                model.fit(feature_set.X_train, feature_set.y_train)
                predictions = model.predict(feature_set.X_val)

                rows.append({
                    'fold': fold_label,
                    'feature_set': feature_set.name,
                    'description': feature_set.description,
                    'model': model_name,
                    'split_type': split_type,
                    **self._compute_metrics(feature_set.y_val, predictions),
                    'n_features': feature_set.X_train.shape[1]
                })
        return rows

    def _compute_metrics(self, y_true: pd.Series, y_pred: np.ndarray) -> Dict[str, float]:
        """
        Score predictions in log space and in price space

        Args:
            y_true: Actual log prices
            y_pred: Predicted log prices

        Returns:
            Dictionary with 'rmse' and 'r2' on the log prices, and 'mae' and
            'pct_mae' (mean absolute percentage error) on the exponentiated prices
        """
        actual_price = np.exp(y_true)
        predicted_price = np.exp(y_pred)

        return {
            'rmse': self._calculate_rmse(y_true, y_pred),
            'r2': r2_score(y_true, y_pred),
            'mae': mean_absolute_error(actual_price, predicted_price),
            'pct_mae': np.mean(np.abs(
                (actual_price - predicted_price) / actual_price
            )) * 100
        }

    def _calculate_rmse(self, y_true: pd.Series, y_pred: np.ndarray) -> float:
        """Calculate Root Mean Squared Error"""
        return np.sqrt(np.mean((y_true - y_pred) ** 2))

print("CrossValidator class loaded successfully!")

## Running the Model Comparison Pipeline

Now let's run our comparison pipeline to evaluate different model and feature combinations:

In [6]:
def run_model_comparison_pipeline(df_with_outcode: pd.DataFrame) -> pd.DataFrame:
    """
    Run the complete pipeline: pre-split features, train/test split, evaluation.

    Args:
        df_with_outcode: Input data handed to PreProcessor

    Returns:
        DataFrame of results from CrossValidator.evaluate_all_combinations
    """
    preprocessor = PreProcessor()

    # Pre-split features first, because the split is stratified on one of them
    train_data, test_data = preprocessor.create_train_test_split(
        preprocessor.prepare_pre_split_features(df_with_outcode)
    )

    # Cross-validate on the training split, then score on the test split
    return CrossValidator().evaluate_all_combinations(train_data, test_data)

# Run pipeline
results = run_model_comparison_pipeline(df_with_outcode)

In [None]:
def display_results(results: pd.DataFrame) -> None:
    """
    Print a performance table: one CV summary row and one test row per combination.

    CV cells show the mean and standard deviation across the 'cv_fold' rows;
    the test row shows the values of the first 'test' row of the group.

    Args:
        results: DataFrame containing model evaluation results with columns:
                feature_set, model, split_type, r2, rmse, mae, pct_mae, description
    """
    label_width = 100
    cell_widths = (15, 15, 20)
    rule = "-" * 170

    def render(label: str, r2: str, rmse: str, mae: str, pct: str) -> str:
        """Lay out one table line; the last cell is appended without padding."""
        padded = [text.ljust(width) for text, width in zip((r2, rmse, mae), cell_widths)]
        return label.ljust(label_width) + " " + "".join(padded) + pct

    def spread(frame: pd.DataFrame, column: str, spec: str) -> str:
        """Format a column as 'mean ±std' using the given format spec."""
        values = frame[column]
        return f"{values.mean():{spec}} ±{values.std():{spec}}"

    print("\nModel Performance Summary:")
    print(rule)
    print(render("Features - Model", "CV R²", "CV RMSE", "CV MAE (£)", "CV %Error".ljust(20)))
    print(rule)

    for (_, model), rows in results.groupby(['feature_set', 'model']):
        cv_rows = rows.loc[rows['split_type'] == 'cv_fold']
        test_row = rows.loc[rows['split_type'] == 'test'].iloc[0]

        # Cross-validation summary, labelled with the feature description
        print(render(
            f"{test_row['description']} - {model}",
            spread(cv_rows, 'r2', '.3f'),
            spread(cv_rows, 'rmse', '.3f'),
            "£" + spread(cv_rows, 'mae', ',.0f'),
            spread(cv_rows, 'pct_mae', '.1f') + "%"
        ))

        # Test results on the following line
        print(render(
            "→ Test Results",
            f"{test_row['r2']:.3f}",
            f"{test_row['rmse']:.3f}",
            f"£{test_row['mae']:,.0f}",
            f"{test_row['pct_mae']:.1f}%"
        ))

# Usage: print the summary table for the `results` returned by the pipeline run
display_results(results)