In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time

In [3]:
# Load and preprocess the data
def load_and_preprocess_data(test_size=0.2, random_state=42):
    """
    Load the California Housing dataset and return a scaled train/test split.

    The dataset is fetched, wrapped in a DataFrame with an extra 'target'
    column, and its per-column missing-value counts are printed. Features and
    target are then split into train and test parts, and the features are
    standardized.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The two feature
        items are the scaler's outputs; the two target items are the 'target'
        column as split by ``train_test_split``.

    Calling the function with no arguments keeps the original 80/20 split
    seeded with 42.
    """
    housing = fetch_california_housing()

    # Assemble features and target in one frame so the null check covers both.
    df = pd.DataFrame(housing.data, columns=housing.feature_names)
    df['target'] = housing.target

    print("Missing values:\n", df.isnull().sum())

    X = df.drop('target', axis=1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only, then reuse its statistics for
    # the test split, so nothing from the test data influences preprocessing.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

In [5]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Fit a model, time the fit, and score its predictions on the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    X_train, y_train
        Training features and targets passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true targets
        the predictions are scored against.
    model_name : str
        Label stored under the 'Model' key of the result.

    Returns
    -------
    dict
        Keys 'Model', 'MSE', 'MAE', 'R2' and 'Training Time'. The training
        time is the elapsed seconds spent inside ``model.fit`` only.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is a wall clock that can be adjusted (and even
    # jump backwards) while the fit is running.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_started

    y_pred = model.predict(X_test)

    return {
        'Model': model_name,
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
        'Training Time': training_time,
    }

In [7]:
def main():
    """
    Benchmark a fixed set of regressors on the preprocessed housing data.

    Loads the train/test split, fits and scores each candidate in turn,
    prints the combined results table, and then reports the models with the
    highest and lowest R2.
    """
    print("Loading and preprocessing data...")
    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    # (display name, estimator) pairs, evaluated in the order listed.
    candidates = [
        ('Linear Regression', LinearRegression()),
        ('Decision Tree', DecisionTreeRegressor(random_state=42)),
        ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
        ('SVR', SVR(kernel='rbf')),
    ]

    # Collect one metrics dict per candidate.
    results = []
    for label, estimator in candidates:
        print(f"\nTraining {label}...")
        results.append(
            train_and_evaluate_model(estimator, X_train, X_test, y_train, y_test, label)
        )

    results_df = pd.DataFrame(results)
    print("\nModel Comparison Results:")
    print(results_df)

    # Rank by R2: the row with the largest value is best, the smallest worst.
    r2_column = results_df['R2']
    top_row = results_df.loc[r2_column.idxmax()]
    bottom_row = results_df.loc[r2_column.idxmin()]

    print(f"\nBest performing model: {top_row['Model']}")
    print(f"R² Score: {top_row['R2']:.4f}")
    print(f"\nWorst performing model: {bottom_row['Model']}")
    print(f"R² Score: {bottom_row['R2']:.4f}")

In [9]:
# Entry-point guard: run the benchmark only when this code executes as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()

Loading and preprocessing data...
Missing values:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

Training Linear Regression...

Training Decision Tree...

Training Random Forest...

Training Gradient Boosting...

Training SVR...

Model Comparison Results:
               Model       MSE       MAE        R2  Training Time
0  Linear Regression  0.555892  0.533200  0.575788       0.131096
1      Decision Tree  0.493969  0.453904  0.623042       0.241213
2      Random Forest  0.255170  0.327425  0.805275      15.361919
3  Gradient Boosting  0.293999  0.371650  0.775643       4.591528
4                SVR  0.357004  0.398599  0.727563      11.342581

Best performing model: Random Forest
R² Score: 0.8053

Worst performing model: Linear Regression
R² Score: 0.5758
