# House Price Prediction - AutoML with MLflow

This notebook trains 3 different AutoML models for house price prediction:
1. **XGBoost** with GridSearchCV
2. **LightGBM** with RandomizedSearchCV  
3. **Random Forest** with GridSearchCV

All models are tracked using MLflow and saved for deployment in a Streamlit app.

In [None]:
# Setup and Data Preparation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Install required packages
!pip install mlflow xgboost lightgbm

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import randint, uniform

print("All packages installed successfully!")

All packages installed successfully!


In [2]:
# Load and prepare data
df = pd.read_csv('House Price Prediction Dataset.csv')
print(f"Dataset loaded: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Data preprocessing
def prepare_data(df):
    df_clean = df.copy()
    
    # Remove outliers using IQR method
    Q1 = df_clean['Price'].quantile(0.25)
    Q3 = df_clean['Price'].quantile(0.75)
    IQR = Q3 - Q1
    df_clean = df_clean[(df_clean['Price'] >= Q1 - 1.5 * IQR) & 
                       (df_clean['Price'] <= Q3 + 1.5 * IQR)]
    
    # Feature engineering
    current_year = 2023
    df_clean['Age'] = current_year - df_clean['YearBuilt']
    df_clean['PricePerSqFt'] = df_clean['Price'] / df_clean['Area']
    df_clean['TotalRooms'] = df_clean['Bedrooms'] + df_clean['Bathrooms']
    
    # Encode categorical variables
    categorical_cols = ['Location', 'Condition', 'Garage']
    df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)
    
    return df_encoded

# Prepare data
df_processed = prepare_data(df)
print(f"Processed data shape: {df_processed.shape}")

# Features and target
X = df_processed.drop(['Id', 'Price', 'YearBuilt'], axis=1)
y = df_processed['Price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Save feature names for later use
feature_names = X.columns.tolist()
print(f"Features: {len(feature_names)}")

Dataset loaded: (2000, 10)
Columns: ['Id', 'Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Location', 'Condition', 'Garage', 'Price']
Processed data shape: (2000, 17)
Training set: (1600, 14), Test set: (400, 14)
Features: 14


In [3]:
# MLflow Setup
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("house-price-automl")

print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"MLflow experiment: house-price-automl")

# Create models directory
os.makedirs("models", exist_ok=True)

# Helper function for model evaluation and logging
def evaluate_and_log_model(model, model_name, X_train, y_train, X_test, y_test, params=None):
    with mlflow.start_run(run_name=model_name):
        # Train model
        model.fit(X_train, y_train)
        
        # Predictions
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        # Log parameters
        if params:
            mlflow.log_params(params)
        
        # Log metrics
        mlflow.log_metrics({
            "mse": mse,
            "rmse": rmse,
            "r2": r2,
            "mae": mae
        })
        
        # Log model
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(model, "model", signature=signature)
        
        # Print results
        print(f"\n{model_name} Results:")
        print(f"MSE: {mse:.2f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"R²: {r2:.4f}")
        print(f"MAE: {mae:.2f}")
        
        return {
            "model": model,
            "mse": mse,
            "rmse": rmse,
            "r2": r2,
            "mae": mae,
            "run_id": mlflow.active_run().info.run_id
        }

print("MLflow setup complete!")

MLflow tracking URI: file:./mlruns
MLflow experiment: house-price-automl
MLflow setup complete!


In [4]:
# Model 1: XGBoost with GridSearchCV
print("Training Model 1: XGBoost with GridSearchCV")

xgb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1)
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Train and evaluate
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

xgb_results = evaluate_and_log_model(
    best_xgb, "XGBoost_AutoML", X_train, y_train, X_test, y_test, 
    params=xgb_grid.best_params_
)

# Save model
joblib.dump(best_xgb, "models/xgboost_model.pkl")
print("XGBoost model saved!")

Training Model 1: XGBoost with GridSearchCV





XGBoost_AutoML Results:
MSE: 231013834.25
RMSE: 15199.14
R²: 0.9970
MAE: 10807.18
XGBoost model saved!


In [5]:
# Model 2: LightGBM with RandomizedSearchCV
print("Training Model 2: LightGBM with RandomizedSearchCV")

lgb_params = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.19),
    'max_depth': randint(3, 10),
    'num_leaves': randint(20, 100),
    'subsample': uniform(0.7, 0.3)
}

lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
lgb_search = RandomizedSearchCV(lgb_model, lgb_params, n_iter=20, cv=3, 
                               scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)

# Train and evaluate
lgb_search.fit(X_train, y_train)
best_lgb = lgb_search.best_estimator_

lgb_results = evaluate_and_log_model(
    best_lgb, "LightGBM_AutoML", X_train, y_train, X_test, y_test,
    params=lgb_search.best_params_
)

# Save model
joblib.dump(best_lgb, "models/lightgbm_model.pkl")
print("LightGBM model saved!")

Training Model 2: LightGBM with RandomizedSearchCV





LightGBM_AutoML Results:
MSE: 230947132.09
RMSE: 15196.94
R²: 0.9970
MAE: 11228.60
LightGBM model saved!


In [6]:
# Model 3: Random Forest with GridSearchCV
print("Training Model 3: Random Forest with GridSearchCV")

rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_grid = GridSearchCV(rf_model, rf_params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Train and evaluate
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

rf_results = evaluate_and_log_model(
    best_rf, "RandomForest_AutoML", X_train, y_train, X_test, y_test,
    params=rf_grid.best_params_
)

# Save model
joblib.dump(best_rf, "models/randomforest_model.pkl")
print("Random Forest model saved!")

Training Model 3: Random Forest with GridSearchCV





RandomForest_AutoML Results:
MSE: 292540655.06
RMSE: 17103.82
R²: 0.9962
MAE: 11500.97
Random Forest model saved!


In [None]:
# Model Comparison and Summary
models_results = {
    "XGBoost": xgb_results,
    "LightGBM": lgb_results,
    "RandomForest": rf_results
}

# Create comparison DataFrame
comparison_data = []
for name, results in models_results.items():
    comparison_data.append({
        'Model': name,
        'MSE': results['mse'],
        'RMSE': results['rmse'],
        'R²': results['r2'],
        'MAE': results['mae']
    })

comparison_df = pd.DataFrame(comparison_data)
print("\nModel Comparison:")
print(comparison_df.sort_values('RMSE'))

# Save comparison results
comparison_df.to_csv("models/model_comparison.csv", index=False)

# Save feature names
with open("models/feature_names.txt", "w") as f:
    for feature in feature_names:
        f.write(f"{feature}\n")

# Save data info for the app
data_info = {
    'feature_names': feature_names,
    'target_name': 'Price',
    'feature_ranges': {col: {'min': float(X[col].min()), 'max': float(X[col].max())} 
                      for col in feature_names}
}

import json
with open("models/data_info.json", "w") as f:
    json.dump(data_info, f)

print("\nAll models trained and saved successfully!")
print("Files saved:")
print("- models/xgboost_model.pkl")
print("- models/lightgbm_model.pkl") 
print("- models/randomforest_model.pkl")
print("- models/model_comparison.csv")
print("- models/feature_names.txt")
print("- models/data_info.json")


Model Comparison:
          Model           MSE          RMSE        R²           MAE
1      LightGBM  2.309471e+08  15196.944827  0.997031  11228.603188
0       XGBoost  2.310138e+08  15199.139260  0.997031  10807.176777
2  RandomForest  2.925407e+08  17103.819897  0.996240  11500.974892

All models trained and saved successfully!
Files saved:
- models/xgboost_model.pkl
- models/lightgbm_model.pkl
- models/randomforest_model.pkl
- models/model_comparison.csv
- models/feature_names.txt
- models/data_info.json


## Summary

Successfully trained 3 AutoML models:

1. **XGBoost** - Gradient boosting with grid search optimization
2. **LightGBM** - Fast gradient boosting with randomized search  
3. **Random Forest** - Ensemble method with grid search

All models are:
- ✅ Trained with hyperparameter tuning
- ✅ Tracked in MLflow
- ✅ Saved as pickle files
- ✅ Ready for deployment

Next step: Run the Streamlit app (`app.py`) for interactive predictions and model comparison!