In [0]:
# Check for each required package and install it only if its import fails.
# NOTE(review): `%pip` and `dbutils` are supplied by the notebook runtime (this
# looks like a Databricks notebook — confirm); the cell is not plain Python.
# NOTE(review): none of the installs pins a version, so a fresh environment may
# resolve to different releases over time.
restart_required = False  # set to True as soon as any package has to be installed

try:
    import databricks.feature_engineering
except ImportError:
    %pip install databricks-feature-engineering
    restart_required = True

try:
    import xgboost
except ImportError:
    %pip install xgboost
    restart_required = True

try:
    import lightgbm
except ImportError:
    %pip install lightgbm
    restart_required = True

try:
    import catboost
except ImportError:
    %pip install catboost
    restart_required = True

# Restart Python only if at least one package was installed above, so the
# freshly installed packages become importable in the following cells.
if restart_required:
    dbutils.library.restartPython()


In [0]:
# Imports
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import mlflow
import mlflow.sklearn
from mlflow.exceptions import MlflowException
import pandas as pd
import warnings


# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Feature Engineering client: builds the training set from the feature table.
fe = FeatureEngineeringClient()

# Make realestate.ml the active catalog/schema for this Spark session.
spark.sql("USE CATALOG realestate")
spark.sql("USE SCHEMA ml")

# Column roles used throughout this cell.
key_column = "House_ID"   # join key into the feature table
label_column = "Price"    # regression target

# Labels come from a CSV in the workspace; only the key and the target are kept.
# NOTE(review): absolute, user-specific workspace path — consider making it configurable.
base_file_path = "file:/Workspace/Users/yasodhashree91@gmail.com/oms-databricks/04_AI_ML/model_evaluation/regression/data_files/"
housing_csv_path = base_file_path + "housing.csv"
labels_df = (
    spark.read.csv(housing_csv_path, header=True, inferSchema=True)
    .select(key_column, label_column)
    .dropna(subset=[label_column])  # rows without a price cannot serve as labels
)

# Single lookup against the housing feature table, keyed by House_ID
# (no explicit feature_names are given).
housing_feature_lookup = FeatureLookup(
    table_name="realestate.ml.housing_features",
    lookup_key=key_column,
)
feature_lookups = [housing_feature_lookup]

# Join the looked-up features onto the labels; the key column is excluded
# from the resulting training columns.
training_set = fe.create_training_set(
    df=labels_df,
    feature_lookups=feature_lookups,
    label=label_column,
    exclude_columns=[key_column],
)

# Materialise the joined training set as a pandas DataFrame.
data = training_set.load_df().toPandas()

# Separate the target (y) from the feature matrix (X).
y = data[label_column]
X = data.drop(columns=[label_column])


In [0]:


# Shared seed so the split and every seeded model are repeatable.
RANDOM_STATE = 42

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Candidate regressors keyed by display name. Insertion order is the order in
# which the variants are iterated later (the name is also used as the run name).
model_variants = {}
model_variants["Decision Tree"] = DecisionTreeRegressor(random_state=RANDOM_STATE)
model_variants["Random Forest"] = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=RANDOM_STATE,
)
model_variants["KNN"] = KNeighborsRegressor(n_neighbors=5)
model_variants["XGBoost"] = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=RANDOM_STATE,
)
model_variants["LightGBM"] = lgb.LGBMRegressor(
    max_depth=5,
    min_gain_to_split=0,
    verbosity=-1,
    random_state=RANDOM_STATE,
)
model_variants["CatBoost"] = CatBoostRegressor(
    iterations=100,
    depth=5,
    learning_rate=0.1,
    random_seed=RANDOM_STATE,
    verbose=False,
    allow_writing_files=False,
)

# Train, evaluate and log every candidate regressor, one MLflow run per variant.
for name, model in model_variants.items():
    with mlflow.start_run(run_name=name):
        # Fit on the training split.
        model.fit(X_train, y_train)

        # Predict on the held-out test split.
        preds = model.predict(X_test)

        # Evaluation metrics on the test split.
        mae = mean_absolute_error(y_test, preds)
        mse = mean_squared_error(y_test, preds)
        # RMSE is derived from the MSE instead of calling
        # mean_squared_error(..., squared=False): that keyword was deprecated in
        # scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
        rmse = mse ** 0.5
        r2 = r2_score(y_test, preds)

        # Record the metrics on the run so the variants can be compared in the
        # MLflow UI rather than only in the printed cell output.
        mlflow.log_metrics(
            {
                "test_mae": mae,
                "test_mse": mse,
                "test_rmse": rmse,
                "test_r2": r2,
            }
        )

        # Log the model together with its feature-store lineage.
        # NOTE(review): every variant is registered under the same name, so each
        # iteration presumably adds a new version of "housing_price_model" —
        # confirm this is intended rather than registering only the best model.
        fe.log_model(
            model=model,
            artifact_path=name.replace(" ", "_").lower(),
            flavor=mlflow.sklearn,
            training_set=training_set,
            registered_model_name="housing_price_model"
        )

    # Human-readable summary for this variant (printed after the run is closed).
    print(f"Model Variant: {name}")
    print(f"Mean Absolute Error (MAE): {mae:.16f}")
    print(f"Mean Squared Error (MSE): {mse:.16f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.16f}")
    print(f"R-squared (R²): {r2:.16f}")

    print("=" * 100)


| Metric | Full Form               | Better When | What It Means                                                                                  |
|--------|-------------------------|-------------|-----------------------------------------------------------------------------------------------|
| MAE    | Mean Absolute Error     | Lower       | The average size of the errors, showing how far predictions are from actual values on average |
| MSE    | Mean Squared Error      | Lower       | Like MAE, but bigger errors are penalized more because it squares the differences before averaging     |
| RMSE   | Root Mean Squared Error | Lower       | The square root of MSE, so errors are back in the original units, making them easier to understand |
| R²     | R-squared               | Higher      | Shows how closely the model's predictions follow the real data patterns; 1 is perfect, 0 means the model does no better than always predicting the mean (no relation). |

