In [1]:
# Core
import time
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Models
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet,
    BayesianRidge, HuberRegressor, RANSACRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    BaggingRegressor, AdaBoostRegressor, VotingRegressor
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# MLflow (LOCAL logging — no server)
import mlflow
import mlflow.sklearn

# Lift the column cap so wide frames are displayed with every column.
pd.options.display.max_columns = None


  import pkg_resources  # noqa: TID251
* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
TARGET = "SalePrice"

# Load the cleaned training table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/clean/train_clean.csv")

# Separate the prediction target from the feature columns.
y = df[TARGET]
X = df.drop(columns=[TARGET])


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [4]:
# Partition the features by dtype: object columns are categorical, everything else numeric.
cat_cols = list(X.select_dtypes(include="object").columns)
num_cols = list(X.select_dtypes(exclude="object").columns)

# Ordinal columns (QUALITY-based).
# They are object-typed, so they also appear in `cat_cols`; the ColumnTransformer
# construction filters them out of the one-hot branch.
ordinal_cols = ["ExterQual", "KitchenQual", "HeatingQC", "BsmtQual", "GarageQual"]


In [5]:
# One category list per ordinal column. All columns share the same scale, and the list
# order is the order the encoder numbers them in; "None" matches the imputer fill value.
ord_categories = [["None", "Po", "Fa", "TA", "Gd", "Ex"] for _ in ordinal_cols]

# Numeric features: median-impute missing values, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Ordinal features: missing -> "None", then map each grade to a float code.
ord_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("encoder", OrdinalEncoder(categories=ord_categories, dtype=float)),
])

# Nominal features: missing -> "None", then one-hot encode (unseen categories are ignored).
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Object columns that are not handled by the ordinal branch.
nominal_cols = [col for col in cat_cols if col not in ordinal_cols]

preprocessor = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", num_pipeline, num_cols),
        ("ord", ord_pipeline, ordinal_cols),
        ("cat", cat_pipeline, nominal_cols),
    ],
)


In [6]:
# Base learners averaged by the voting ensemble.
voting_members = [
    ("lr", LinearRegression()),
    ("rf", RandomForestRegressor(n_estimators=50, random_state=42)),
]

# Candidate regressors keyed by the name used for MLflow runs and the results table.
# Insertion order is the training order.
models = dict(
    # Linear family
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=0.01),
    ElasticNet=ElasticNet(alpha=0.01, l1_ratio=0.5),
    BayesianRidge=BayesianRidge(),
    HuberRegressor=HuberRegressor(),
    # Tree-based
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    # Neighbours, kernel and neural models
    KNN=KNeighborsRegressor(),
    SVR=SVR(),
    MLP=MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=500, random_state=42),
    # Meta-ensembles
    Bagging=BaggingRegressor(n_estimators=50, random_state=42),
    AdaBoost=AdaBoostRegressor(n_estimators=50, random_state=42),
    Voting=VotingRegressor(estimators=voting_members),
)


In [7]:
# Select the experiment that every run started below is recorded under.
EXPERIMENT_NAME = "house_price_experiments"
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)


Traceback (most recent call last):
  File "d:\conda_envs\ml_house_price_project\lib\site-packages\mlflow\store\tracking\file_store.py", line 302, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "d:\conda_envs\ml_house_price_project\lib\site-packages\mlflow\store\tracking\file_store.py", line 395, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "d:\conda_envs\ml_house_price_project\lib\site-packages\mlflow\store\tracking\file_store.py", line 1303, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "d:\conda_envs\ml_house_price_project\lib\site-packages\mlflow\store\tracking\file_store.py", line 1296, in _read_helper
    result = read_yaml(root, file_name)
  File "d:\conda_envs\ml_house_price_project\lib\site-packages\mlflow\utils\file_utils.py", line 303, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist.")
mlflow.exceptions.

<Experiment: artifact_location='file:///d:/Ml_Projects/ml_house_price_project/notebooks/mlruns/124443566973195403', creation_time=1767581163221, experiment_id='124443566973195403', last_update_time=1767581163221, lifecycle_stage='active', name='house_price_experiments', tags={}>

In [None]:

# Train every candidate model on the training split, score it on the test split,
# and record the outcome both in MLflow (one run per model) and in `results`.
results = []

for name, model in models.items():
    print(f"\nTraining {name}...")

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    try:
        # The run context lives INSIDE the try block on purpose: an exception then
        # propagates through `mlflow.start_run`, which closes the run as FAILED.
        # Catching it inside the `with` would leave a failed model recorded as a
        # successfully FINISHED run with no metrics.
        with mlflow.start_run(run_name=name):
            # Logged before fitting so that failed runs are still identifiable.
            mlflow.log_param("model_name", name)
            mlflow.log_param("n_train", len(X_train))

            # Time only fit + predict (monotonic clock), not MLflow bookkeeping.
            start_time = time.perf_counter()
            pipe.fit(X_train, y_train)
            preds = pipe.predict(X_test)
            elapsed = time.perf_counter() - start_time

            # Metrics on the held-out test split
            rmse = np.sqrt(mean_squared_error(y_test, preds))
            mae = mean_absolute_error(y_test, preds)
            r2 = r2_score(y_test, preds)

            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)
            mlflow.log_metric("time_sec", elapsed)

            # Persist the fitted pipeline (preprocessing + model) with the run
            mlflow.sklearn.log_model(pipe, artifact_path="model")

            # Only fully successful models reach the comparison table
            results.append({
                "Model": name,
                "RMSE": rmse,
                "MAE": mae,
                "R2": r2
            })

            print(f"{name} → RMSE={rmse:.2f}, MAE={mae:.2f}, R2={r2:.3f}")

    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"{name} failed: {e}")



Training LinearRegression...


In [None]:
# Rank the models from best to worst RMSE.
# The columns are named explicitly so that an empty `results` list (every model
# failed) still produces a frame with an "RMSE" column instead of a KeyError.
results_df = (
    pd.DataFrame(results, columns=["Model", "RMSE", "MAE", "R2"])
    .sort_values(by="RMSE")
)
results_df



: 

In [None]:
# Horizontal bar chart of test RMSE per model, in the row order of `results_df`.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=results_df, x="RMSE", y="Model", ax=ax)
ax.set_title("RMSE Comparison (Lower is Better)")
plt.show()


In [None]:
# Save results for reference
report_path = Path("../reports/experiment_results.csv")
# `to_csv` does not create missing directories, so make sure the target folder exists.
report_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(report_path, index=False)


### Model Selection Decision

- Best performing model: **GradientBoostingRegressor**
- Test RMSE ≈ **25k** (in `SalePrice` units, measured on the 20% hold-out split)
- Good balance between bias and variance
- More stable performance than the other ensemble models and the neural network

➡️ **GradientBoostingRegressor will be used in `train.py`**
