In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
import category_encoders as ce
import seaborn as sns

In [2]:
import dagshub
dagshub.init(repo_owner='sourav664', repo_name='real-estate-price-prediction', mlflow=True)


In [3]:
import mlflow
mlflow.set_tracking_uri("https://dagshub.com/sourav664/real-estate-price-prediction.mlflow")

In [4]:
# mlflow experiment

mlflow.set_experiment("Exp 4 - Stacking Regressor_HP_Tuning")

<Experiment: artifact_location='mlflow-artifacts:/760b0107b85b458fac789bd242031649', creation_time=1744694464581, experiment_id='4', last_update_time=1744694464581, lifecycle_stage='active', name='Exp 4 - Stacking Regressor_HP_Tuning', tags={}>

In [5]:

from sklearn import set_config

set_config(transform_output="pandas")

In [None]:
df = pd.read_csv("../data/raw/real_estatesv9.csv")

In [None]:
df = df[['price','carpet_area','bedroom','bathroom','transaction_type','floor_category','luxury_category','property_type','regions','balconies_iter','furnished_status_imputed','additional_room']]

transaction_type
resale          14439
new property     3341
other              19
rent                7
Name: count, dtype: int64

In [None]:
df.duplicated().sum(
    
)

In [None]:
df.drop_duplicates(inplace=True)

In [115]:
X = df.drop(columns=["price"])
y = df["price"]

In [116]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
     

In [117]:
print("The size of train data is",X_train.shape)
print("The shape of test data is",X_test.shape)

The size of train data is (14103, 11)
The shape of test data is (3526, 11)


In [118]:
ohe_encode = ['transaction_type']
ordinal_encode = ['luxury_category','floor_category']
target_encode = ['regions','property_type']
std_encode = ['bedroom','bathroom','balconies_iter']
robust_encode = ['carpet_area']

In [119]:
luxury_order = ['Low', 'Medium', 'High']

floor_order = ['Low Floor', 'Mid Floor', 'High Floor']

In [120]:
from sklearn.preprocessing import FunctionTransformer

lt = FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=True)
y_train_pt = lt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = lt.transform(y_test.values.reshape(-1,1))



In [121]:
preprocessor = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False), ohe_encode),
        ("ordinal", OrdinalEncoder(categories=[luxury_order, floor_order]), ordinal_encode),
        
        ("num", StandardScaler(), std_encode),
        ("robust", RobustScaler(), robust_encode)
    ], remainder="passthrough", n_jobs=-1, force_int_remainder_cols=False,verbose_feature_names_out=False
)

preprocessor.set_output(transform="pandas")

In [122]:
# Creating a pipeline
pipeline = Pipeline([
    ('target_encoder', ce.TargetEncoder(cols=target_encode)),
    ('preprocessor', preprocessor)
    
])

In [123]:
# do data preprocessing

X_train_trans = pipeline.fit_transform(X_train,y_train)

X_test_trans = pipeline.transform(X_test)

In [80]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import optuna
from sklearn.metrics import mean_absolute_error

In [81]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

In [109]:
# build the best models

best_rf_params = {'n_estimators': 469,
 'criterion': 'squared_error',                
 'max_depth': 22,
 'min_samples_split': 4,
 'min_samples_leaf': 4,
 'max_features': None,
 'max_samples': 0.891044850651133}



best_lgbm_params = {'n_estimators': 132,
 'max_depth': 27,
 'learning_rate': 0.13267634953167556,
 'subsample': 0.9901200337561764,
 'min_child_weight': 7,
 'min_split_gain': 0.04574185885539227,
 'reg_lambda': 33.921562308342544}


best_rf = RandomForestRegressor(**best_rf_params)
best_lgbm = LGBMRegressor(**best_lgbm_params)

In [83]:
def objective(trial):
    with mlflow.start_run(nested=True):
        meta_model_name = trial.suggest_categorical("model",["LR","KNN","DT"])

        if meta_model_name == "LR":
            intercept_lr = trial.suggest_categorical("intercept_lr",[True,False])
            meta = LinearRegression(fit_intercept=intercept_lr)

        elif meta_model_name == "KNN":
            n_neighbors_knn = trial.suggest_int("n_neighbors_knn",1,15)
            weights_knn = trial.suggest_categorical("weights_knn",["uniform","distance"])
            meta = KNeighborsRegressor(n_neighbors=n_neighbors_knn,
                                        weights=weights_knn,n_jobs=-1)

        elif meta_model_name == "DT":
            max_depth_dt = trial.suggest_int("max_depth_dt",1,10)
            min_samples_split_dt = trial.suggest_int("min_samples_split_dt",2,10)
            min_samples_leaf_dt = trial.suggest_int("min_samples_leaf_dt",1,10)
            meta = DecisionTreeRegressor(max_depth=max_depth_dt,
                                        min_samples_split=min_samples_split_dt,
                                        min_samples_leaf=min_samples_leaf_dt,
                                        random_state=42)

        # log meta model name
        mlflow.log_param("meta_model_name",meta_model_name)

        # stacking regressor
        stacking_reg = StackingRegressor(estimators=[("rf",best_rf),
                                                    ("lgbm",best_lgbm)],
                                        final_estimator=meta,
                                        cv=5,n_jobs=-1)

        # build transformed regressor
        model = TransformedTargetRegressor(regressor=stacking_reg,
                                            transformer=lt)

        # train the model
        model.fit(X_train_trans,y_train)

        # get the predictions
        y_pred_test = model.predict(X_test_trans)

        # mean absoulte error
        error = mean_absolute_error(y_test,y_pred_test)

        # log error
        mlflow.log_metric("MAE",error)

        return error

In [84]:
# create optuna study
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    study.optimize(objective,n_trials=20,n_jobs=-1,show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score",study.best_value)

[I 2025-04-16 15:40:29,772] A new study created in memory with name: no-name-55ac2591-ca4a-4a71-be83-76c762be7e4c


  0%|          | 0/20 [00:00<?, ?it/s]

🏃 View run loud-gull-184 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4/runs/74f1dad7dd5c40ddb2553f0f57f96c06
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4
[I 2025-04-16 15:45:08,429] Trial 1 finished with value: 0.6418761982455575 and parameters: {'model': 'DT', 'max_depth_dt': 4, 'min_samples_split_dt': 9, 'min_samples_leaf_dt': 3}. Best is trial 1 with value: 0.6418761982455575.
🏃 View run gregarious-bird-713 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4/runs/f563cbb4f441404288e32cfa2ceebdcc
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4
[I 2025-04-16 15:45:29,193] Trial 5 finished with value: 0.6898579572905729 and parameters: {'model': 'KNN', 'n_neighbors_knn': 6, 'weights_knn': 'distance'}. Best is trial 1 with value: 0.6418761982455575.
🏃 View run puzzled-newt-808 at: https://dagshub.com/

In [85]:
# best parameter value

best_params = study.best_params

best_params

{'model': 'LR', 'intercept_lr': True}

In [86]:
# parameter value counts

study.trials_dataframe()["params_model"].value_counts()

params_model
LR     9
KNN    9
DT     2
Name: count, dtype: int64

In [87]:
# mean scores for each meta estimator type

study.trials_dataframe().groupby(by="params_model")['value'].mean().sort_values()

params_model
LR     0.624588
DT     0.641852
KNN    0.680586
Name: value, dtype: float64

In [88]:
# best score

study.best_value

0.624175873401006

In [27]:
# optimization history plot

optuna.visualization.plot_optimization_history(study)

In [124]:
stacking_reg = StackingRegressor(estimators=[("rf",best_rf),
                                                    ("lgbm",best_lgbm)],
                                        final_estimator=LinearRegression(),
                                        cv=5,n_jobs=-1)

# build transformed regressor
model = TransformedTargetRegressor(regressor=stacking_reg,
                                    transformer=lt)

# train the model
model.fit(X_train_trans,y_train)





In [125]:
# get the predictions
y_pred_train = model.predict(X_train_trans)
y_pred_test = model.predict(X_test_trans)



In [117]:
y_pred_train

array([0.92736509, 0.72858926, 1.52559934, ..., 3.24161242, 1.93431164,
       4.16084771])

In [126]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"The train error is {mean_absolute_error(y_train,y_pred_train):.2f} cr")
print(f"The test error is {mean_absolute_error(y_test,y_pred_test):.2f} cr")

The train error is 0.54 cr
The test error is 0.61 cr


In [127]:
print(f"The train r2 score is {r2_score(y_train,y_pred_train):.2f}")
print(f"The test r2 score is {r2_score(y_test,y_pred_test):.2f}")

The train r2 score is 0.78
The test r2 score is 0.69
