In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
import category_encoders as ce
import seaborn as sns

In [2]:
import dagshub
dagshub.init(repo_owner='sourav664', repo_name='real-estate-price-prediction', mlflow=True)


In [3]:
import mlflow
mlflow.set_tracking_uri("https://dagshub.com/sourav664/real-estate-price-prediction.mlflow")

In [4]:
# mlflow experiment

mlflow.set_experiment("Exp 4 - Stacking Regressor_HP_Tuning")

<Experiment: artifact_location='mlflow-artifacts:/760b0107b85b458fac789bd242031649', creation_time=1744694464581, experiment_id='4', last_update_time=1744694464581, lifecycle_stage='active', name='Exp 4 - Stacking Regressor_HP_Tuning', tags={}>

In [2]:

from sklearn import set_config

set_config(transform_output="pandas")

In [3]:
df = pd.read_csv("../data/raw/real_estatesv9.csv")

In [4]:
df = df[['price','carpet_area','bedroom','bathroom','transaction_type','floor_category','luxury_category','property_type','regions','balconies_iter','furnished_status_imputed','additional_room']]

In [5]:
df.duplicated().sum(
    
)

0

In [6]:
df = df[~(df['price'] < 0.3)]

In [7]:
df['price'].describe()

count    17653.000000
mean         2.134978
std          2.257146
min          0.300000
25%          0.820000
50%          1.400000
75%          2.500000
max         20.000000
Name: price, dtype: float64

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
X = df.drop(columns=["price"])
y = df["price"]

In [10]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
     

In [11]:
print("The size of train data is",X_train.shape)
print("The shape of test data is",X_test.shape)

The size of train data is (14122, 11)
The shape of test data is (3531, 11)


In [33]:
df['regions'].value_counts()

regions
bangalore        5886
navi mumbai      1426
pune             1415
noida            1394
mumbai           1369
greater noida    1359
beyond thane     1320
new delhi        1311
gurgaon          1288
thane             885
Name: count, dtype: int64

In [34]:
ohe_encode = ['transaction_type','regions','property_type']
ordinal_encode = ['luxury_category','floor_category']
# target_encode = ['regions','property_type']
std_encode = ['bedroom','bathroom','balconies_iter']
robust_encode = ['carpet_area']

In [35]:
luxury_order = ['Low', 'Medium', 'High']

floor_order = ['Low Floor', 'Mid Floor', 'High Floor']

In [14]:
from sklearn.preprocessing import FunctionTransformer

lt = FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=True)
y_train_pt = lt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = lt.transform(y_test.values.reshape(-1,1))



In [36]:
preprocessor = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False), ohe_encode),
        ("ordinal", OrdinalEncoder(categories=[luxury_order, floor_order]), ordinal_encode),
        
        ("num", StandardScaler(), std_encode),
        ("robust", RobustScaler(), robust_encode)
    ], remainder="passthrough", n_jobs=-1, force_int_remainder_cols=False,verbose_feature_names_out=False
)

preprocessor.set_output(transform="pandas")

In [37]:
# Creating a pipeline
pipeline = Pipeline([
   
    ('preprocessor', preprocessor)
    
])

In [38]:
# do data preprocessing

X_train_trans = pipeline.fit_transform(X_train)

X_test_trans = pipeline.transform(X_test)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
import optuna
from sklearn.metrics import mean_absolute_error

In [21]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

In [18]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
from sklearn.metrics import r2_score, mean_absolute_error

In [19]:
# build the best models

best_rf_params = {'n_estimators': 469,
 'criterion': 'squared_error',                
 'max_depth': 22,
 'min_samples_split': 4,
 'min_samples_leaf': 4,
 'max_features': None,
 'max_samples': 0.891044850651133}



best_lgbm_params = {'n_estimators': 132,
 'max_depth': 27,
 'learning_rate': 0.13267634953167556,
 'subsample': 0.9901200337561764,
 'min_child_weight': 7,
 'min_split_gain': 0.04574185885539227,
 'reg_lambda': 33.921562308342544}


best_rf = RandomForestRegressor(**best_rf_params)
best_lgbm = LGBMRegressor(**best_lgbm_params)

In [25]:
def objective(trial):
    with mlflow.start_run(nested=True):
        
       
        meta_model_name = trial.suggest_categorical("model",["LR","SVM","DT"])

        if meta_model_name == "LR":
            intercept_lr = trial.suggest_categorical("intercept_lr",[True,False])
            meta = LinearRegression(fit_intercept=intercept_lr)

        elif meta_model_name == "SVM":
            kernel_svm = trial.suggest_categorical("kernel",["rbf","linear", "poly"])
            if kernel_svm == "linear":
                c_linear = trial.suggest_float("c_linear",0,10)
                meta = SVR(kernel="linear", C=c_linear)
                
            elif kernel_svm == "poly":
                c_poly = trial.suggest_float("c_poly",0,10)
                degree_poly = trial.suggest_int("degree_poly",1,5)
                meta = SVR(kernel="poly", C=c_poly, degree=degree_poly)
                
            else:
                c_rbf = trial.suggest_float("c_rbf",0,100)
                gamma_rbf = trial.suggest_float("gamma_rbf",0,10)
                meta = SVR(kernel="rbf", C=c_rbf, gamma=gamma_rbf)

        elif meta_model_name == "DT":
            max_depth_dt = trial.suggest_int("max_depth_dt",1,10)
            min_samples_split_dt = trial.suggest_int("min_samples_split_dt",2,10)
            min_samples_leaf_dt = trial.suggest_int("min_samples_leaf_dt",1,10)
            meta = DecisionTreeRegressor(max_depth=max_depth_dt,
                                        min_samples_split=min_samples_split_dt,
                                        min_samples_leaf=min_samples_leaf_dt,
                                        random_state=42)

        
        
        
     
     
        # log meta params
        mlflow.log_params(meta.get_params())
        
        
        
        
        

        # stacking regressor
        stacking_reg = StackingRegressor(estimators=[("rf",best_rf),
                                                    ("lgbm",best_lgbm)],
                                        final_estimator=meta,
                                        cv=5,n_jobs=-1)

        # build transformed regressor
        model = TransformedTargetRegressor(regressor=stacking_reg,
                                            transformer=lt)

        # train the model
        model.fit(X_train_trans,y_train)
        
        # log model params
        mlflow.log_params(model.get_params())

        # get the predictions
        y_pred_test = model.predict(X_test_trans)

        # mean absoulte error
        error = mean_absolute_error(y_test,y_pred_test)

        # log error
        mlflow.log_metric("MAE",error)

        return error

In [None]:
# create optuna study
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    study.optimize(objective,n_trials=50,n_jobs=-1,show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score",study.best_value)

[I 2025-06-06 13:18:48,452] A new study created in memory with name: no-name-f4a5fe7b-b88a-4552-848b-9c6573e923e1


  0%|          | 0/50 [00:00<?, ?it/s]

🏃 View run able-crow-470 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4/runs/0ea629748ed14f50aa25cb9e5a6ef099
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4
[I 2025-06-06 13:21:43,118] Trial 4 finished with value: 0.7920241215229639 and parameters: {'model': 'DT', 'max_depth_dt': 2, 'min_samples_split_dt': 10, 'min_samples_leaf_dt': 10}. Best is trial 4 with value: 0.7920241215229639.
🏃 View run peaceful-croc-45 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4/runs/dc10da9694bb404e91fd1ed63b9696da
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/4
[I 2025-06-06 13:22:16,088] Trial 6 finished with value: 0.6800548148633287 and parameters: {'model': 'DT', 'max_depth_dt': 5, 'min_samples_split_dt': 9, 'min_samples_leaf_dt': 2}. Best is trial 6 with value: 0.6800548148633287.
🏃 View run receptive-sheep-389 

In [25]:
# best parameter value

best_params = study.best_params

best_params

{'model': 'LR',
 'intercept_lr': True,
 'model1_model': 'XGB',
 'model1_n_estimators_xgb': 37,
 'model1_learning_rate_xgb': 0.3172907307555775,
 'model1_max_depth_xgb': 14,
 'model2_model': 'XGB',
 'model2_n_estimators_xgb': 186,
 'model2_learning_rate_xgb': 0.10059734503729122,
 'model2_max_depth_xgb': 5}

In [86]:
# parameter value counts

study.trials_dataframe()["params_model"].value_counts()

params_model
LR     9
KNN    9
DT     2
Name: count, dtype: int64

In [26]:
# mean scores for each meta estimator type

study.trials_dataframe().groupby(by="params_model")['value'].mean().sort_values()

params_model
LR     0.676152
SVM    0.684364
DT     0.880643
Name: value, dtype: float64

In [88]:
# best score

study.best_value

0.624175873401006

In [27]:
# optimization history plot

optuna.visualization.plot_optimization_history(study)

In [23]:
from sklearn.linear_model import LinearRegression

In [39]:
stacking_reg = StackingRegressor(estimators=[("rf",best_rf),
                                                    ("lgbm",best_lgbm)],
                                        final_estimator=LinearRegression(fit_intercept=True),
                                        cv=5,n_jobs=-1)

# build transformed regressor
model = TransformedTargetRegressor(regressor=stacking_reg,
                                    transformer=lt)

# train the model
model.fit(X_train_trans,y_train)





In [40]:
# get the predictions
y_pred_train = model.predict(X_train_trans)
y_pred_test = model.predict(X_test_trans)



In [117]:
y_pred_train

array([0.92736509, 0.72858926, 1.52559934, ..., 3.24161242, 1.93431164,
       4.16084771])

In [41]:
from sklearn.metrics import mean_absolute_error, r2_score

print(f"The train error is {mean_absolute_error(y_train,y_pred_train):.2f} cr")
print(f"The test error is {mean_absolute_error(y_test,y_pred_test):.2f} cr")

The train error is 0.59 cr
The test error is 0.67 cr


In [42]:
print(f"The train r2 score is {r2_score(y_train,y_pred_train):.2f}")
print(f"The test r2 score is {r2_score(y_test,y_pred_test):.2f}")

The train r2 score is 0.75
The test r2 score is 0.71
