In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
import category_encoders as ce

In [2]:
# Initialise the DagsHub client for this repository with its MLflow option switched on.
import dagshub

dagshub.init(
    repo_owner='sourav664',
    repo_name='real-estate-price-prediction',
    mlflow=True,
)


In [3]:
# Point MLflow at the tracking server hosted for this repository on DagsHub.
import mlflow

mlflow.set_tracking_uri(
    "https://dagshub.com/sourav664/real-estate-price-prediction.mlflow"
)

In [4]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment(experiment_name="Exp 3 - LGBM_HP_Tuning")

<Experiment: artifact_location='mlflow-artifacts:/49c8bd5101d9444b81323ae964b315e0', creation_time=1744439756447, experiment_id='3', last_update_time=1744439756447, lifecycle_stage='active', name='Exp 3 - LGBM_HP_Tuning', tags={}>

In [5]:

# Set scikit-learn's global `transform_output` option to "pandas" so that
# transformers configured below hand back DataFrames rather than arrays.
from sklearn import set_config

set_config(transform_output="pandas")

In [6]:
# Load the raw listings file (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../data/raw/real_estatesv9.csv")

In [7]:
# Keep only the target and the features used for modelling.
df = df[
    [
        'price',
        'carpet_area',
        'bedroom',
        'bathroom',
        'transaction_type',
        'floor_category',
        'luxury_category',
        'property_type',
        'regions',
        'balconies_iter',
        'furnished_status_imputed',
        'additional_room',
    ]
]

In [8]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [9]:
# Drop exact duplicate rows. Rebinding (rather than `inplace=True`) avoids mutating
# a frame that was itself derived by column selection, which can trigger pandas'
# chained-assignment warnings, and keeps the cell safe to re-run.
df = df.drop_duplicates()

In [10]:
# Separate the target column from the predictor columns.
y = df["price"]
X = df.drop(columns="price")

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
     

In [12]:
# Report the (rows, columns) shape of the training and test feature frames.
print("The size of train data is",X_train.shape)
print("The shape of test data is",X_test.shape)

The size of train data is (14256, 11)
The shape of test data is (3564, 11)


In [13]:
# Column groups, one per preprocessing strategy applied further down.
ohe_encode = ["transaction_type"]                       # one-hot encoded
ordinal_encode = ["luxury_category", "floor_category"]  # ordinal encoded
target_encode = ["regions", "property_type"]            # target encoded
std_encode = ["bedroom", "bathroom", "balconies_iter"]  # standard scaled
robust_encode = ["carpet_area"]                         # robust scaled

In [14]:
# Category orderings handed to the ordinal encoder; list position becomes the code.
floor_order = ["Low Floor", "Mid Floor", "High Floor"]
luxury_order = ["Low", "Medium", "High"]

In [15]:
from sklearn.preprocessing import FunctionTransformer

# Target transform: log1p on the way in, expm1 on the way back out.
pt = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=True)

# The transformer works on 2-D input, so each target is reshaped to one column.
y_train_pt = pt.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_pt = pt.transform(y_test.to_numpy().reshape(-1, 1))



In [16]:
# Column-wise preprocessing; columns not named in any group are passed through as-is.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "ohe",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            ohe_encode,
        ),
        (
            "ordinal",
            OrdinalEncoder(categories=[luxury_order, floor_order]),
            ordinal_encode,
        ),
        ("num", StandardScaler(), std_encode),
        ("robust", RobustScaler(), robust_encode),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

# Request DataFrame output; as the cell's last expression this also displays the transformer.
preprocessor.set_output(transform="pandas")

In [17]:
# Feature pipeline: target-encode the high-cardinality columns first, then run
# the column-wise preprocessor on the result.
pipeline = Pipeline(
    steps=[
        ('target_encoder', ce.TargetEncoder(cols=target_encode)),
        ('preprocessor', preprocessor),
    ]
)

In [18]:
# Fit the feature pipeline on the training split only (the target encoder needs
# y_train), then reuse the fitted pipeline to transform the test split.
X_train_trans = pipeline.fit_transform(X_train, y=y_train)
X_test_trans = pipeline.transform(X_test)

In [19]:
from lightgbm import LGBMRegressor
import optuna
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [20]:
def objective(trial):
    """Optuna objective: cross-validated MAE of an LGBM regressor trained on log1p(price).

    Each call opens a nested MLflow run, samples one hyper-parameter set, logs it,
    scores the resulting model with 5-fold cross validation on the transformed
    training data and logs the mean absolute error.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 folds, measured against the
        untransformed ``y_train`` (lower is better).
    """
    with mlflow.start_run(nested=True):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 200),
            "max_depth": trial.suggest_int("max_depth", 1, 40),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.8),
            # NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only
            # takes effect when `subsample_freq` > 0; with the default of 0 this
            # parameter is likely inert - confirm before relying on its tuned value.
            "subsample": trial.suggest_float("subsample", 0.5, 1),
            "min_child_weight": trial.suggest_int("min_child_weight", 5, 20),
            "min_split_gain": trial.suggest_float("min_split_gain", 0, 10),
            "reg_lambda": trial.suggest_float("reg_lambda", 0, 100),
            "random_state": 42,
            "n_jobs": -1,
        }

        # log model parameters
        mlflow.log_params(params)

        lgbm_reg = LGBMRegressor(**params)
        # The wrapper fits on log1p(y) and maps predictions back with expm1,
        # so the score below is computed on the original target scale.
        model = TransformedTargetRegressor(regressor=lgbm_reg, transformer=pt)

        # Only the cross-validation score is used, so no separate full-data fit or
        # train/test predictions are made here; cross_val_score clones and fits the
        # model on every fold itself.
        # NOTE(review): X_train_trans comes from a pipeline (including the target
        # encoder) fitted on the whole training split, so the folds are not fully
        # independent of the held-out targets.
        cv_score = cross_val_score(
            model,
            X_train_trans,
            y_train,
            cv=5,
            scoring="neg_mean_absolute_error",
            n_jobs=-1,
        )

        # the scorer returns negated MAE; flip the sign to get an error to minimise
        mean_score = -(cv_score.mean())
        # log avg cross val error
        mlflow.log_metric("cross_val_error", mean_score)

        return mean_score

In [21]:
# create optuna study (the objective is an error, so it is minimised)
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    study.optimize(objective, n_trials=50, n_jobs=-1, show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score", study.best_value)

    # train the model on best parameters
    # `study.best_params` only contains the values sampled through `trial.suggest_*`;
    # the fixed settings used during tuning (random_state, n_jobs) must be passed
    # again, otherwise the final model differs from the configuration that was scored.
    best_lgbm = LGBMRegressor(**study.best_params, random_state=42, n_jobs=-1)

    # y_train_pt holds one column; flatten it so the regressor receives a 1-D target
    best_lgbm.fit(X_train_trans, np.ravel(y_train_pt))

    # get the predictions (still on the log1p scale)
    y_pred_train = best_lgbm.predict(X_train_trans)
    y_pred_test = best_lgbm.predict(X_test_trans)

    # get the actual predictions values (back on the original scale via expm1)
    y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1, 1))
    y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1, 1))

    # perform cross validation; cross_val_score clones the estimator, so the
    # already fitted `best_lgbm` is left untouched
    model = TransformedTargetRegressor(regressor=best_lgbm,
                                       transformer=pt)

    scores = cross_val_score(model,
                             X_train_trans,
                             y_train,
                             scoring="neg_mean_absolute_error",
                             cv=5, n_jobs=-1)

    # log metrics
    mlflow.log_metric("training_error", mean_absolute_error(y_train, y_pred_train_org))
    mlflow.log_metric("test_error", mean_absolute_error(y_test, y_pred_test_org))
    mlflow.log_metric("training_r2", r2_score(y_train, y_pred_train_org))
    mlflow.log_metric("test_r2", r2_score(y_test, y_pred_test_org))
    mlflow.log_metric("cross_val", - scores.mean())

    # log the best model
    # NOTE(review): this estimator was fitted on log1p(price), so its raw
    # predictions are on the log scale; consumers must apply expm1 themselves.
    mlflow.sklearn.log_model(best_lgbm, artifact_path="model")

[I 2025-06-04 15:35:16,627] A new study created in memory with name: no-name-e636a961-8cf7-46f0-a500-f517adda51e1


  0%|          | 0/50 [00:00<?, ?it/s]

🏃 View run overjoyed-croc-985 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3/runs/510ba981ef404338b0f13db49f073527
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3
[I 2025-06-04 15:35:26,446] Trial 1 finished with value: 0.7411798403044892 and parameters: {'n_estimators': 58, 'max_depth': 33, 'learning_rate': 0.3840646639057794, 'subsample': 0.5036824744150168, 'min_child_weight': 9, 'min_split_gain': 4.1122002589313205, 'reg_lambda': 40.75857970512712}. Best is trial 1 with value: 0.7411798403044892.
🏃 View run traveling-fox-785 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3/runs/c4044e450bb44cedb67134ce73bd4251
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3
🏃 View run dazzling-roo-242 at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3/runs/cf51bdd6f5964f808b92093



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 315
[LightGBM] [Info] Number of data points in the train set: 14256, number of used features: 11
[LightGBM] [Info] Start training from score 0.979667




🏃 View run best_model at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3/runs/1ff45ab6f7e8435a917aecd94f26ac91
🧪 View experiment at: https://dagshub.com/sourav664/real-estate-price-prediction.mlflow/#/experiments/3


In [46]:
# best score: lowest objective value (mean cross-validated MAE) found by the study

study.best_value

0.661745384496269

In [47]:
# Hyper-parameter values sampled in the study's best trial.
study.best_params

{'n_estimators': 132,
 'max_depth': 27,
 'learning_rate': 0.13267634953167556,
 'subsample': 0.9901200337561764,
 'min_child_weight': 7,
 'min_split_gain': 0.04574185885539227,
 'reg_lambda': 33.921562308342544}

In [48]:
# Pinned copy of the best hyper-parameters found by the study, so the model can be
# refit without re-running the search. `study.best_params` omits the fixed settings
# used during tuning, so random_state and n_jobs are restored here to make the
# refit match the tuned configuration.
lgm_params = {
    'n_estimators': 132,
    'max_depth': 27,
    'learning_rate': 0.13267634953167556,
    'subsample': 0.9901200337561764,
    'min_child_weight': 7,
    'min_split_gain': 0.04574185885539227,
    'reg_lambda': 33.921562308342544,
    'random_state': 42,
    'n_jobs': -1,
}

In [49]:
# Refit a LightGBM regressor with the pinned hyper-parameters, using the
# log1p-transformed training target.
lgbm = LGBMRegressor(**lgm_params)

lgbm.fit(X=X_train_trans, y=y_train_pt)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 315
[LightGBM] [Info] Number of data points in the train set: 14249, number of used features: 11
[LightGBM] [Info] Start training from score 0.979666




In [50]:
# Predict for both splits; the model was fitted on log1p(price), so these
# values are still on the log scale.
y_pred_train, y_pred_test = (
    lgbm.predict(features) for features in (X_train_trans, X_test_trans)
)

In [51]:
# Map the log-scale predictions back to the original target scale (expm1).
# The transformer expects 2-D input, hence the single-column reshape.
y_pred_train_org, y_pred_test_org = (
    pt.inverse_transform(log_pred.reshape(-1, 1))
    for log_pred in (y_pred_train, y_pred_test)
)

In [52]:
# Mean absolute error on each split, measured on the original target scale.
train_mae = mean_absolute_error(y_train, y_pred_train_org)
test_mae = mean_absolute_error(y_test, y_pred_test_org)

print(f"The train error is {train_mae:.2f} cr")
print(f"The test error is {test_mae:.2f} cr")

The train error is 0.62 cr
The test error is 0.63 cr


In [53]:
# Coefficient of determination on each split, measured on the original target scale.
train_r2 = r2_score(y_train, y_pred_train_org)
test_r2 = r2_score(y_test, y_pred_test_org)

print(f"The train r2 score is {train_r2:.2f}")
print(f"The test r2 score is {test_r2:.2f}")

The train r2 score is 0.75
The test r2 score is 0.71
