In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,TargetEncoder,PowerTransformer
from sklearn.compose import ColumnTransformer,TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold,RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import optuna
from plotly.io import show
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [4]:
df = pd.read_csv('final_data.csv')
df = df.drop(['Title','Price_per_sqft','Is_in_Dhaka'],axis=1)
df.head()

Unnamed: 0,Bedrooms,Bathrooms,Floor_area,City,location_area,Price_in_Cr,floor_level
0,3.0,4.0,1960.0,dhaka,gulshan,3.9,low
1,3.0,3.0,1705.0,dhaka,kalabagan,1.69,low
2,3.0,3.0,1370.0,dhaka,dhanmondi,1.25,medium
3,3.0,3.0,2125.0,dhaka,bashundhara,2.0,medium
4,3.0,3.0,2687.0,dhaka,banani,4.75,medium


In [5]:
x = df.drop('Price_in_Cr',axis=1)
y = df['Price_in_Cr']

# Optuna

In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)


floor_area_pipe = Pipeline([
    ('log_transform',PowerTransformer(method='yeo-johnson')),
    ('scalar',StandardScaler())
])

transformer = ColumnTransformer([
    ("floor_area",floor_area_pipe,['Floor_area']),
    ("log_transform",PowerTransformer(method='yeo-johnson'),['Floor_area']),
    ('Onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),['City','Bedrooms','Bathrooms','location_area']),
    ('Ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),['floor_level']),
],remainder='passthrough')


def objective(trial,preprocessor):
    regressor_name = trial.suggest_categorical('regressor',['GradientBoostingRegressor'])

    if regressor_name == 'GradientBoostingRegressor':
        n_estimators = trial.suggest_int('model__n_estimators',50,300)
        learning_rate = trial.suggest_float('model__learning_rate',0.05,0.3)
        max_features = trial.suggest_categorical('model__max_features',['sqrt', 'log2'])
        subsample = trial.suggest_float('model__subsample',0.7,1.0)
        max_depth = trial.suggest_int('model__max_depth',3,6)
        min_samples_leaf = trial.suggest_int('model__min_samples_leaf',1,5)
        min_samples_split = trial.suggest_int('model__min_samples_split',2,5)


    model_pipe = Pipeline([
        ('preprocessor',transformer),
        ('model',GradientBoostingRegressor(n_estimators=n_estimators,learning_rate=learning_rate,max_features=max_features,
                                           max_depth=max_depth,min_samples_leaf=min_samples_leaf,
                                           min_samples_split=min_samples_split,subsample=subsample,random_state=0))
                                           ])

    final_pipe = TransformedTargetRegressor(regressor=model_pipe,func=np.log1p,inverse_func=np.expm1)

    trial.set_user_attr('model',final_pipe)

    cv_score = cross_val_score(final_pipe,x_train,y_train,cv=KFold(n_splits=10,shuffle=True,random_state=0),scoring='neg_mean_squared_error')
    return cv_score.mean()


study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=0),pruner=optuna.pruners.HyperbandPruner())
study.optimize(lambda trial : objective(trial,preprocessor=transformer),n_trials=100,n_jobs=-1)
print(f'Best neg score: {study.best_value}')
print(f'Best Params: {study.best_params}')

final_model = study.best_trial.user_attrs['model']
final_model.fit(x_train,y_train)
y_pred = final_model.predict(x_test)

print(f'MSE: {mean_squared_error(y_test,y_pred)}')
print(f'MAE: {mean_absolute_error(y_test,y_pred)}')
print(f'R2: {r2_score(y_test,y_pred)}')

[I 2025-09-26 17:35:59,016] A new study created in memory with name: no-name-17841f17-36de-49e5-8e2c-83fbdffb7243
[I 2025-09-26 17:36:06,824] Trial 2 finished with value: -0.10443517900930888 and parameters: {'regressor': 'GradientBoostingRegressor', 'model__n_estimators': 62, 'model__learning_rate': 0.11538484407003717, 'model__max_features': 'sqrt', 'model__subsample': 0.8296169748742077, 'model__max_depth': 4, 'model__min_samples_leaf': 5, 'model__min_samples_split': 3}. Best is trial 2 with value: -0.10443517900930888.
[I 2025-09-26 17:36:08,442] Trial 11 finished with value: -0.07920499876749743 and parameters: {'regressor': 'GradientBoostingRegressor', 'model__n_estimators': 79, 'model__learning_rate': 0.13881757032405923, 'model__max_features': 'sqrt', 'model__subsample': 0.9991018243666185, 'model__max_depth': 6, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2}. Best is trial 11 with value: -0.07920499876749743.
[I 2025-09-26 17:36:08,646] Trial 1 finished with valu

Best neg score: -0.06200511376757413
Best Params: {'regressor': 'GradientBoostingRegressor', 'model__n_estimators': 134, 'model__learning_rate': 0.2700045559489739, 'model__max_features': 'sqrt', 'model__subsample': 0.9961388057454358, 'model__max_depth': 4, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5}
MSE: 0.06276607269743066
MAE: 0.14593791828398273
R2: 0.8718923585338596
