In [51]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import category_encoders as ce
import optuna
import xgboost as xgb
from lightgbm import LGBMRegressor
from plotly.io import show
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold,RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,TargetEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Do not truncate wide DataFrames when they are displayed: show every column.
pd.set_option('display.max_columns', None)

In [52]:
# Load the modelling dataset; the cells below expect it to contain a `price` column.
df = pd.read_csv('final_df.csv')

In [53]:
# Feature matrix: every column except the target.
x = df.drop(columns='price')

# Target is modelled on the log1p scale; the evaluation cell maps it back with expm1.
y = np.log1p(df['price'])

In [54]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:

# `agePossession`: fill gaps with the most frequent label, then one-hot encode
# (first level dropped, unseen labels ignored, dense output).
missing_pipe_cat_age = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
    ('Onehot',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False))
    ])

# `floorNum_cat`: same imputation, then integer codes; labels not seen during fit map to -1.
missing_pipe_cat_floor = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
    ])

# Area columns: median imputation followed by standardisation.
missing_pipe_num = Pipeline([
    ('Missing_Num', SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
     ])


# Column-wise preprocessing used by the model pipeline. Columns not listed below are
# forwarded unchanged (remainder='passthrough').
Target_preprocessor = ColumnTransformer([
    ('missing_pipe_cat_age',missing_pipe_cat_age,['agePossession']),
    ('missing_pipe_cat_floor',missing_pipe_cat_floor,['floorNum_cat']),
    ('missing_pipe_num',missing_pipe_num,[ 'super_built_up_area','built_up_area', 'carpet_area']),
    # Same unseen-label policy as the floor pipeline: a category that is absent from a
    # training fold is encoded as -1 instead of raising during cross-validation or prediction.
    # NOTE(review): without an explicit `categories=` the integer codes follow sorted label
    # order - confirm that this matches the intended ranking of `balcony` / `luxury_cat`.
    ('ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),['balcony','luxury_cat']),
    ('onehot',OneHotEncoder(drop='first', handle_unknown='ignore'), ['furnishing_type','property_type']),
    # `sector` is intentionally not handled here: the model pipeline target-encodes it in the
    # step that runs before this transformer, and it then reaches the model via passthrough.
    ('scaling',StandardScaler(),['bedRoom', 'bathroom', 'study room', 'servant room'])
    ],remainder='passthrough')



In [56]:
def objective(trial,preprocessor):
    """Optuna objective: mean cross-validated R^2 of an ExtraTrees pipeline.

    The pipeline target-encodes ``sector``, applies ``preprocessor`` and fits an
    ``ExtraTreesRegressor`` whose hyperparameters are sampled from ``trial``.
    Because the encoder and preprocessor are pipeline steps, they are re-fitted
    inside every CV fold rather than on the whole training set.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the regressor name and its hyperparameters. The unfitted
        pipeline built for the trial is stored under the ``'model'`` user
        attribute so it can be retrieved from the study afterwards.
    preprocessor : scikit-learn transformer
        Column preprocessor placed between the target encoder and the model.
        It is cloned, so the caller's instance is never fitted or shared
        between the pipelines stored on different trials.

    Returns
    -------
    float
        Mean R^2 over a shuffled 5-fold split (seed 0) of the notebook-level
        ``x_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If the sampled regressor name has no pipeline defined for it.
    """
    regressor_name = trial.suggest_categorical('regressor',['Extra_Tree'])

    # Fail loudly for an unsupported name; otherwise the function would reach the
    # cross-validation call with `model_pipe` never assigned.
    if regressor_name != 'Extra_Tree':
        raise ValueError(f"Unsupported regressor: {regressor_name!r}")

    # Parameter keys and sampling order are kept stable so that `study.best_params`
    # and the seeded sampler behave exactly as before.
    n_estimators = trial.suggest_int('model__n_estimators',30, 500)
    max_depth = trial.suggest_int('model__max_depth',5, 50)
    min_samples_split = trial.suggest_int('model__min_samples_split',2, 20)
    min_samples_leaf = trial.suggest_int('model__min_samples_leaf',1, 10)
    max_features = trial.suggest_categorical('model__max_features', ['sqrt', 'log2',0.3, 0.5, 0.7, 0.9])
    bootstrap = trial.suggest_categorical('model__bootstrap', [True, False])

    model_pipe = Pipeline([
        ('target',ce.TargetEncoder(cols=['sector'])),
        # Each trial gets its own unfitted copy, so fitting one stored pipeline later
        # does not mutate the shared preprocessor or the pipelines of other trials.
        ('preprocessor',clone(preprocessor)),
        ('model',ExtraTreesRegressor(random_state=0,n_estimators=n_estimators,max_depth=max_depth,
                                     min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,
                                     n_jobs=-1,max_features=max_features,bootstrap=bootstrap))
    ])

    # Keep the unfitted pipeline on the trial; the evaluation cell reads it back from
    # `study.best_trial.user_attrs['model']`.
    trial.set_user_attr('model',model_pipe)

    # The forest already uses all cores (n_jobs=-1), so the folds are run sequentially.
    cv_score = cross_val_score(model_pipe,x_train,y_train,cv = KFold(n_splits=5,shuffle=True,random_state=0),scoring='r2',n_jobs=1)

    return cv_score.mean()
       

In [57]:
# Maximise the objective (mean CV R^2); the seeded TPE sampler makes the search repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)

# 100 sequential trials, each scoring one hyperparameter configuration.
study.optimize(
    lambda trial: objective(trial, Target_preprocessor),
    n_trials=100,
    n_jobs=1,
)

[I 2025-08-29 10:04:56,732] A new study created in memory with name: no-name-c94ad7ba-6486-4bc3-b8da-4c723c7ea24b
[I 2025-08-29 10:04:58,483] Trial 0 finished with value: 0.8316059040180843 and parameters: {'regressor': 'Extra_Tree', 'model__n_estimators': 288, 'model__max_depth': 37, 'model__min_samples_split': 13, 'model__min_samples_leaf': 6, 'model__max_features': 0.7, 'model__bootstrap': True}. Best is trial 0 with value: 0.8316059040180843.
[I 2025-08-29 10:05:00,135] Trial 1 finished with value: 0.8699139736316523 and parameters: {'regressor': 'Extra_Tree', 'model__n_estimators': 297, 'model__max_depth': 47, 'model__min_samples_split': 3, 'model__min_samples_leaf': 1, 'model__max_features': 0.7, 'model__bootstrap': False}. Best is trial 1 with value: 0.8699139736316523.
[I 2025-08-29 10:05:00,725] Trial 2 finished with value: 0.8182471720727941 and parameters: {'regressor': 'Extra_Tree', 'model__n_estimators': 85, 'model__max_depth': 34, 'model__min_samples_split': 4, 'model__mi

In [58]:
# Hyperparameters sampled by the best-scoring trial.
study.best_params

{'regressor': 'Extra_Tree',
 'model__n_estimators': 263,
 'model__max_depth': 24,
 'model__min_samples_split': 3,
 'model__min_samples_leaf': 1,
 'model__max_features': 0.7,
 'model__bootstrap': False}

In [59]:
# Objective value of the best trial, i.e. its mean cross-validated R^2 on the log1p target.
study.best_value

0.8701468690449927

In [60]:
# Take the (unfitted) pipeline stored on the best trial and fit it on the whole training split.
final_model = study.best_trial.user_attrs['model']
final_model.fit(x_train,y_train)

# The model predicts on the log1p scale it was trained on.
y_pred_log = final_model.predict(x_test)

# Map predictions and targets back to the price scale under separate names.
# `y_test` itself is left untouched: overwriting it would make a second run of this
# cell apply expm1 twice and report wrong metrics.
y_pred = np.expm1(y_pred_log)
y_test_price = np.expm1(y_test)

# Hold-out metrics on the original price scale (the CV score above is on the log1p scale,
# so the two R^2 values are not directly comparable).
print(f"Test_MSE:{mean_squared_error(y_test_price,y_pred)}")
print(f"Test_MAE:{mean_absolute_error(y_test_price,y_pred)}")
print(f"Test_R2:{r2_score(y_test_price,y_pred)}")

Test_MSE:1.496080557287415
Test_MAE:0.4840356889392869
Test_R2:0.7993252671947587


In [61]:
# Plot how the objective value evolved over the trials of the study.
fig1 = optuna.visualization.plot_optimization_history(study)
show(fig1)

In [62]:
# Plot Optuna's estimate of how much each sampled hyperparameter influenced the objective.
fig2 = optuna.visualization.plot_param_importances(study)
show(fig2)