In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold,RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import optuna
from plotly.io import show

pd.set_option('display.max_columns', None)

In [82]:
df = pd.read_csv('outliers_treated.csv')

In [83]:
df.drop(columns=['Unnamed: 0','price_per_sqft','areaWithType','outliers','area_room_ratio','store room','pooja room','others','area'],inplace=True)

df['luxury_cat'] = pd.cut(df['luxury_score'],bins=[-1,50,150,175],labels=['low','medium','high'])
df['floorNum_cat'] = pd.cut(df['floorNum'],bins=[-1,3,10,51],labels=['low','medium','high'])

df.drop(columns=['luxury_score','facing','floorNum','society'],inplace=True)

df.loc[df['agePossession'] == 'Undefined','agePossession'] = np.nan

df['furnishing_type'] = df['furnishing_type'].replace({0:'unfurnished',1:'semi_furnished',2:'furnished'})

In [84]:
x = df.drop(columns=['price'])
y = df['price']

In [85]:
y = np.log1p(y)

In [86]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [87]:
def objective(trial):

    missing_pipe_cat_age = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
    ('Onehot',OneHotEncoder(drop='first',handle_unknown='ignore'))
    ])

    missing_pipe_cat_floor = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder())
    ])

    missing_pipe_num = Pipeline([
    ('Missing_Num', SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
     ])


    Target_preprocessor = ColumnTransformer([
    ('missing_pipe_cat_age',missing_pipe_cat_age,['agePossession']),
    ('missing_pipe_cat_floor',missing_pipe_cat_floor,['floorNum_cat']),
    ('missing_pipe_num',missing_pipe_num,[ 'super_built_up_area','built_up_area', 'carpet_area']),
    ('ordinal',OrdinalEncoder(),['balcony','luxury_cat']),
    ('onehot',OneHotEncoder(drop='first', handle_unknown='ignore'), ['furnishing_type','property_type']),
    ('target',ce.TargetEncoder(),['sector']),
    ('scaling',StandardScaler(),['bedRoom', 'bathroom', 'study room', 'servant room'])
    ],remainder='passthrough')


    regressor_name = trial.suggest_categorical('regressor',['xgboost'])


    if regressor_name == 'xgboost':

       gamma = trial.suggest_int('model__gamma',0,10)
       reg_lambda = trial.suggest_float('model__reg_lambda',0.1,10)
       n_estimators = trial.suggest_int('model__n_estimators',100,1000)
       eta = trial.suggest_float('model__eta',0.01,0.5)
       min_child_weight = trial.suggest_int('model__min_child_weight',1,50)
       subsample = trial.suggest_float('model__subsample',0.5,1.0)
       colsample_bynode = trial.suggest_float('model__colsample_bynode',0.5,1.0)
       max_depth = trial.suggest_int('model__max_depth',3,6)
       tree_method = trial.suggest_categorical('model__tree_method',['hist'])
       max_bin = trial.suggest_int('model__max_bin',64,256)



       model_pipe = Pipeline([
       ('preprocessor',Target_preprocessor),
       ('model',XGBRegressor(random_state=0,gamma=gamma,reg_lambda=reg_lambda,n_estimators=n_estimators,eta=eta,
                            min_child_weight=min_child_weight,max_bin=max_bin,
                            subsample=subsample,colsample_bynode=colsample_bynode,max_depth=max_depth,tree_method=tree_method))    

       ])

       
       
       trial.set_user_attr('model',model_pipe)

       cv_score = cross_val_score(model_pipe,x,y,cv=5,scoring='r2')

       return cv_score.mean()
       

In [None]:
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective,n_trials=50,n_jobs=-1)

best_params = study.best_params
best_score = study.best_value


print(f"Best Cross Val Training score: {best_score}")


[I 2025-08-28 15:51:21,907] A new study created in memory with name: no-name-be6acd23-aef7-43d0-bc25-563467a4e0ea
[I 2025-08-28 15:51:25,145] Trial 0 finished with value: 0.7729263758710296 and parameters: {'regressor': 'xgboost', 'model__gamma': 9, 'model__reg_lambda': 1.322088554517635, 'model__n_estimators': 356, 'model__eta': 0.12462390356024645, 'model__min_child_weight': 25, 'model__subsample': 0.6797759215757097, 'model__colsample_bynode': 0.7487505113494488, 'model__max_depth': 4, 'model__tree_method': 'hist', 'model__max_bin': 94}. Best is trial 0 with value: 0.7729263758710296.
[I 2025-08-28 15:51:25,599] Trial 5 finished with value: 0.7746682041957743 and parameters: {'regressor': 'xgboost', 'model__gamma': 7, 'model__reg_lambda': 2.1531140807664317, 'model__n_estimators': 430, 'model__eta': 0.3200795872998935, 'model__min_child_weight': 25, 'model__subsample': 0.553111852518445, 'model__colsample_bynode': 0.533936208267425, 'model__max_depth': 5, 'model__tree_method': 'hist

Best Cross Val Training accuracy: 0.8813979550995381


In [89]:
study.best_params

{'regressor': 'xgboost',
 'model__gamma': 0,
 'model__reg_lambda': 5.344362124446211,
 'model__n_estimators': 381,
 'model__eta': 0.15928244596415175,
 'model__min_child_weight': 18,
 'model__subsample': 0.8777139152416464,
 'model__colsample_bynode': 0.9487130235618204,
 'model__max_depth': 4,
 'model__tree_method': 'hist',
 'model__max_bin': 170}

In [90]:
study.best_value

0.8813979550995381

In [None]:
final_model = study.best_trial.user_attrs['model']
final_model.fit(x_train,y_train)
y_pred = final_model.predict(x_test)

y_pred = np.expm1(y_pred)


print(f"Test_r2_score:{r2_score(np.expm1(y_test), y_pred)}")
print(f"Test_mse_score:{mean_squared_error(np.expm1(y_test), y_pred)}")

print(f"Test_mae_score:{mean_absolute_error(np.expm1(y_test), y_pred)}")


Test_r2_score:0.8382921571080147
Test_mse_score:1.044808453068846
Test_mae_score:0.49423883482119807


In [93]:
fig1 = optuna.visualization.plot_optimization_history(study)
fig2 = optuna.visualization.plot_param_importances(study)
show(fig1)
show(fig2)

# Importing Model

In [94]:
import pickle

with open('final_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)

In [96]:
# Open and read the pickle file
with open('final_model.pkl', 'rb') as file:  # 'rb' means read binary
    data = pickle.load(file)

print(data)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('missing_pipe_cat_age',
                                                  Pipeline(steps=[('Missing_Cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('Onehot',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  ['agePossession']),
                                                 ('missing_pipe_cat_floor',
                                                  Pipeline(steps=[('Missing_Cat',
                                                                   SimpleImputer(strategy='m...
                              eval_m

# Predicting (same format as the training df)

In [97]:
x

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,super_built_up_area,built_up_area,carpet_area,study room,servant room,furnishing_type,luxury_cat,floorNum_cat
0,flat,sector 36,3,2,2,New Property,1081.0,,650.0,0,0,unfurnished,low,low
1,flat,sector 89,2,2,2,New Property,,,1103.0,1,1,unfurnished,low,medium
2,flat,sohna road,2,2,1,New Property,,1000.0,585.0,0,0,unfurnished,low,high
3,flat,sector 92,3,4,3+,Relatively New,1995.0,1615.0,1476.0,0,1,semi_furnished,high,medium
4,flat,sector 102,2,2,1,Relatively New,632.0,,532.0,0,0,unfurnished,high,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,flat,sector 84,2,2,1,Relatively New,583.0,,483.0,0,0,unfurnished,medium,medium
3645,house,sector 109,5,5,3+,Relatively New,,6228.0,,1,1,unfurnished,high,low
3646,flat,sector 2,1,1,1,Moderately Old,735.0,,,0,0,semi_furnished,medium,medium
3647,house,sector 43,5,6,3,Moderately Old,,5490.0,,1,1,unfurnished,medium,low


In [98]:
columns = ['property_type', 'sector','bedRoom', 'bathroom', 'balcony',
       'agePossession', 'super_built_up_area', 'built_up_area', 'carpet_area',
       'study room', 'servant room', 'furnishing_type', 'luxury_cat',
       'floorNum_cat']

In [105]:
x.iloc[1]

property_type                  flat
sector                    sector 89
bedRoom                           2
bathroom                          2
balcony                           2
agePossession          New Property
super_built_up_area             NaN
built_up_area                   NaN
carpet_area                  1103.0
study room                        1
servant room                      1
furnishing_type         unfurnished
luxury_cat                      low
floorNum_cat                 medium
Name: 1, dtype: object

In [100]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property',3000, 2750,2500, 0, 0, 'unfurnished', 'low', 'low']]

data = pd.DataFrame(data,columns=columns)
data

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,super_built_up_area,built_up_area,carpet_area,study room,servant room,furnishing_type,luxury_cat,floorNum_cat
0,house,sector 102,4,3,3+,New Property,3000,2750,2500,0,0,unfurnished,low,low


In [101]:
data['property_type'] = data['property_type'].astype('category')
data['sector'] = data['sector'].astype('category')
data['balcony'] = data['balcony'].astype('category')
data['agePossession'] = data['agePossession'].astype('category')
data['furnishing_type'] = data['furnishing_type'].astype('category')

In [102]:
np.expm1(final_model.predict(data))

array([3.8489723], dtype=float32)