In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold,RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import optuna
from plotly.io import show

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('outliers_treated.csv')

In [3]:
df.drop(columns=['Unnamed: 0','price_per_sqft','areaWithType','outliers','area_room_ratio','store room','pooja room','others','area'],inplace=True)

df['luxury_cat'] = pd.cut(df['luxury_score'],bins=[-1,50,150,175],labels=['low','medium','high'])
df['floorNum_cat'] = pd.cut(df['floorNum'],bins=[-1,3,10,51],labels=['low','medium','high'])

df.drop(columns=['luxury_score','facing','floorNum','society'],inplace=True)

df.loc[df['agePossession'] == 'Undefined','agePossession'] = np.nan

df['furnishing_type'] = df['furnishing_type'].replace({0:'unfurnished',1:'semi_furnished',2:'furnished'})

In [4]:
df['property_type'] = df['property_type'].astype('category')
df['sector'] = df['sector'].astype('category')
df['balcony'] = df['balcony'].astype('category')
df['agePossession'] = df['agePossession'].astype('category')
df['furnishing_type'] = df['furnishing_type'].astype('category')

In [5]:
x = df.drop(columns=['price'])
y = df['price']

In [178]:
y = np.log1p(y)

In [179]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [180]:
missing_pipe_cat_age = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
    ('Onehot',OneHotEncoder(drop='first',handle_unknown='ignore'))
    ])

missing_pipe_cat_floor = Pipeline([
    ('Missing_Cat', SimpleImputer(strategy='most_frequent')),
    ('ordinal',OrdinalEncoder())
    ])

missing_pipe_num = Pipeline([
    ('Missing_Num', SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
     ])


Target_preprocessor = ColumnTransformer([
    ('missing_pipe_cat_age',missing_pipe_cat_age,['agePossession']),
    ('missing_pipe_cat_floor',missing_pipe_cat_floor,['floorNum_cat']),
    ('missing_pipe_num',missing_pipe_num,[ 'super_built_up_area','built_up_area', 'carpet_area']),
    ('ordinal',OrdinalEncoder(),['balcony','luxury_cat']),
    ('onehot',OneHotEncoder(drop='first', handle_unknown='ignore'), ['furnishing_type','property_type']),
    # ('target',ce.TargetEncoder(),['sector']),
    ('scaling',StandardScaler(),['bedRoom', 'bathroom', 'study room', 'servant room'])
    ],remainder='passthrough')

In [181]:
def objective(trial,preprocessor):

    regressor_name = trial.suggest_categorical('regressor',['xgboost'])


    if regressor_name == 'xgboost':

       gamma = trial.suggest_int('model__gamma',0,8)
       reg_lambda = trial.suggest_float('model__reg_lambda',0.1,10)
       n_estimators = trial.suggest_int('model__n_estimators',50,500)
       eta = trial.suggest_float('model__eta',0.01,1)
       min_child_weight = trial.suggest_int('model__min_child_weight',1,50)
       subsample = trial.suggest_float('model__subsample',0.5,1.0)
       colsample_bytree = trial.suggest_float('model__colsample_bytree',0.5,1.0)
       max_depth = trial.suggest_int('model__max_depth',3,10)
       tree_method = trial.suggest_categorical('model__tree_method',['hist'])
       max_bin = trial.suggest_int('model__max_bin',64,256)



       model_pipe = Pipeline([
       ('target',ce.TargetEncoder(cols=['sector'])),
       ('preprocessor',preprocessor),
       ('model',XGBRegressor(random_state=0,gamma=gamma,reg_lambda=reg_lambda,n_estimators=n_estimators,eta=eta,
                            min_child_weight=min_child_weight,max_bin=max_bin,
                            subsample=subsample,colsample_bytree=colsample_bytree,max_depth=max_depth,tree_method=tree_method))    

       ])

       
       
       trial.set_user_attr('model',model_pipe)

       cv_score = cross_val_score(model_pipe,x_train,y_train,cv=KFold(n_splits=5,shuffle=True,random_state=0),
                                  scoring='r2')
       
       trial.set_user_attr('cv_std',cv_score.std())

       return cv_score.mean()
       

In [182]:
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(lambda trial: objective(trial,Target_preprocessor),n_trials=100,n_jobs=1)

[I 2025-08-29 10:01:51,416] A new study created in memory with name: no-name-06ae7ab7-97e2-4062-ad3e-d448935741ae
[I 2025-08-29 10:01:51,889] Trial 0 finished with value: 0.7879698564870256 and parameters: {'regressor': 'xgboost', 'model__gamma': 4, 'model__reg_lambda': 7.180374727086953, 'model__n_estimators': 321, 'model__eta': 0.5494343511669278, 'model__min_child_weight': 22, 'model__subsample': 0.8229470565333281, 'model__colsample_bytree': 0.7187936056313462, 'model__max_depth': 10, 'model__tree_method': 'hist', 'model__max_bin': 249}. Best is trial 0 with value: 0.7879698564870256.
[I 2025-08-29 10:01:52,282] Trial 1 finished with value: 0.7847614416503703 and parameters: {'regressor': 'xgboost', 'model__gamma': 3, 'model__reg_lambda': 7.9380778770183795, 'model__n_estimators': 288, 'model__eta': 0.572364115482993, 'model__min_child_weight': 47, 'model__subsample': 0.5355180290989434, 'model__colsample_bytree': 0.5435646498507704, 'model__max_depth': 3, 'model__tree_method': 'hi

In [183]:
study.best_params

{'regressor': 'xgboost',
 'model__gamma': 0,
 'model__reg_lambda': 8.49270037283895,
 'model__n_estimators': 157,
 'model__eta': 0.13395665226132697,
 'model__min_child_weight': 1,
 'model__subsample': 0.918106100688268,
 'model__colsample_bytree': 0.8607389343401065,
 'model__max_depth': 10,
 'model__tree_method': 'hist',
 'model__max_bin': 223}

In [184]:
study.best_value

0.8833253632662788

In [185]:
cv_std = study.best_trial.user_attrs['cv_std']
cv_std

np.float64(0.0023058779278119436)

# final model

In [186]:
final_model = study.best_trial.user_attrs['model']
final_model.fit(x_train,y_train)
y_pred = final_model.predict(x_test)

y_pred = np.expm1(y_pred)


print(f"Test_r2_score:{r2_score(np.expm1(y_test), y_pred)}")
print(f"Test_mse_score:{mean_squared_error(np.expm1(y_test), y_pred)}")

print(f"Test_mae_score:{mean_absolute_error(np.expm1(y_test), y_pred)}")


Test_r2_score:0.8368348565246672
Test_mse_score:1.054224199027196
Test_mae_score:0.4640351533350879


In [187]:
y_test.max()

np.float64(3.044522437723423)

In [188]:
fig1 = optuna.visualization.plot_optimization_history(study)
fig2 = optuna.visualization.plot_param_importances(study)
show(fig1)
show(fig2)

# Exporting Model

In [None]:
# import pickle

# with open('final_model.pkl', 'wb') as file:
#     pickle.dump(final_model, file)

In [7]:
with open('df.pkl', 'wb') as file:
    pickle.dump(x, file)

In [11]:
x['floorNum_cat'].unique()

['low', 'medium', 'high', NaN]
Categories (3, object): ['low' < 'medium' < 'high']

## Saving final dataframe

In [190]:
# df.to_csv('final_df.csv')

# Importing Model

In [191]:
# # Open and read the pickle file
# with open('final_model.pkl', 'rb') as file:  # 'rb' means read binary
#     data = pickle.load(file)

# print(data)

# Predicting (same format as the training df)

In [192]:
x

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,super_built_up_area,built_up_area,carpet_area,study room,servant room,furnishing_type,luxury_cat,floorNum_cat
0,flat,sector 36,3,2,2,New Property,1081.0,,650.0,0,0,unfurnished,low,low
1,flat,sector 89,2,2,2,New Property,,,1103.0,1,1,unfurnished,low,medium
2,flat,sohna road,2,2,1,New Property,,1000.0,585.0,0,0,unfurnished,low,high
3,flat,sector 92,3,4,3+,Relatively New,1995.0,1615.0,1476.0,0,1,semi_furnished,high,medium
4,flat,sector 102,2,2,1,Relatively New,632.0,,532.0,0,0,unfurnished,high,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3644,flat,sector 84,2,2,1,Relatively New,583.0,,483.0,0,0,unfurnished,medium,medium
3645,house,sector 109,5,5,3+,Relatively New,,6228.0,,1,1,unfurnished,high,low
3646,flat,sector 2,1,1,1,Moderately Old,735.0,,,0,0,semi_furnished,medium,medium
3647,house,sector 43,5,6,3,Moderately Old,,5490.0,,1,1,unfurnished,medium,low


In [193]:
columns = ['property_type', 'sector','bedRoom', 'bathroom', 'balcony',
       'agePossession', 'super_built_up_area', 'built_up_area', 'carpet_area',
       'study room', 'servant room', 'furnishing_type', 'luxury_cat',
       'floorNum_cat']

In [194]:
x.iloc[1]

property_type                  flat
sector                    sector 89
bedRoom                           2
bathroom                          2
balcony                           2
agePossession          New Property
super_built_up_area             NaN
built_up_area                   NaN
carpet_area                  1103.0
study room                        1
servant room                      1
furnishing_type         unfurnished
luxury_cat                      low
floorNum_cat                 medium
Name: 1, dtype: object

In [195]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property',3000, 2750,2500, 0, 0, 'unfurnished', 'low', 'low']]

data = pd.DataFrame(data,columns=columns)
data

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,super_built_up_area,built_up_area,carpet_area,study room,servant room,furnishing_type,luxury_cat,floorNum_cat
0,house,sector 102,4,3,3+,New Property,3000,2750,2500,0,0,unfurnished,low,low


In [196]:
data['property_type'] = data['property_type'].astype('category')
data['sector'] = data['sector'].astype('category')
data['balcony'] = data['balcony'].astype('category')
data['agePossession'] = data['agePossession'].astype('category')
data['furnishing_type'] = data['furnishing_type'].astype('category')

In [197]:
(final_model.predict(data))

array([1.3663626], dtype=float32)