In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("../Dataset/ProcessedFile.csv")

In [3]:
df.head()

Unnamed: 0,size_sq_ft,propertyType,bedrooms,latitude,longitude,localityName,suburbName,companyName,closest_metro_station_km,AP_dist_km,Aiims_dist_km,NDRLW_dist_km,price
0,400,Independent Floor,1,28.64101,77.284386,Swasthya Vihar,East Delhi,Other,0.577495,21.741188,11.119239,6.227231,9000
1,1050,Apartment,2,28.594969,77.298668,mayur vihar phase 1,East Delhi,Other,0.417142,21.401856,9.419061,9.217502,20000
2,2250,Independent Floor,2,28.641806,77.293922,Swasthya Vihar,East Delhi,Other,0.125136,22.620365,11.829486,7.159184,28000
3,1350,Independent Floor,2,28.644363,77.293228,Krishna Nagar,East Delhi,Other,0.371709,22.681201,11.982708,7.097348,28000
4,450,Apartment,2,28.594736,77.31115,New Ashok Nagar,East Delhi,Other,1.08776,22.59281,10.571573,10.263271,12500


In [4]:
df=pd.read_csv("Dataset/ProcessedFile.csv")

In [5]:
df.head()

Unnamed: 0,size_sq_ft,propertyType,bedrooms,latitude,longitude,localityName,suburbName,companyName,closest_metro_station_km,AP_dist_km,Aiims_dist_km,NDRLW_dist_km,price
0,400,Independent Floor,1,28.64101,77.284386,Swasthya Vihar,East Delhi,Other,0.577495,21.741188,11.119239,6.227231,9000
1,1050,Apartment,2,28.594969,77.298668,mayur vihar phase 1,East Delhi,Other,0.417142,21.401856,9.419061,9.217502,20000
2,2250,Independent Floor,2,28.641806,77.293922,Swasthya Vihar,East Delhi,Other,0.125136,22.620365,11.829486,7.159184,28000
3,1350,Independent Floor,2,28.644363,77.293228,Krishna Nagar,East Delhi,Other,0.371709,22.681201,11.982708,7.097348,28000
4,450,Apartment,2,28.594736,77.31115,New Ashok Nagar,East Delhi,Other,1.08776,22.59281,10.571573,10.263271,12500


In [6]:
X = df.drop(['price','localityName'], axis=1)
y = df['price']

In [7]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType']) 

In [9]:
ohc = OneHotEncoder(handle_unknown='ignore')
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf_trn = ct.fit_transform(X_train).toarray()
X_transf_trn = pd.DataFrame(X_transf_trn, columns=ct.get_feature_names_out())

In [10]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi'
 'onehotencoder__companyName_A R Realtors'
 'onehotencoder__companyName_AB Estate'
 'onehotencoder__companyName_AMIT CHHABRA'
 'onehotencoder__companyName_AMPM Realtors'
 'onehotencoder__companyName_AVS Realtors'
 'onehotencoder__companyName_Abhishek yadav'
 'onehotencoder__companyName_Ahuja properties'
 'onehotencoder__companyName_Angel Property Services'
 'onehotencoder__companyName_Arjun Raj'
 'onehotencoder__companyName_Ashish Bansal'
 'onehotencoder__companyName_Ashish Talwar'
 'onehotencoder__companyName_Atul Kumar'
 'onehotencoder__companyName_B Kumar and Brothers'
 'onehotencoder__companyName_Baghla Estates'
 'onehotencoder__companyName_Bhagirathi Estat

In [11]:
cat_gbm = CatBoostRegressor(random_state=2023,)
dtr = DecisionTreeRegressor(random_state=2023)
rf=RandomForestRegressor(random_state=2023)
xgbm = XGBRegressor(random_state=2023,n_jobs=-1)
stack = StackingRegressor(estimators=[('CAT',cat_gbm),('TREE',dtr),('XGBM',xgbm)],
                          final_estimator=rf, n_jobs=-1,
                          passthrough=True)

In [12]:
print(stack.get_params())

{'cv': None, 'estimators': [('CAT', <catboost.core.CatBoostRegressor object at 0x000002889D59EA60>), ('TREE', DecisionTreeRegressor(random_state=2023)), ('XGBM', XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=-1, num_parallel_tree=None,
             predictor=None, random_state=2023, ...))], 'final_estimator__bootstrap': True, 'final_estimator__ccp_alpha': 0.0, 'final_estimator__criteri

In [13]:
params = {'CAT__learning_rate': [0.1, 0.3],
          'CAT__n_estimators': [25, 50],
          'CAT__max_depth':[2, 3, ],
          'TREE__max_depth':[2,4],
          'TREE__min_samples_split':[2,5],
          'TREE__min_samples_leaf':[1,4],
           'XGBM__learning_rate':[0.5],
          'XGBM__n_estimators':[20,30],
          'XGBM__max_depth': [3,5],
          'final_estimator__max_features':[3,5],
        'final_estimator__n_estimators':[24,50]
          }
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
gcv = GridSearchCV(stack, param_grid=params, cv=kfold, verbose=1,
                   n_jobs=-1, scoring='r2')

In [14]:
gcv.fit(X_transf_trn, y_train)

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=StackingRegressor(estimators=[('CAT',
                                                      <catboost.core.CatBoostRegressor object at 0x000002889D59EA60>),
                                                     ('TREE',
                                                      DecisionTreeRegressor(random_state=2023)),
                                                     ('XGBM',
                                                      XGBRegressor(base_score=None,
                                                                   booster=None,
                                                                   callbacks=None,
                                                                   colsample_bylevel=None,
                                                                   colsample_bynode=None,
                                                                   colsamp...
             p

In [15]:
print(gcv.best_score_)


best_model = gcv.best_estimator_

X_transf_tst = ct.transform(X_test).toarray()

y_pred = best_model.predict(X_transf_tst)
y_pred

0.7749483317458736


array([ 15416.66666667, 174560.875     ,  29916.66666667, ...,
        15541.66666667,  42187.5       ,  16866.66666667])

In [16]:
print(gcv.best_params_)
print(gcv.best_score_)

{'CAT__learning_rate': 0.3, 'CAT__max_depth': 2, 'CAT__n_estimators': 50, 'TREE__max_depth': 2, 'TREE__min_samples_leaf': 1, 'TREE__min_samples_split': 2, 'XGBM__learning_rate': 0.5, 'XGBM__max_depth': 5, 'XGBM__n_estimators': 30, 'final_estimator__max_features': 5, 'final_estimator__n_estimators': 24}
0.7749483317458736


In [17]:
best_model = gcv.best_estimator_
X_transf_tst = ct.transform(X_test).toarray()
y_pred = best_model.predict(X_transf_tst)
y_pred

array([ 15416.66666667, 174560.875     ,  29916.66666667, ...,
        15541.66666667,  42187.5       ,  16866.66666667])

In [18]:
print(r2_score(y_test, y_pred))

0.8549202867207247
