In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("../Dataset/ProcessedFile.csv")

In [3]:
df.head()

Unnamed: 0,size_sq_ft,propertyType,bedrooms,latitude,longitude,localityName,suburbName,companyName,closest_metro_station_km,AP_dist_km,Aiims_dist_km,NDRLW_dist_km,price
0,400,Independent Floor,1,28.64101,77.284386,Swasthya Vihar,East Delhi,Other,0.577495,21.741188,11.119239,6.227231,9000
1,1050,Apartment,2,28.594969,77.298668,mayur vihar phase 1,East Delhi,Other,0.417142,21.401856,9.419061,9.217502,20000
2,2250,Independent Floor,2,28.641806,77.293922,Swasthya Vihar,East Delhi,Other,0.125136,22.620365,11.829486,7.159184,28000
3,1350,Independent Floor,2,28.644363,77.293228,Krishna Nagar,East Delhi,Other,0.371709,22.681201,11.982708,7.097348,28000
4,450,Apartment,2,28.594736,77.31115,New Ashok Nagar,East Delhi,Other,1.08776,22.59281,10.571573,10.263271,12500


In [36]:
X = df.drop(['price','localityName'], axis=1)
y = df['price']

In [37]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

## With One Hot Encoding

In [40]:
ohc = OneHotEncoder()
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf = ct.fit_transform(X).toarray()
X_transf = pd.DataFrame(X_transf, columns=ct.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [41]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi'
 'onehotencoder__companyName_A R Realtors'
 'onehotencoder__companyName_AB Estate'
 'onehotencoder__companyName_AMIT CHHABRA'
 'onehotencoder__companyName_AMPM Realtors'
 'onehotencoder__companyName_AVS Realtors'
 'onehotencoder__companyName_Abhishek yadav'
 'onehotencoder__companyName_Ahuja properties'
 'onehotencoder__companyName_Angel Property Services'
 'onehotencoder__companyName_Arjun Raj'
 'onehotencoder__companyName_Ashish Bansal'
 'onehotencoder__companyName_Ashish Talwar'
 'onehotencoder__companyName_Atul Kumar'
 'onehotencoder__companyName_B Kumar and Brothers'
 'onehotencoder__companyName_Baghla Estates'
 'onehotencoder__companyName_Bhagirathi Estat

In [42]:
X_train['passthrough__propertyType'] = X_train['passthrough__propertyType'].astype(object)
X_test['passthrough__propertyType'] = X_test['passthrough__propertyType'].astype(object)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [30]:
# Hyperparameter tuning

In [49]:
cat_gbm = CatBoostRegressor(random_state=2023,)

params = {'learning_rate': [0.1, 0.15, 0.3, 0.35, 0.4, 0.5,0.6],
          'n_estimators': [25, 50, 75,85],
          'max_depth':[2, 3, 4, 5,6,7,8,9]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023,)
gcv = GridSearchCV(cat_gbm, param_grid=params,
                   cv=kfold, scoring='r2',n_jobs=-1)
gcv.fit(X_train, y_train)

0:	learn: 79495.4457991	total: 11.6ms	remaining: 861ms
1:	learn: 67151.6003888	total: 32.6ms	remaining: 1.19s
2:	learn: 59044.9587088	total: 41.3ms	remaining: 991ms
3:	learn: 49747.9220358	total: 48.9ms	remaining: 869ms
4:	learn: 47989.5254929	total: 52.1ms	remaining: 729ms
5:	learn: 44577.7543774	total: 59.2ms	remaining: 681ms
6:	learn: 41875.1150870	total: 66.9ms	remaining: 650ms
7:	learn: 39160.1445881	total: 72.1ms	remaining: 604ms
8:	learn: 34913.1950736	total: 79.4ms	remaining: 582ms
9:	learn: 33428.6877891	total: 84.1ms	remaining: 547ms
10:	learn: 31928.3881090	total: 90.8ms	remaining: 528ms
11:	learn: 30471.9986063	total: 97.1ms	remaining: 510ms
12:	learn: 29485.0966942	total: 103ms	remaining: 492ms
13:	learn: 26589.4356266	total: 110ms	remaining: 481ms
14:	learn: 25987.6607967	total: 117ms	remaining: 467ms
15:	learn: 25458.5831002	total: 123ms	remaining: 454ms
16:	learn: 25159.8758036	total: 131ms	remaining: 447ms
17:	learn: 23634.3326127	total: 139ms	remaining: 440ms
18:	lear

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=<catboost.core.CatBoostRegressor object at 0x0000025675FDEEE0>,
             n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.15, 0.3, 0.35, 0.4, 0.5, 0.6],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [25, 50, 75, 85]},
             scoring='r2')

In [50]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.4, 'max_depth': 7, 'n_estimators': 75}
0.723126960833764


In [53]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.8721512070469826


## Without One Hot Encoding

In [54]:
from sklearn.metrics import r2_score

In [71]:
categorical=['propertyType','suburbName','companyName']
X = df.drop(['localityName','price'],axis=1)
#X = df.iloc[:,0:12]
y = df['price']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType']) 


In [74]:
cat_gbm = CatBoostRegressor(random_state=2023,)

params = {'learning_rate': [0.1, 0.15, 0.3, 0.35, 0.4, 0.5,0.6],
          'n_estimators': [25, 50, 75,85],
          'max_depth':[2, 3, 4, 5,6,7,8,9]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023,)
gcv = GridSearchCV(cat_gbm, param_grid=params,
                   cv=kfold, scoring='r2',n_jobs=-1)
gcv.fit(X_train, y_train,verbose=False,cat_features=categorical)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=<catboost.core.CatBoostRegressor object at 0x000002567D9CC4F0>,
             n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.15, 0.3, 0.35, 0.4, 0.5, 0.6],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [25, 50, 75, 85]},
             scoring='r2')

In [75]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.4, 'max_depth': 9, 'n_estimators': 75}
0.8217416958942568


In [76]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.6947802463243997


## CatBoost with onehotencoding, without companyName

In [4]:
X = df.drop(['price','localityName','companyName'], axis=1)
y = df['price']

In [5]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [7]:
ohc = OneHotEncoder()
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf = ct.fit_transform(X)
X_transf = pd.DataFrame(X_transf, columns=ct.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [8]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi' 'passthrough__size_sq_ft'
 'passthrough__propertyType' 'passthrough__bedrooms'
 'passthrough__latitude' 'passthrough__longitude'
 'passthrough__closest_metro_station_km' 'passthrough__AP_dist_km'
 'passthrough__Aiims_dist_km' 'passthrough__NDRLW_dist_km']


In [9]:
X_train['passthrough__propertyType'] = X_train['passthrough__propertyType'].astype(object)
X_test['passthrough__propertyType'] = X_test['passthrough__propertyType'].astype(object)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [10]:
cat_gbm = CatBoostRegressor(random_state=2023,)

params = {'learning_rate': [0.1, 0.15, 0.3, 0.35, 0.4, 0.5,0.6],
          'n_estimators': [25, 50, 75,85],
          'max_depth':[2, 3, 4, 5,6,7,8,9]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023,)
gcv = GridSearchCV(cat_gbm, param_grid=params,
                   cv=kfold, scoring='r2',n_jobs=-1)
gcv.fit(X_train, y_train)

0:	learn: 77078.9848761	total: 172ms	remaining: 8.44s
1:	learn: 66899.3356349	total: 197ms	remaining: 4.73s
2:	learn: 62669.9636864	total: 225ms	remaining: 3.52s
3:	learn: 52593.9675629	total: 257ms	remaining: 2.95s
4:	learn: 46821.1484475	total: 296ms	remaining: 2.67s
5:	learn: 44058.9526204	total: 318ms	remaining: 2.33s
6:	learn: 41629.1013041	total: 347ms	remaining: 2.13s
7:	learn: 36470.3766663	total: 385ms	remaining: 2.02s
8:	learn: 35526.8866064	total: 404ms	remaining: 1.84s
9:	learn: 34191.6211968	total: 414ms	remaining: 1.66s
10:	learn: 32827.8462991	total: 441ms	remaining: 1.56s
11:	learn: 29923.5572967	total: 468ms	remaining: 1.48s
12:	learn: 29431.4445585	total: 472ms	remaining: 1.34s
13:	learn: 28277.0857741	total: 520ms	remaining: 1.34s
14:	learn: 27484.7901397	total: 600ms	remaining: 1.4s
15:	learn: 26889.8416295	total: 654ms	remaining: 1.39s
16:	learn: 24896.3034871	total: 688ms	remaining: 1.34s
17:	learn: 24219.0817414	total: 738ms	remaining: 1.31s
18:	learn: 23884.2039

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=<catboost.core.CatBoostRegressor object at 0x00000253DF326970>,
             n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.15, 0.3, 0.35, 0.4, 0.5, 0.6],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                         'n_estimators': [25, 50, 75, 85]},
             scoring='r2')

In [11]:
print(gcv.best_params_)
print(gcv.best_score_)

{'learning_rate': 0.4, 'max_depth': 9, 'n_estimators': 50}
0.7405551991660142


In [12]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.7215625066967828
