## DecisionTree With companyName

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("../Dataset/ProcessedFile.csv")

In [3]:
df.head()

Unnamed: 0,size_sq_ft,propertyType,bedrooms,latitude,longitude,localityName,suburbName,companyName,closest_metro_station_km,AP_dist_km,Aiims_dist_km,NDRLW_dist_km,price
0,400,Independent Floor,1,28.64101,77.284386,Swasthya Vihar,East Delhi,Other,0.577495,21.741188,11.119239,6.227231,9000
1,1050,Apartment,2,28.594969,77.298668,mayur vihar phase 1,East Delhi,Other,0.417142,21.401856,9.419061,9.217502,20000
2,2250,Independent Floor,2,28.641806,77.293922,Swasthya Vihar,East Delhi,Other,0.125136,22.620365,11.829486,7.159184,28000
3,1350,Independent Floor,2,28.644363,77.293228,Krishna Nagar,East Delhi,Other,0.371709,22.681201,11.982708,7.097348,28000
4,450,Apartment,2,28.594736,77.31115,New Ashok Nagar,East Delhi,Other,1.08776,22.59281,10.571573,10.263271,12500


In [4]:
X = df.drop(['price','localityName'], axis=1)
y = df['price']

In [5]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [6]:
ohc = OneHotEncoder()
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf = ct.fit_transform(X).toarray()
X_transf = pd.DataFrame(X_transf, columns=ct.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [7]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi'
 'onehotencoder__companyName_A R Realtors'
 'onehotencoder__companyName_AB Estate'
 'onehotencoder__companyName_AMIT CHHABRA'
 'onehotencoder__companyName_AMPM Realtors'
 'onehotencoder__companyName_AVS Realtors'
 'onehotencoder__companyName_Abhishek yadav'
 'onehotencoder__companyName_Ahuja properties'
 'onehotencoder__companyName_Angel Property Services'
 'onehotencoder__companyName_Arjun Raj'
 'onehotencoder__companyName_Ashish Bansal'
 'onehotencoder__companyName_Ashish Talwar'
 'onehotencoder__companyName_Atul Kumar'
 'onehotencoder__companyName_B Kumar and Brothers'
 'onehotencoder__companyName_Baghla Estates'
 'onehotencoder__companyName_Bhagirathi Estat

In [8]:
X_train['passthrough__propertyType'] = X_train['passthrough__propertyType'].astype(object)
X_test['passthrough__propertyType'] = X_test['passthrough__propertyType'].astype(object)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [9]:
# Hyperparameter Tuning

In [10]:
dtr = DecisionTreeRegressor(random_state=2023)
params = {'max_depth':[None,2,3,4,5,6,7,8],
          'min_samples_split':np.arange(2, 21),
          'min_samples_leaf':np.arange(2, 21)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
gcv = GridSearchCV(dtr, param_grid=params, cv=kfold, scoring='r2',n_jobs=-1)
gcv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=DecisionTreeRegressor(random_state=2023), n_jobs=-1,
             param_grid={'max_depth': [None, 2, 3, 4, 5, 6, 7, 8],
                         'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20]),
                         'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20])},
             scoring='r2')

In [11]:
print(gcv.best_params_)
print(gcv.best_score_)

{'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 8}
0.585756416029843


In [12]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.708591474272198


## Without companyName

In [13]:
X = df.drop(['price','localityName','companyName'], axis=1)
y = df['price']

In [14]:
X['propertyType'] = X['propertyType'].map({'Independent Floor':1,
                                           'Apartment':2,
                                           'Independent House':3,
                                           'Villa':4})

In [22]:
ohc = OneHotEncoder()
ct = make_column_transformer((ohc,
       make_column_selector(dtype_include=object)),
       ("passthrough",
        make_column_selector(dtype_include=['int64','float64'])))
X_transf = ct.fit_transform(X)
X_transf = pd.DataFrame(X_transf, columns=ct.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,test_size = 0.15, 
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [23]:
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi' 'passthrough__size_sq_ft'
 'passthrough__propertyType' 'passthrough__bedrooms'
 'passthrough__latitude' 'passthrough__longitude'
 'passthrough__closest_metro_station_km' 'passthrough__AP_dist_km'
 'passthrough__Aiims_dist_km' 'passthrough__NDRLW_dist_km']


In [24]:
X_train['passthrough__propertyType'] = X_train['passthrough__propertyType'].astype(object)
X_test['passthrough__propertyType'] = X_test['passthrough__propertyType'].astype(object)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [25]:
## Hyperparameter tuninig

In [26]:
dtr = DecisionTreeRegressor(random_state=2023)
params = {'max_depth':[None,2,3,4,5,6,7,8],
          'min_samples_split':np.arange(2, 21),
          'min_samples_leaf':np.arange(2, 21)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
gcv = GridSearchCV(dtr, param_grid=params, cv=kfold, scoring='r2',n_jobs=-1)
gcv.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2023, shuffle=True),
             estimator=DecisionTreeRegressor(random_state=2023), n_jobs=-1,
             param_grid={'max_depth': [None, 2, 3, 4, 5, 6, 7, 8],
                         'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20]),
                         'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20])},
             scoring='r2')

In [27]:
print(gcv.best_params_)
print(gcv.best_score_)

{'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2}
0.6041349951762722


In [29]:
best_model = gcv.best_estimator_
y_pred = best_model.predict(X_test)
print(r2_score(y_test, y_pred))

0.6688981321552083
