In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

In [3]:
import h2o

In [4]:
# Read the CSV at ../Dataset/ProcessedFile.csv (path relative to the working
# directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../Dataset/ProcessedFile.csv")

In [5]:
# Target: the 'price' column. Predictors: every other column except
# 'localityName', which is dropped alongside the target.
y = df['price']
X = df.drop(columns=['price', 'localityName'])

In [6]:
# Encode propertyType as integer codes.
# The mapping reads from the untouched `df` column rather than from `X`
# itself, so re-running this cell gives the same result. Mapping X onto
# itself would turn the already-encoded integers into NaN on a second run.
# Any category that is not a key of the mapping becomes NaN (Series.map).
property_type_codes = {'Independent Floor': 1,
                       'Apartment': 2,
                       'Independent House': 3,
                       'Villa': 4}
X['propertyType'] = df['propertyType'].map(property_type_codes)

In [11]:
# One-hot encode the object-typed columns and pass int64/float64 columns
# through unchanged.
ohc = OneHotEncoder()
ct = make_column_transformer(
    (ohc, make_column_selector(dtype_include=object)),
    ("passthrough", make_column_selector(dtype_include=['int64', 'float64'])),
)

# ColumnTransformer returns a sparse matrix or a dense ndarray depending on
# the density of the stacked result, so only densify when needed. An
# unconditional .toarray() raises AttributeError on a dense result.
encoded = ct.fit_transform(X)
if hasattr(encoded, "toarray"):
    encoded = encoded.toarray()

# Keep X's index so the rows stay aligned with `y` (and with any later
# index-based concat of the split features and target).
X_transf = pd.DataFrame(encoded,
                        columns=ct.get_feature_names_out(),
                        index=X.index)

# 85/15 split, stratified on the encoded property type.
X_train, X_test, y_train, y_test = train_test_split(X_transf, y,
                                                    test_size=0.15,
                                                    random_state=2023,
                                                    stratify=X['propertyType'])

In [12]:
# Show the output column names produced by the fitted column transformer
# (prefixed with the name of the step that generated them).
print(ct.get_feature_names_out())

['onehotencoder__suburbName_Central Delhi'
 'onehotencoder__suburbName_East Delhi'
 'onehotencoder__suburbName_North Delhi'
 'onehotencoder__suburbName_North West Delhi'
 'onehotencoder__suburbName_Other' 'onehotencoder__suburbName_South Delhi'
 'onehotencoder__suburbName_South West Delhi'
 'onehotencoder__suburbName_West Delhi'
 'onehotencoder__companyName_A R Realtors'
 'onehotencoder__companyName_AB Estate'
 'onehotencoder__companyName_AMIT CHHABRA'
 'onehotencoder__companyName_AMPM Realtors'
 'onehotencoder__companyName_AVS Realtors'
 'onehotencoder__companyName_Abhishek yadav'
 'onehotencoder__companyName_Ahuja properties'
 'onehotencoder__companyName_Angel Property Services'
 'onehotencoder__companyName_Arjun Raj'
 'onehotencoder__companyName_Ashish Bansal'
 'onehotencoder__companyName_Ashish Talwar'
 'onehotencoder__companyName_Atul Kumar'
 'onehotencoder__companyName_B Kumar and Brothers'
 'onehotencoder__companyName_Baghla Estates'
 'onehotencoder__companyName_Bhagirathi Estat

In [44]:
# Put the target column back next to the features of each split, giving one
# table per split that holds both predictors and 'price'.
train = pd.concat(objs=[X_train, y_train], axis="columns")
test = pd.concat(objs=[X_test, y_test], axis="columns")

In [125]:
# Connect to an H2O instance, starting a local server when none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from C:\ProgramData\Anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\asus\AppData\Local\Temp\tmpzhi8srjv
  JVM stdout: C:\Users\asus\AppData\Local\Temp\tmpzhi8srjv\h2o_asus_started_from_python.out
  JVM stderr: C:\Users\asus\AppData\Local\Temp\tmpzhi8srjv\h2o_asus_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,"2 years, 9 months and 4 days !!!"
H2O_cluster_name:,H2O_from_python_asus_hbc9jo
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.755 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [126]:
# Upload both pandas splits to the H2O cluster under fixed frame keys.
train_h2o = h2o.H2OFrame(python_obj=train, destination_frame="train")
test_h2o = h2o.H2OFrame(python_obj=test, destination_frame="test")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [127]:
# Stack the train and test frames row-wise into a single H2OFrame and show
# its total row count.
houserenth2o = train_h2o.rbind(test_h2o)
houserenth2o.nrows

17890

In [128]:
# Predictor names: every column of the H2O frame except the target.
# Built by name rather than with a hard-coded positional slice, so the list
# stays correct when the number of encoded columns changes.
# Note: this rebinds `X`, which earlier held the pandas feature DataFrame.
X = [col for col in houserenth2o.col_names if col != 'price']

In [129]:
# Name of the response column for H2O training.
# Note: this rebinds `y`, which earlier held the pandas target Series.
y = 'price'

In [130]:
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.random_forest import H2ORandomForestEstimator

In [166]:
# Search space for the random-forest grid: 4 x 8 x 5 = 160 combinations.
rf_params1 = {
    "ntrees": [5, 10, 25, 50],
    "max_depth": [5, 7, 10, 12, 15, 17, 20, 25],
    "mtries": [3, 4, 6, 8, 10],
}

# Base estimator with a fixed seed, wrapped in a Cartesian grid search.
# NOTE(review): the grid id is fixed, and the recorded results list the same
# model twice with model ids above 160 — possibly from training into an
# already existing grid. Confirm on a fresh cluster.
rf_h2o = H2ORandomForestEstimator(seed=2023)
rf_grid1 = H2OGridSearch(
    model=rf_h2o,
    grid_id='rf_grid1',
    hyper_params=rf_params1,
)

In [167]:
# Fit every hyper-parameter combination on the training frame and score each
# model on the held-out frame.
# NOTE(review): the test frame is used as the validation frame here, so any
# model picked from the validation metrics has seen the test data during
# selection. Consider cross-validation or a separate validation split.
rf_grid1.train(
    x=X,
    y=y,
    training_frame=train_h2o,
    validation_frame=test_h2o,
    seed=2023,
)

drf Grid Build progress: |████████████████████████████████████████████████| 100%


In [168]:
# Retrieve the trained grid ordered by r2, best (largest) first.
rf_gridperf1 = rf_grid1.get_grid(sort_by="r2", decreasing=True)

In [183]:
# Display the sorted grid: one row per model with its hyper-parameters and r2.
rf_gridperf1

       max_depth mtries ntrees           model_ids                   r2
0             25      8     10  rf_grid1_model_212   0.8672915326425853
1             25      8     10  rf_grid1_model_212   0.8672915326425853
2             20      8     10  rf_grid1_model_163   0.8663632272759416
3             17      8     10  rf_grid1_model_162   0.8640530585840942
4             25      8     25  rf_grid1_model_252   0.8592365278204142
..  ..       ...    ...    ...                 ...                  ...
155            5      3      5    rf_grid1_model_1  0.26250531136529986
156            7      6      5   rf_grid1_model_12   0.2559013621991473
157            7      4      5    rf_grid1_model_7  0.19152446732092598
158            5      4      5    rf_grid1_model_6  0.16675553278991595
159            5      6      5   rf_grid1_model_11  0.16143305844731215

[160 rows x 6 columns]




In [170]:
# Take the first model of the sorted grid and display its details.
best_rf1 = rf_gridperf1.models[0]
best_rf1

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf_grid1_model_212


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,10.0,10.0,120596.0,25.0,25.0,25.0,150.0,2186.0,951.8




ModelMetricsRegression: drf
** Reported on train data. **

MSE: 2779891819.244304
RMSE: 52724.67941338576
MAE: 11397.728190672357
RMSLE: 0.43387822923237
Mean Residual Deviance: 2779891819.244304

ModelMetricsRegression: drf
** Reported on validation data. **

MSE: 496473156.2348923
RMSE: 22281.67759022853
MAE: 9192.516167012209
RMSLE: 0.4089145765051987
Mean Residual Deviance: 496473156.2348923

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
0,,2023-03-06 12:05:35,1.589 sec,0.0,,,,,,
1,,2023-03-06 12:05:35,1.618 sec,1.0,70930.770462,15514.385681,5031174000.0,58083.269152,15186.672222,3373666000.0
2,,2023-03-06 12:05:36,1.651 sec,2.0,75411.475452,14383.896378,5686891000.0,47318.791259,12465.001867,2239068000.0
3,,2023-03-06 12:05:36,1.679 sec,3.0,68124.744382,14549.195384,4640981000.0,32682.714592,12103.113297,1068160000.0
4,,2023-03-06 12:05:36,1.700 sec,4.0,59047.34619,14320.482978,3486589000.0,30269.361951,12191.02722,916234300.0
5,,2023-03-06 12:05:36,1.734 sec,5.0,57331.030373,13248.598813,3286847000.0,26867.11643,11088.875504,721841900.0
6,,2023-03-06 12:05:36,1.785 sec,6.0,55401.330899,12592.966015,3069307000.0,26668.863683,10475.535488,711228300.0
7,,2023-03-06 12:05:36,1.821 sec,7.0,49083.266443,12080.582042,2409167000.0,26053.685006,10150.009266,678794500.0
8,,2023-03-06 12:05:36,1.877 sec,8.0,48691.729422,11754.049445,2370885000.0,26097.507226,9720.19556,681079900.0
9,,2023-03-06 12:05:36,1.923 sec,9.0,52788.267005,11730.059515,2786601000.0,26683.354022,9608.91877,712001400.0



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,passthrough__bedrooms,144804600000000.0,1.0,0.257523
1,passthrough__size_sq_ft,113580100000000.0,0.784368,0.201993
2,onehotencoder__companyName_B Kumar and Brothers,95418540000000.0,0.658947,0.169694
3,onehotencoder__companyName_Other,48041140000000.0,0.331765,0.085437
4,passthrough__latitude,32472810000000.0,0.224253,0.05775
5,passthrough__propertyType,26789780000000.0,0.185006,0.047643
6,onehotencoder__suburbName_Central Delhi,23322460000000.0,0.161062,0.041477
7,passthrough__Aiims_dist_km,17469070000000.0,0.120639,0.031067
8,passthrough__closest_metro_station_km,7910158000000.0,0.054626,0.014068
9,passthrough__NDRLW_dist_km,6663181000000.0,0.046015,0.01185



See the whole table with table.as_data_frame()




In [141]:
# Variable importances of the selected random forest, as a pandas DataFrame.
varimp = best_rf1.varimp(use_pandas=True)
varimp

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,passthrough__bedrooms,1.446182e+14,1.000000,0.258897
1,passthrough__size_sq_ft,1.128921e+14,0.780622,0.202100
2,onehotencoder__companyName_B Kumar and Brothers,9.541782e+13,0.659791,0.170818
3,onehotencoder__companyName_Other,4.799028e+13,0.331841,0.085913
4,passthrough__latitude,3.222598e+13,0.222835,0.057691
...,...,...,...,...
109,onehotencoder__companyName_Viva Estates,0.000000e+00,0.000000,0.000000
110,onehotencoder__companyName_kartik dev,0.000000e+00,0.000000,0.000000
111,onehotencoder__companyName_kunal,0.000000e+00,0.000000,0.000000
112,onehotencoder__companyName_pransu Srivastava,0.000000e+00,0.000000,0.000000


In [175]:
# Score the held-out frame with the selected random forest; the result has a
# single 'predict' column.
best_rf1.predict(test_h2o)

drf prediction progress: |████████████████████████████████████████████████| 100%


predict
19033.6
240235.0
28121.3
19391.2
23834.4
18344.9
17939.0
20939.6
58142.7
29157.1




In [142]:
from h2o.automl import H2OAutoML

In [155]:
# AutoML run capped at 10 models, with MSE used both for early stopping and
# for ranking the leaderboard. Only a training frame is supplied.
aml = H2OAutoML(
    max_models=10,
    seed=2023,
    stopping_metric="mse",
    sort_metric="mse",
)
aml.train(
    x=X,
    y=y,
    training_frame=train_h2o,
)

AutoML progress: |
11:26:37.914: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [156]:
# Show the complete AutoML leaderboard (all rows, not just the default head).
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mse,mean_residual_deviance,rmse,mae,rmsle
GBM_1_AutoML_20230306_112637,1431360000.0,1431360000.0,37833.3,8501.21,0.316393
GBM_3_AutoML_20230306_112637,1714910000.0,1714910000.0,41411.5,8474.93,
DRF_1_AutoML_20230306_112637,1742450000.0,1742450000.0,41742.7,7534.34,0.272631
GBM_2_AutoML_20230306_112637,1778310000.0,1778310000.0,42170.0,8677.59,
GBM_4_AutoML_20230306_112637,1810790000.0,1810790000.0,42553.4,8358.0,
XRT_1_AutoML_20230306_112637,1822600000.0,1822600000.0,42692.0,7605.32,0.271002
StackedEnsemble_AllModels_AutoML_20230306_112637,2148860000.0,2148860000.0,46355.7,8441.11,
StackedEnsemble_BestOfFamily_AutoML_20230306_112637,2152990000.0,2152990000.0,46400.3,8400.62,0.299589
GBM_grid__1_AutoML_20230306_112637_model_1,2643720000.0,2643720000.0,51417.2,10112.8,
GBM_5_AutoML_20230306_112637,3146060000.0,3146060000.0,56089.7,9593.91,




In [157]:
# Getting the best model
m = aml.leader.r2
m

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_1_AutoML_20230306_112637


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,67.0,67.0,28291.0,6.0,6.0,6.0,13.0,49.0,28.820896




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 205044393.4791768
RMSE: 14319.371266894954
MAE: 6617.705479128304
RMSLE: 0.2910849959741285
Mean Residual Deviance: 205044393.4791768

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 1431357074.7511427
RMSE: 37833.279989331386
MAE: 8501.212163622666
RMSLE: 0.3163932208884228
Mean Residual Deviance: 1431357074.7511427

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,8501.201,764.7707,8667.24,8430.972,9118.134,9059.268,7230.3936
1,mean_residual_deviance,1431367170.0,504604416.0,1277806850.0,1659327360.0,1168286850.0,2179903230.0,871511620.0
2,mse,1431367170.0,504604416.0,1277806850.0,1659327360.0,1168286850.0,2179903230.0,871511620.0
3,r2,0.7961534,0.07296424,0.82359755,0.82630056,0.6879038,0.87906724,0.763898
4,residual_deviance,1431367170.0,504604416.0,1277806850.0,1659327360.0,1168286850.0,2179903230.0,871511620.0
5,rmse,37374.457,6568.5913,35746.42,40734.844,34180.21,46689.434,29521.375
6,rmsle,0.31580213,0.021590617,0.33438,0.3006136,0.34292656,0.30715987,0.29393056



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2023-03-06 11:27:03,6.468 sec,0.0,91941.351943,23538.856894,8453212000.0
1,,2023-03-06 11:27:03,6.573 sec,5.0,58992.897943,15925.093516,3480162000.0
2,,2023-03-06 11:27:03,6.661 sec,10.0,38586.762814,12136.630307,1488938000.0
3,,2023-03-06 11:27:04,6.745 sec,15.0,28474.131414,10012.741523,810776200.0
4,,2023-03-06 11:27:04,6.832 sec,20.0,22803.419289,8732.35217,519995900.0
5,,2023-03-06 11:27:04,6.921 sec,25.0,19637.100821,8192.801065,385615700.0
6,,2023-03-06 11:27:04,7.013 sec,30.0,17744.223386,7842.801957,314857500.0
7,,2023-03-06 11:27:04,7.105 sec,35.0,16921.436641,7651.193317,286335000.0
8,,2023-03-06 11:27:04,7.192 sec,40.0,16278.449587,7390.908673,264987900.0
9,,2023-03-06 11:27:04,7.281 sec,45.0,15775.322059,7215.582412,248860800.0



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,passthrough__bedrooms,241482300000000.0,1.0,0.392189
1,passthrough__size_sq_ft,174115600000000.0,0.721029,0.28278
2,passthrough__NDRLW_dist_km,71111330000000.0,0.294478,0.115491
3,onehotencoder__suburbName_Central Delhi,19403040000000.0,0.08035,0.031512
4,passthrough__latitude,17886960000000.0,0.074072,0.02905
5,onehotencoder__companyName_Other,17751670000000.0,0.073511,0.02883
6,passthrough__AP_dist_km,15173880000000.0,0.062836,0.024644
7,onehotencoder__companyName_B Kumar and Brothers,14696900000000.0,0.060861,0.023869
8,passthrough__longitude,9991594000000.0,0.041376,0.016227
9,passthrough__propertyType,5416201000000.0,0.022429,0.008796



See the whole table with table.as_data_frame()




In [181]:
# Score the held-out frame with the AutoML leader model and display the
# predictions.
preds = aml.leader.predict(test_h2o)
preds

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict
17495.6
59010.1
24866.3
13508.7
23700.8
13017.1
13887.4
20145.3
54326.1
30434.8




In [124]:
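# Uncomment to shut down the local H2O cluster once the analysis is finished.
# Left disabled so that re-running the notebook does not kill a cluster still in use.
#h2o.cluster().shutdown()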

H2O session _sid_aa1c closed.
