In [1]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
h2o.init()


# import the cars dataset:
# this dataset is used to classify whether or not a car is economical based on
# the car's displacement, power, weight, and acceleration, and the year it was made
cars = h2o.import_file("Lending_TRAINING_DATA.csv")

# convert response column to a factor
cars["APPLICATION.STATUS"] = cars["APPLICATION.STATUS"].asfactor()

# set the predictor names and the response column name
predictors = ['APPLICATION.ID', 'DSA.ID', 'DEALER.ID', 'APP.DATE', 'TIME.STAMP',
       'APPLICATION.STATUS', 'QUEUE.ID', 'CURRENT.STAGE', 'MARITAL.STATUS',
       'GENDER', 'AGE', 'EDUCATION', 'RESIDENCE.TYPE', 'CITY', 'STATE',
       'ZIP.CODE', 'EMPLOY.CONSTITUTION', 'NET.TAKE.HOME.SALARY', 'PAN.STATUS',
       'CIBIL.SCORE', 'APPLICATION.SCORE', 'RESIDENTIAL.ADDRESS.SCORE',
       'OFFICE.ADDRESS.SCORE', 'NAME.SCORE', 'APPROVED.AMOUNT', 'ASSET.MAKE',
       'ASSET.CTG', 'APPLIED.AMOUNT', 'LOAN.TENOR', 'LOAN.TYPE',
       'DEDUPE.REF.ID1', 'CAR.CATEGORY', 'OWN.HOUSE.TYPE',
       'HOUSE.SURROGATE.DOCUMENT.TYPE', 'TRADER.YEAR.IN.BUSINESS',
       'TRADER.BUSINESS.PROOF', 'CREDIT.CARD.NUMBER', 'CREDIT.CARDS.CATEGORY',
       'PRIMARY.ASSET.CTG', 'PRIMARY.ASSET.MAKE', 'PRIMARY.ASSET.MODELNO',
       'VOTER_ID', 'DRIVING_LICENSE', 'AADHAAR', 'PAN', 'BANK_PASSBOOK']
response = "APPLICATION.STATUS"

# split into train and validation sets
train, valid = cars.split_frame(ratios = [.8], seed = 1234)

# try using the binomial_double_trees (boolean parameter):
# Initialize and train a DRF
cars_drf = H2ORandomForestEstimator(binomial_double_trees = False, seed = 1234)
cars_drf.train(x = predictors, y = response, training_frame = train, validation_frame = valid)
cars_drf.confusion_matrix()
# or specify the validation frame
cars_drf.confusion_matrix(valid=True)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 13+33, mixed mode, sharing)
  Starting server from C:\Users\Shruti\anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Shruti\AppData\Local\Temp\tmpctob7kk5
  JVM stdout: C:\Users\Shruti\AppData\Local\Temp\tmpctob7kk5\h2o_Shruti_started_from_python.out
  JVM stderr: C:\Users\Shruti\AppData\Local\Temp\tmpctob7kk5\h2o_Shruti_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,8 months and 17 days !!!
H2O_cluster_name:,H2O_from_python_Shruti_le4bdr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1002 Mb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


Parse progress: |█████████████████████████████████████████████████████████| 100%




drf Model Build progress: |███████████████████████████████████████████████| 100%

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5258653457462787: 


Unnamed: 0,Unnamed: 1,Approved,Declined,Error,Rate
0,Approved,891.0,1.0,0.0011,(1.0/892.0)
1,Declined,11.0,637.0,0.017,(11.0/648.0)
2,Total,902.0,638.0,0.0078,(12.0/1540.0)




In [3]:

# retrieve the AUC for both the training and validation data:
cars_drf.auc(train=True, valid=True, xval=False)

{'train': 0.9993238973640542, 'valid': 0.9985605934783812}

In [4]:
cars_drf.auc()



0.9993238973640542

In [5]:
model_path = h2o.save_model(model=cars_drf, path="E:\\DataScience\\assignment-dataset\\h2o", force=True)
print(model_path)

E:\DataScience\assignment-dataset\h2o\DRF_model_python_1642262154554_1


In [6]:
saved_model = h2o.load_model(model_path)
saved_model

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1642262154554_1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,111054.0,1.0,20.0,12.32,2.0,194.0,83.6




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.02796386838711827
RMSE: 0.16722400661124667
LogLoss: 0.13955740676931863
Mean Per-Class Error: 0.012045207836418825
AUC: 0.9993238973640542
AUCPR: 0.9992218450326109
Gini: 0.9986477947281085

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4947211237171747: 


Unnamed: 0,Unnamed: 1,Approved,Declined,Error,Rate
0,Approved,3517.0,28.0,0.0079,(28.0/3545.0)
1,Declined,44.0,2630.0,0.0165,(44.0/2674.0)
2,Total,3561.0,2658.0,0.0116,(72.0/6219.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.494721,0.986497,195.0
1,max f2,0.411391,0.989665,216.0
2,max f0point5,0.591953,0.991942,175.0
3,max accuracy,0.513816,0.988423,191.0
4,max precision,0.999736,1.0,0.0
5,max recall,0.058587,1.0,359.0
6,max specificity,0.999736,1.0,0.0
7,max absolute_mcc,0.513816,0.976401,191.0
8,max min_per_class_accuracy,0.464625,0.987285,202.0
9,max mean_per_class_accuracy,0.467553,0.987955,201.0



Gains/Lift Table: Avg response rate: 43.00 %, avg score: 42.72 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01013,0.972775,2.325729,2.325729,1.0,0.980752,1.0,0.980752,0.02356,0.02356,132.572924,132.572924,0.02356
1,2,0.0201,0.964552,2.325729,2.325729,1.0,0.968631,1.0,0.97474,0.023186,0.046746,132.572924,132.572924,0.046746
2,3,0.030069,0.959195,2.325729,2.325729,1.0,0.961634,1.0,0.970394,0.023186,0.069933,132.572924,132.572924,0.069933
3,4,0.040039,0.953293,2.325729,2.325729,1.0,0.955906,1.0,0.966787,0.023186,0.093119,132.572924,132.572924,0.093119
4,5,0.050008,0.947106,2.325729,2.325729,1.0,0.950387,1.0,0.963518,0.023186,0.116305,132.572924,132.572924,0.116305
5,6,0.100016,0.926789,2.325729,2.325729,1.0,0.937061,1.0,0.950289,0.116305,0.23261,132.572924,132.572924,0.23261
6,7,0.150024,0.908124,2.325729,2.325729,1.0,0.917002,1.0,0.939193,0.116305,0.348915,132.572924,132.572924,0.348915
7,8,0.200032,0.887982,2.325729,2.325729,1.0,0.898355,1.0,0.928984,0.116305,0.465221,132.572924,132.572924,0.465221
8,9,0.300048,0.836778,2.325729,2.325729,1.0,0.864855,1.0,0.907608,0.23261,0.697831,132.572924,132.572924,0.697831
9,10,0.400064,0.687084,2.325729,2.325729,1.0,0.787072,1.0,0.877474,0.23261,0.930441,132.572924,132.572924,0.930441




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.02710301789129399
RMSE: 0.16462994226839173
LogLoss: 0.14116836352858259
Mean Per-Class Error: 0.009048192437579594
AUC: 0.9985605934783812
AUCPR: 0.9988848308908762
Gini: 0.9971211869567624

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5258653457462787: 


Unnamed: 0,Unnamed: 1,Approved,Declined,Error,Rate
0,Approved,891.0,1.0,0.0011,(1.0/892.0)
1,Declined,11.0,637.0,0.017,(11.0/648.0)
2,Total,902.0,638.0,0.0078,(12.0/1540.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.525865,0.990669,188.0
1,max f2,0.453923,0.991685,199.0
2,max f0point5,0.525865,0.995313,188.0
3,max accuracy,0.525865,0.992208,188.0
4,max precision,0.972386,1.0,0.0
5,max recall,0.020544,1.0,388.0
6,max specificity,0.972386,1.0,0.0
7,max absolute_mcc,0.525865,0.984068,188.0
8,max min_per_class_accuracy,0.469781,0.988789,196.0
9,max mean_per_class_accuracy,0.525865,0.990952,188.0



Gains/Lift Table: Avg response rate: 42.08 %, avg score: 41.89 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.01039,0.951016,2.376543,2.376543,1.0,0.956782,1.0,0.956782,0.024691,0.024691,137.654321,137.654321,0.024691
1,2,0.02013,0.945988,2.376543,2.376543,1.0,0.94843,1.0,0.952741,0.023148,0.04784,137.654321,137.654321,0.04784
2,3,0.030519,0.942617,2.376543,2.376543,1.0,0.944528,1.0,0.949945,0.024691,0.072531,137.654321,137.654321,0.072531
3,4,0.04026,0.937933,2.376543,2.376543,1.0,0.940152,1.0,0.947576,0.023148,0.095679,137.654321,137.654321,0.095679
4,5,0.05,0.934387,2.376543,2.376543,1.0,0.935648,1.0,0.945252,0.023148,0.118827,137.654321,137.654321,0.118827
5,6,0.1,0.914151,2.376543,2.376543,1.0,0.923609,1.0,0.93443,0.118827,0.237654,137.654321,137.654321,0.237654
6,7,0.15,0.902066,2.376543,2.376543,1.0,0.908435,1.0,0.925765,0.118827,0.356481,137.654321,137.654321,0.356481
7,8,0.2,0.887619,2.376543,2.376543,1.0,0.895718,1.0,0.918254,0.118827,0.475309,137.654321,137.654321,0.475309
8,9,0.300649,0.835077,2.376543,2.376543,1.0,0.864653,1.0,0.900309,0.239198,0.714506,137.654321,137.654321,0.714506
9,10,0.4,0.641595,2.376543,2.376543,1.0,0.774453,1.0,0.86905,0.236111,0.950617,137.654321,137.654321,0.950617




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2022-01-15 21:26:18,0.240 sec,0.0,,,,,,,,,,,,
1,,2022-01-15 21:26:19,0.840 sec,1.0,0.200221,0.598747,0.976784,0.965008,2.2727,0.047851,0.210652,0.729823,0.971915,0.957667,2.31285,0.053896
2,,2022-01-15 21:26:19,1.281 sec,2.0,0.27306,0.924901,0.952565,0.924854,2.207792,0.09201,0.230295,0.203208,0.986791,0.98475,2.376543,0.049351
3,,2022-01-15 21:26:19,1.496 sec,3.0,0.253166,0.596836,0.965417,0.947883,2.258641,0.078824,0.208284,0.184844,0.992291,0.99058,2.376543,0.031169
4,,2022-01-15 21:26:19,1.611 sec,4.0,0.26142,0.442988,0.965622,0.944227,2.243403,0.082648,0.221968,0.192123,0.993389,0.991675,2.376543,0.029221
5,,2022-01-15 21:26:20,1.840 sec,5.0,0.232804,0.357439,0.977354,0.963484,2.270972,0.06158,0.193653,0.160758,0.995703,0.995276,2.376543,0.021429
6,,2022-01-15 21:26:20,2.074 sec,6.0,0.234826,0.363632,0.977546,0.968291,2.287066,0.062511,0.198046,0.163839,0.995029,0.994564,2.376543,0.024026
7,,2022-01-15 21:26:20,2.303 sec,7.0,0.226109,0.297295,0.982238,0.975369,2.298042,0.05471,0.190563,0.156011,0.996388,0.996214,2.376543,0.016883
8,,2022-01-15 21:26:20,2.569 sec,8.0,0.219978,0.289734,0.984068,0.976409,2.290491,0.049058,0.184951,0.150939,0.996631,0.996617,2.376543,0.016883
9,,2022-01-15 21:26:21,2.774 sec,9.0,0.230568,0.291841,0.983734,0.975403,2.284493,0.048406,0.195511,0.173866,0.996946,0.996959,2.376543,0.013636



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,CURRENT.STAGE,12817.174805,1.0,0.277856
1,CIBIL.SCORE,11897.232422,0.928226,0.257913
2,APPROVED.AMOUNT,11690.767578,0.912117,0.253437
3,DSA.ID,3511.505127,0.273969,0.076124
4,CITY,1454.460815,0.113477,0.03153
5,QUEUE.ID,1326.90271,0.103525,0.028765
6,APPLICATION.SCORE,441.451111,0.034442,0.00957
7,RESIDENCE.TYPE,324.916687,0.02535,0.007044
8,PRIMARY.ASSET.MAKE,310.486237,0.024224,0.006731
9,HOUSE.SURROGATE.DOCUMENT.TYPE,242.209534,0.018897,0.005251



See the whole table with table.as_data_frame()




In [6]:
cars1 = h2o.import_file(path="Lending_TEST_DATA.csv")

cars1

Parse progress: |█████████████████████████████████████████████████████████| 100%


APPLICATION.ID,DSA.ID,DEALER.ID,APP.DATE,TIME.STAMP,APPLICATION.STATUS,QUEUE.ID,CURRENT.STAGE,MARITAL.STATUS,GENDER,AGE,EDUCATION,RESIDENCE.TYPE,CITY,STATE,ZIP.CODE,EMPLOY.CONSTITUTION,NET.TAKE.HOME.SALARY,PAN.STATUS,CIBIL.SCORE,APPLICATION.SCORE,RESIDENTIAL.ADDRESS.SCORE,OFFICE.ADDRESS.SCORE,NAME.SCORE,APPROVED.AMOUNT,ASSET.MAKE,ASSET.CTG,APPLIED.AMOUNT,LOAN.TENOR,LOAN.TYPE,DEDUPE.REF.ID1,CAR.CATEGORY,OWN.HOUSE.TYPE,HOUSE.SURROGATE.DOCUMENT.TYPE,TRADER.YEAR.IN.BUSINESS,TRADER.BUSINESS.PROOF,CREDIT.CARD.NUMBER,CREDIT.CARDS.CATEGORY,PRIMARY.ASSET.CTG,PRIMARY.ASSET.MAKE,PRIMARY.ASSET.MODELNO,VOTER_ID,DRIVING_LICENSE,AADHAAR,PAN,BANK_PASSBOOK
26009000000.0,AR69369,26009,2017-02-21 00:00:00,1970-01-01 22:46:47,,Straight Through Process,LOS_DISB,Married,Male,29,UNDER GRADUATE,RENTED-BUNGLOW,SURAT,GUJARAT,395006,SELF-EMPLOYED,,Pan Not Submitted,776,65,0.0,0.0,,20000.0,LLOYD,REFRIGERATORS,20000,10,Consumer Durables,,,,,,,,,REFRIGERATORS,LLOYD,LFR330SS 330L,F,F,T,F,F
27207000000.0,SB44646,27207,2017-02-21 00:00:00,1970-01-01 22:29:52,,Straight Through Process,LOS_DISB,Married,Male,48,UNDER GRADUATE,OWNED-CHAWL,KOLKATTA,WEST BENGAL,700066,SELF-EMPLOYED,,Pan Not Submitted,796,60,76.47,88.03,,17400.0,SAMSUNG,REF - DC,17400,12,Consumer Durables,,,Self Owned,Utility Bill,,,,,REFRIGERATORS,WHIRLPOOL,195 MP CLS 4S WINE,F,F,T,F,T
27233000000.0,AS01544,27233,2017-02-21 00:00:00,1970-01-01 22:27:32,,Straight Through Process,DCLN,Married,Male,33,OTHERS,OWNED-BUNGLOW,PUNE,MAHARASHTRA,411048,SELF-EMPLOYED,,Pan Not Submitted,0,38,87.8,64.52,,,,,32000,12,Consumer Durables,,,Self Owned,Utility Bill,,,,,PANEL - LED,PHILIPS,39PFL3850,F,F,T,F,F
26009000000.0,AR69369,26009,2017-02-21 00:00:00,1970-01-01 22:21:29,,Straight Through Process,LOS_DISB,Married,Male,31,UNDER GRADUATE,OWNED-BUNGLOW,SURAT,GUJARAT,394107,SALARIED,,ERROR,817,73,84.0,0.0,,35300.0,LLOYD,AIR CONDITIONER,35300,10,Consumer Durables,,,Parent Owned,Utility Bill,,,,,AIR CONDITIONER,LLOYD,LS24A5LX 5 STAR 2.0 TON,F,F,F,T,F
26009000000.0,AR69369,26009,2017-02-21 00:00:00,1970-01-01 22:04:49,,Straight Through Process,INV_GNR,Single,Male,24,UNDER GRADUATE,RENTED-FLAT,SURAT,GUJARAT,394107,SALARIED,,ERROR,804,115,0.0,0.0,,40600.0,INTEX,PANEL - LED,40600,10,Consumer Durables,,,,,,,,,TELEVISION,INTEX,LED-4000FHD 1321-3333-1,F,F,T,T,F
26403000000.0,RK43194,26403,2017-02-21 00:00:00,1970-01-01 21:53:52,,Straight Through Process,LOS_DISB,Married,Male,41,GRADUATE,OWNED-PENTHOUSE,UJJAIN,MADHYA PRADESH,456010,SELF-EMPLOYED,,Pan Not Submitted,803,56,0.0,0.0,,45000.0,MITSUBISHI,AIR CONDITIONER,45000,12,Consumer Durables,,,Self Owned,Utility Bill,,,,,AIR CONDITIONER,MITSUBISHI,MS MU HK30VA ( R410 A ),F,F,T,F,F
25556000000.0,RD02622,25556,2017-02-21 00:00:00,1970-01-01 21:50:56,,Straight Through Process,DCLN,Married,Male,22,GRADUATE,RENTED-ROWHOUSE,BHOPAL,MADHYA PRADESH,462042,SALARIED,26000.0,Pan Not Submitted,774,42,0.0,0.0,,,,,36000,12,Consumer Durables,,,,,,,,,PANEL - LED,LG,43LH520T,F,F,T,F,F
25594000000.0,KP77551,25594,2017-02-21 00:00:00,1970-01-01 21:48:34,,Straight Through Process,PD_DE,Single,Male,23,POST-GRADUATE,OWNED-CHAWL,AHMEDABAD,GUJARAT,382403,SELF-EMPLOYED,,ERROR,736,40,68.8,40.62,,,,,34000,10,Consumer Durables,,,Parent Owned,Utility Bill,,,,,REFRIGERATORS,LG,GL-P322RPJL 310 LTRS OLD MODELS,F,F,F,T,F
27644000000.0,DK69431,27644,2017-02-21 00:00:00,1970-01-01 21:44:18,,Under.Writer,PD_DE,Married,Male,32,GRADUATE,PARENT OWNED-FLAT,DURGAPUR STEEL TOWN WEST,WEST BENGAL,713204,SALARIED,,Pan Not Submitted,778,60,76.07,0.0,,,,,15000,9,Consumer Durables,,,,,,,,,PANEL - LED,SAMSUNG,UA24H4003ARLXL,T,F,F,F,T
27401000000.0,AS09702,27401,2017-02-21 00:00:00,1970-01-01 21:31:14,,Straight Through Process,PD_DE,Married,Male,51,UNDER GRADUATE,OWNED-FLAT,KALYANCITY HO,MAHARASHTRA,421301,SALARIED,,Pan Not Submitted,674,107,84.18,0.0,,,,,14250,10,Consumer Durables,,,Self Owned,Property Tax Bill,,,,,WASHING MACHINE,SAMSUNG,WA62H3H5QRP-TL 6.2 KG,F,F,T,F,F




In [8]:
sa=saved_model.predict(cars1)
sa.head(100)

drf prediction progress: |████████████████████████████████████████████████| 100%


predict,Approved,Declined
Approved,0.946134,0.0538663
Approved,0.993457,0.00654273
Declined,0.0542337,0.945766
Approved,0.993457,0.00654273
Approved,0.935726,0.0642745
Approved,0.993457,0.00654273
Declined,0.269711,0.730289
Declined,0.143109,0.856891
Approved,0.497582,0.502418
Declined,0.0920579,0.907942




In [9]:
ca=cars1['APPLICATION.ID'].cbind(sa)
ca

APPLICATION.ID,predict,Approved,Declined
26009000000.0,Approved,0.946134,0.0538663
27207000000.0,Approved,0.993457,0.00654273
27233000000.0,Declined,0.0542337,0.945766
26009000000.0,Approved,0.993457,0.00654273
26009000000.0,Approved,0.935726,0.0642745
26403000000.0,Approved,0.993457,0.00654273
25556000000.0,Declined,0.269711,0.730289
25594000000.0,Declined,0.143109,0.856891
27644000000.0,Approved,0.497582,0.502418
27401000000.0,Declined,0.0920579,0.907942


