# IV.1. Установка пакета h2o

Пакет h2o для Python устанавливаем командой `pip install h2o`, выполнив ее в командной строке.

# IV.2. Запуск кластера H2O

In [1]:
# Import the libraries needed to work with H2O
import h2o
import os
# Connect to a local H2O instance, starting a new one if none is running.
# nthreads=-1 lets H2O use all available cores; max_mem_size=8 is
# presumably read as gigabytes when given as an integer -- confirm against
# the h2o.init documentation
h2o.init(nthreads=-1, max_mem_size=8)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_202"; Java(TM) SE Runtime Environment (build 1.8.0_202-b08); Java HotSpot(TM) 64-Bit Server VM (build 25.202-b08, mixed mode)
  Starting server from /anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/y_/s7c_myjd7qg6zs3hcfflpgwr0000gn/T/tmpspnvn55j
  JVM stdout: /var/folders/y_/s7c_myjd7qg6zs3hcfflpgwr0000gn/T/tmpspnvn55j/h2o_artemgruzdev_started_from_python.out
  JVM stderr: /var/folders/y_/s7c_myjd7qg6zs3hcfflpgwr0000gn/T/tmpspnvn55j/h2o_artemgruzdev_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Europe/Moscow
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.5
H2O cluster version age:,2 months and 23 days
H2O cluster name:,H2O_from_python_artemgruzdev_fs6cgs
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


# IV.3. Преобразование данных во фреймы H2O

## IV.3.1. Получение фреймов H2O из датафреймов pandas

In [2]:
# импортируем библиотеки pandas и numpy
import pandas as pd
import numpy as np

In [3]:
# Read the semicolon-separated CSV file into a pandas DataFrame
data = pd.read_csv('Data/StateFarm_for_H2O.csv', sep=';')
# Show the first three rows as a quick sanity check
data.head(3)

Unnamed: 0,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Response
0,65999,237,1,14,0,6,0
1,0,65,19,56,0,3,0
2,54500,63,28,17,0,6,0


In [4]:
# train_test_split() divides the data into a training and a test part
from sklearn.model_selection import train_test_split

# Separate the predictors from the 'Response' labels, then make a 70/30
# split stratified on the labels; the fixed random_state keeps the
# partition reproducible. The call returns the training features, test
# features, training labels and test labels, in that order.
features = data.drop(columns='Response')
labels = data['Response']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)

In [5]:
# Put features and labels back together column-wise, once for the
# training part and once for the test part
train = pd.concat(objs=[X_train, y_train], axis='columns')
test = pd.concat(objs=[X_test, y_test], axis='columns')
# Convert both pandas DataFrames into H2O frames
tr, tst = h2o.H2OFrame(train), h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


## IV.3.2. Получение фреймов H2O напрямую

In [6]:
# Load the CSV file straight into an H2O frame (no separator is passed:
# H2O works out the column separator of the file on its own)
data = h2o.upload_file(path='Data/StateFarm_for_H2O.csv')
# The file appears to start with a UTF-8 byte-order mark, which H2O keeps
# as part of the first column name ('\ufeffIncome'). Strip it so that the
# column can be addressed as plain 'Income' in later cells and reports.
data.set_names([name.lstrip('\ufeff') for name in data.names])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
# Split the H2O frame into a training part (ratio 0.7) and a test part
# with the remaining rows; the seed is fixed for repeatability.
# NOTE(review): unlike the sklearn split earlier in this notebook, this
# split is not stratified on 'Response' -- confirm that is acceptable
tr, tst = data.split_frame(ratios=[.7], seed=42)

# IV.4. Знакомство с содержимым фрейма

In [8]:
# Display a summary of the training frame: dimensions, column types,
# per-column statistics and the first rows
tr.describe()

Rows:5799
Cols:7




Unnamed: 0,﻿Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Response
type,int,int,int,int,int,int,int
mins,0.0,61.0,0.0,0.0,0.0,1.0,0.0
mean,37865.85031902055,93.31401965856195,15.117261596827044,48.28694602517683,0.3821348508363507,2.990170719089499,0.09949991377823762
maxs,99961.0,298.0,35.0,99.0,5.0,9.0,1.0
sigma,30409.096303369428,34.75395002180724,10.092161225345725,27.74697967899083,0.9094184627653707,2.4063142665913926,0.29935787025903
zeros,1468,0,206,58,4605,0,5222
missing,0,0,0,0,0,0,0
0,65999.0,237.0,1.0,14.0,0.0,6.0,0.0
1,0.0,65.0,19.0,56.0,0.0,3.0,0.0
2,54500.0,63.0,28.0,17.0,0.0,6.0,0.0


In [9]:
# Turn 'Response' into a categorical (enum) column in both frames
for frame in (tr, tst):
    frame['Response'] = frame['Response'].asfactor()

In [10]:
# Display the summary of the training frame again to check that
# 'Response' is now reported with the enum type
tr.describe()

Rows:5799
Cols:7




Unnamed: 0,﻿Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Response
type,int,int,int,int,int,int,enum
mins,0.0,61.0,0.0,0.0,0.0,1.0,
mean,37865.85031902055,93.31401965856195,15.117261596827044,48.28694602517683,0.3821348508363507,2.990170719089499,
maxs,99961.0,298.0,35.0,99.0,5.0,9.0,
sigma,30409.096303369428,34.75395002180724,10.092161225345725,27.74697967899083,0.9094184627653707,2.4063142665913926,
zeros,1468,0,206,58,4605,0,
missing,0,0,0,0,0,0,0
0,65999.0,237.0,1.0,14.0,0.0,6.0,0
1,0.0,65.0,19.0,56.0,0.0,3.0,0
2,54500.0,63.0,28.0,17.0,0.0,6.0,0


# IV.5. Определение имени зависимой переменной и списка имен предикторов

In [11]:
# Name of the dependent variable
dependent = 'Response'
# Every other column of the training frame serves as a predictor: take
# the column names and cut the dependent variable out of them
# (list.index raises ValueError if the column is missing)
column_names = list(tr.columns)
target_pos = column_names.index(dependent)
predictors = column_names[:target_pos] + column_names[target_pos + 1:]

# IV.6. Обучение модели машинного обучения

In [12]:
# Bring in the Distributed Random Forest estimator class
from h2o.estimators import H2ORandomForestEstimator

# Instantiate the estimator with a fixed seed
forest_model = H2ORandomForestEstimator(seed=42)
# Fit the model on the training frame; the test frame is passed as the
# validation frame, so metrics are reported for it as well
forest_model.train(
    x=predictors,
    y=dependent,
    training_frame=tr,
    validation_frame=tst,
)

drf Model Build progress: |███████████████████████████████████████████████| 100%


# IV.7. Вывод модели

In [13]:
# Display the fitted model: metrics on the training and validation data,
# scoring history and variable importances
forest_model

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1559051017131_1


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.04619564763372571
RMSE: 0.21493172784334497
LogLoss: 0.3693878012757886
Mean Per-Class Error: 0.07330123122610843
AUC: 0.9457811472194362
pr_auc: 0.6112064384131378
Gini: 0.8915622944388724
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3176226465604006: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,4949.0,273.0,0.0523,(273.0/5222.0)
1,70.0,507.0,0.1213,(70.0/577.0)
Total,5019.0,780.0,0.0591,(343.0/5799.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3176226,0.7472366,210.0
max f2,0.2579545,0.8301827,232.0
max f0point5,0.3631774,0.6956771,196.0
max accuracy,0.3631774,0.9424039,196.0
max precision,0.8279956,0.75,51.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9961700,0.0
max absolute_mcc,0.3123677,0.7250168,211.0
max min_per_class_accuracy,0.2271789,0.9243585,245.0


Gains/Lift Table: Avg response rate:  9,95 %, avg score: 13,38 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0117262,1.0,7.0943012,7.0943012,0.7058824,1.0,0.7058824,1.0,0.0831889,0.0831889,609.4301152,609.4301152
,2,0.0200034,0.9432323,6.4907929,6.8445736,0.6458333,0.9602896,0.6810345,0.9835681,0.0537262,0.1369151,549.0792894,584.4573597
,3,0.0300052,0.9001071,8.1441762,7.2777745,0.8103448,0.9243778,0.7241379,0.9638380,0.0814558,0.2183709,714.4176179,627.7774458
,4,0.0400069,0.8258451,8.3174565,7.5376950,0.8275862,0.8672261,0.75,0.9396850,0.0831889,0.3015598,731.7456523,653.7694974
,5,0.0500086,0.7772514,6.5846531,7.3470866,0.6551724,0.8013156,0.7310345,0.9120111,0.0658579,0.3674177,558.4653081,634.7086595
,6,0.1000172,0.5281046,6.4460288,6.8965577,0.6413793,0.6488410,0.6862069,0.7804261,0.3223570,0.6897747,544.6028805,589.6557700
,7,0.1500259,0.2543860,4.4706329,6.0879161,0.4448276,0.3752086,0.6057471,0.6453536,0.2235702,0.9133449,347.0632881,508.7916094
,8,0.2000345,0.1625000,0.7277774,4.7478814,0.0724138,0.2004464,0.4724138,0.5341268,0.0363951,0.9497400,-27.2222554,374.7881432
,9,0.3002242,0.0916667,0.0518946,3.1807543,0.0051635,0.1207976,0.3164848,0.3961921,0.0051993,0.9549393,-94.8105370,218.0754303




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.047934206502402175
RMSE: 0.21893881908515486
LogLoss: 0.30493459250887994
Mean Per-Class Error: 0.06672751920469944
AUC: 0.9276565747853592
pr_auc: 0.6437478132835207
Gini: 0.8553131495707185
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3389321720600128: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,2100.0,113.0,0.0511,(113.0/2213.0)
1,28.0,222.0,0.112,(28.0/250.0)
Total,2128.0,335.0,0.0572,(141.0/2463.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3389322,0.7589744,140.0
max f2,0.2700000,0.8498896,156.0
max f0point5,0.4227706,0.7002706,126.0
max accuracy,0.3389322,0.9427527,140.0
max precision,0.9893333,0.9285714,1.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9995481,0.0
max absolute_mcc,0.2700000,0.7412910,156.0
max min_per_class_accuracy,0.2183333,0.9286037,177.0


Gains/Lift Table: Avg response rate: 10,15 %, avg score: 13,80 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0105562,0.9735000,8.3363077,8.3363077,0.8461538,0.9899583,0.8461538,0.9899583,0.088,0.088,733.6307692,733.6307692
,2,0.0203004,0.9630032,7.7995,8.07864,0.7916667,0.9681774,0.82,0.9795035,0.076,0.164,679.95,707.864
,3,0.0304507,0.9390476,6.6993600,7.61888,0.68,0.9505371,0.7733333,0.9698480,0.068,0.232,569.9360000,661.888
,4,0.0406009,0.8690909,5.9112,7.19196,0.6,0.9005627,0.73,0.9525267,0.06,0.292,491.12,619.196
,5,0.0503451,0.8178333,5.3365,6.8328387,0.5416667,0.8455708,0.6935484,0.9318255,0.052,0.344,433.65,583.2838710
,6,0.1002842,0.5422468,6.6480976,6.7408421,0.6747967,0.6722613,0.6842105,0.8025688,0.332,0.676,564.8097561,574.0842105
,7,0.1502233,0.2449857,5.0461463,6.1774703,0.5121951,0.3957178,0.6270270,0.6673183,0.252,0.928,404.6146341,517.7470270
,8,0.2001624,0.1424000,0.3203902,4.7161704,0.0325203,0.1846172,0.4787018,0.5468878,0.016,0.944,-67.9609756,371.6170385
,9,0.3000406,0.0803905,0.0,3.1462409,0.0,0.1077914,0.3193505,0.4007204,0.0,0.944,-100.0,214.6240866



Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
,2019-05-28 16:43:42,0.033 sec,0.0,,,,,,,,,,,,
,2019-05-28 16:43:42,0.187 sec,1.0,0.2835583,2.0301177,0.8700494,0.2597914,5.1112751,0.0983683,0.3008948,2.3641222,0.8563380,0.2191648,4.9048584,0.1075924
,2019-05-28 16:43:42,0.245 sec,2.0,0.2870094,2.0021227,0.8598967,0.2765002,5.0445320,0.1052934,0.2595292,0.9381972,0.9027817,0.3357621,7.0606000,0.0747056
,2019-05-28 16:43:42,0.288 sec,3.0,0.2806831,1.7491134,0.8786615,0.2806030,5.3644474,0.1046054,0.2491108,0.7245359,0.9217695,0.3950726,7.19196,0.0795778
,2019-05-28 16:43:42,0.313 sec,4.0,0.2814015,1.6574366,0.8771750,0.2875929,5.5532968,0.1073481,0.2405474,0.6545105,0.9272833,0.4408216,7.2474483,0.0775477
---,---,---,---,---,---,---,---,---,---,---,---,---,---,---,---
,2019-05-28 16:43:44,2.308 sec,46.0,0.2157715,0.3860260,0.9445060,0.6089056,6.8916068,0.0593206,0.2187688,0.3044243,0.9280678,0.6388261,7.5784615,0.0576533
,2019-05-28 16:43:44,2.363 sec,47.0,0.2156896,0.3860089,0.9443872,0.6091984,6.8916068,0.0598379,0.2187897,0.3041223,0.9280615,0.6434493,8.3363077,0.0596833
,2019-05-28 16:43:44,2.418 sec,48.0,0.2153179,0.3801665,0.9452918,0.6106247,6.8916068,0.0594930,0.2187701,0.3042038,0.9279892,0.6436119,8.3363077,0.0576533



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
﻿Income,3106.9572754,1.0,0.2398379
Months Since Policy Inception,2907.6789551,0.9358606,0.2244549
Monthly Premium Auto,2650.9870605,0.8532422,0.2046398
Months Since Last Claim,2577.0380859,0.8294411,0.1989314
Number of Policies,1133.2757568,0.3647542,0.0874819
Number of Open Complaints,578.4661255,0.1861841,0.0446540




# IV.8. Получение прогнозов

In [14]:
# Score the test frame: get the predicted class and the predicted
# class probabilities of the dependent variable
predictions = forest_model.predict(tst)
# Display the predictions (columns predict, p0 and p1)
predictions

drf prediction progress: |████████████████████████████████████████████████| 100%


predict,p0,p1
1,0.542,0.458
0,0.928095,0.0719048
0,0.875667,0.124333
0,0.933,0.067
0,0.888,0.112
0,1.0,0.0
0,0.931667,0.0683333
0,0.96,0.04
0,0.808,0.192
0,1.0,0.0




In [15]:
# Finish working with H2O by shutting the cluster down
h2o.cluster().shutdown()

H2O session _sid_970e closed.
