## AutoSklearn H2O Sample Code

In [1]:
!pip install h2o

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Regression

#### Load Data

In [3]:
cars = pd.read_csv('cars.csv')
cars

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,body_type,has_warranty,state,drivetrain,price_usd,is_exchangeable,number_of_photos,up_counter,duration_listed
0,Subaru,Outback,automatic,silver,190000,2012,gasoline,False,gasoline,2.5,universal,False,owned,all,15260.000,False,4.5,6.5,8.0
1,Subaru,Outback,automatic,blue,290000,2004,gasoline,False,gasoline,3.0,universal,False,owned,all,7000.000,True,6.0,27.0,41.5
2,Subaru,Forester,automatic,red,402000,2003,gasoline,False,gasoline,2.5,suv,False,owned,all,3920.000,True,2.0,36.0,75.5
3,Subaru,Impreza,mechanical,blue,10000,2001,gasoline,False,gasoline,3.0,sedan,False,owned,all,13998.600,True,4.5,21.0,43.0
4,Subaru,Legacy,automatic,black,280000,2003,gasoline,False,gasoline,2.5,universal,False,owned,all,2987.754,True,7.0,3.5,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,Chrysler,300,automatic,silver,290000,2002,gasoline,False,gasoline,3.5,sedan,False,owned,front,3850.000,True,2.5,42.5,150.5
38527,Chrysler,PT Cruiser,mechanical,blue,321000,2006,diesel,False,diesel,2.2,hatchback,False,owned,front,6720.000,True,2.0,10.0,158.5
38528,Chrysler,300,automatic,blue,777957,2002,gasoline,False,gasoline,3.5,sedan,False,owned,front,6020.000,False,1.5,31.5,184.5
38529,Chrysler,PT Cruiser,mechanical,black,20000,2003,gasoline,False,gasoline,2.0,minivan,False,owned,front,5600.000,True,3.5,78.0,245.0


In [4]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38531 entries, 0 to 38530
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer_name  38531 non-null  object 
 1   model_name         38531 non-null  object 
 2   transmission       38531 non-null  object 
 3   color              38531 non-null  object 
 4   odometer_value     38531 non-null  int64  
 5   year_produced      38531 non-null  int64  
 6   engine_fuel        38531 non-null  object 
 7   engine_has_gas     38531 non-null  bool   
 8   engine_type        38531 non-null  object 
 9   engine_capacity    38521 non-null  float64
 10  body_type          38531 non-null  object 
 11  has_warranty       38531 non-null  bool   
 12  state              38531 non-null  object 
 13  drivetrain         38531 non-null  object 
 14  price_usd          38531 non-null  float64
 15  is_exchangeable    38531 non-null  bool   
 16  number_of_photos   385

In [5]:
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.19" 2023-04-18; OpenJDK Runtime Environment (build 11.0.19+7-post-Ubuntu-0ubuntu120.04.1); OpenJDK 64-Bit Server VM (build 11.0.19+7-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp1eud5fwx
  JVM stdout: /tmp/tmp1eud5fwx/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp1eud5fwx/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,08 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_unknownUser_ecvnox
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


#### Partition Data

In [6]:
h2o_cars = h2o.H2OFrame(cars)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [7]:
train, valid = h2o_cars.split_frame(ratios=[0.8], seed=42)

#### Train

In [8]:
label = 'price_usd'
train_features = ['odometer_value', 'engine_has_gas', 'is_exchangeable', 'is_exchangeable', 'up_counter']

In [9]:
aml = H2OAutoML(max_runtime_secs=60)
aml.train(x=train_features, y=label, training_frame= train, validation_frame=valid)

AutoML progress: |
12:13:16.730: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),2/3
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,
Custom metalearner hyperparameters,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,5077.6235,76.126045,5191.2427,5042.9414,5113.257,4995.482,5045.1953
mean_residual_deviance,55276516.0,1714241.0,58100368.0,54906992.0,55300484.0,53502460.0,54572276.0
mse,55276516.0,1714241.0,58100368.0,54906992.0,55300484.0,53502460.0,54572276.0
null_deviance,504889836000.0,21017124900.0,539592884000.0,507076608000.0,496451650000.0,497061003000.0,484266967000.0
r2,0.3223279,0.0122739,0.3226199,0.3344267,0.3052879,0.3334547,0.3158505
residual_deviance,341994930000.0,14231007200.0,365393215000.0,337293640000.0,344853840000.0,331180212000.0,331253711000.0
rmse,7434.1113,114.5824,7622.36,7409.9253,7436.4297,7314.5376,7387.305
rmsle,0.947441,0.0162435,0.9581943,0.9237533,0.9651355,0.9398535,0.9502685


#### Evaluate

In [10]:
# Get the best performing model
best_model = aml.leader

# Evaluate model performance on validation set
perf = best_model.model_performance(valid)
print(perf)

ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 53312499.909136094
RMSE: 7301.540927032875
MAE: 5043.295096193041
RMSLE: 0.9467949961942497
Mean Residual Deviance: 53312499.909136094
R^2: 0.32020241014180884
Null degrees of freedom: 7602
Residual degrees of freedom: 7600
Null deviance: 596393314535.6222
Residual deviance: 405334936809.16174
AIC: 156854.5328795445


In [11]:
leaderboard = aml.leaderboard
print (leaderboard)

model_id                                                    rmse          mse      mae       rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_1_AutoML_1_20230610_121316  7437.07  5.531e+07    5077.9     0.946639               5.531e+07
StackedEnsemble_BestOfFamily_2_AutoML_1_20230610_121316  7454.89  5.55754e+07  5096.56    0.958698               5.55754e+07
GBM_1_AutoML_1_20230610_121316                           7485.57  5.60337e+07  5105.81    0.956254               5.60337e+07
XGBoost_2_AutoML_1_20230610_121316                       7572.66  5.73452e+07  5004.93    0.928248               5.73452e+07
XGBoost_1_AutoML_1_20230610_121316                       7584.99  5.7532e+07   5166.76  nan                      5.7532e+07
DRF_1_AutoML_1_20230610_121316                           8292.04  6.87579e+07  5835.89    1.06335                6.87579e+07
GBM_2_AutoML_1_20230610_121316                           8466.59  7.16832e+07  5980.8     1.07628                7.16832e+07
GBM

In [12]:
# Print all rows instead of 10 rows
leaderboard.head(rows = leaderboard.nrows)

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_1_AutoML_1_20230610_121316,7437.07,55310000.0,5077.9,0.946639,55310000.0
StackedEnsemble_BestOfFamily_2_AutoML_1_20230610_121316,7454.89,55575400.0,5096.56,0.958698,55575400.0
GBM_1_AutoML_1_20230610_121316,7485.57,56033700.0,5105.81,0.956254,56033700.0
XGBoost_2_AutoML_1_20230610_121316,7572.66,57345200.0,5004.93,0.928248,57345200.0
XGBoost_1_AutoML_1_20230610_121316,7584.99,57532000.0,5166.76,,57532000.0
DRF_1_AutoML_1_20230610_121316,8292.04,68757900.0,5835.89,1.06335,68757900.0
GBM_2_AutoML_1_20230610_121316,8466.59,71683200.0,5980.8,1.07628,71683200.0
GBM_3_AutoML_1_20230610_121316,8473.15,71794300.0,5995.81,1.07738,71794300.0
GBM_4_AutoML_1_20230610_121316,8690.14,75518600.0,6178.92,1.09468,75518600.0
GLM_1_AutoML_1_20230610_121316,9034.13,81615500.0,6466.51,1.12352,81615500.0


## Classification

#### Load Data

In [13]:
titanic = pd.read_csv('titanic.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [14]:
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 min 30 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_unknownUser_ecvnox
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.162 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [15]:
# For classification, the label column data type must be categorical
# we convert to categorical (enum) using the asfactor function

#titanic['Survived'] = pd.factorize(titanic['Survived'])[0]
#titanic['Survived'] = titanic.Survived.astype('category')
titanic['Survived'] = titanic.Survived.astype('str')
titanic.dtypes

PassengerId      int64
Survived        object
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [16]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
h2o_titanic = h2o.H2OFrame(titanic)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [18]:
train, valid = h2o_titanic.split_frame(ratios=[0.8], seed=42)

In [19]:
label = 'Survived'
train_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

In [20]:
# If the label column is not categorical type but the values in the column seems to be for classification,
# the number of values in the label column, you might get such a warning:
# # Detected that your response column has only 2 unique values (0/1). 
# # If you wish to train a binary model instead of a regression model, 
# # convert your target column to categorical before training.

aml_classification = H2OAutoML(max_runtime_secs=60)
aml_classification.train(x=train_features, y=label, training_frame= train, validation_frame=valid)

AutoML progress: |
12:14:20.125: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
12:14:20.133: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


12:14:21.380: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

█
12:14:22.93: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

█
12:14:

key,value
Stacking strategy,cross_validation
Number of base models (used / total),6/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DRF base models (used / total),2/2
# GLM base models (used / total),1/1
# DeepLearning base models (used / total),1/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.2700015,0.0231035,0.2754715,0.2866729,0.2558545,0.237763,0.2942456
mean_residual_deviance,0.1305552,0.0159085,0.1296042,0.1478337,0.1138704,0.1159958,0.1454721
mse,0.1305552,0.0159085,0.1296042,0.1478337,0.1138704,0.1159958,0.1454721
null_deviance,33.133068,1.5502192,32.313572,34.95398,31.935497,31.786848,34.67545
r2,0.4441439,0.0598427,0.4420494,0.369805,0.5071483,0.4995113,0.4022054
residual_deviance,18.431862,2.7957776,18.014982,22.027218,15.714112,15.891431,20.511562
rmse,0.3607868,0.022027,0.3600058,0.3844915,0.3374469,0.3405816,0.381408
rmsle,0.2549147,0.0145961,0.2558025,0.2732077,0.239399,0.2414781,0.2646865


In [21]:
# Get the best performing model
best_model = aml_classification.leader

# Evaluate model performance on validation set
perf = best_model.model_performance(valid)
print(perf)

ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 0.11408902363625167
RMSE: 0.337770667222972
MAE: 0.24190924122179275
RMSLE: 0.23795309182452695
Mean Residual Deviance: 0.11408902363625167
R^2: 0.5270769241896534
Null degrees of freedom: 186
Residual degrees of freedom: 180
Null deviance: 45.2650035511364
Residual deviance: 21.334647419979063
AIC: 140.74785711006442
