In [1]:
from __future__ import print_function
import sklearn
import sklearn.datasets
import sklearn.ensemble
import pandas as pd
import numpy as np
import lime
import lime.lime_tabular
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

np.random.seed(1)

In [2]:
# Start an H2O virtual cluster that uses 6 gigs of RAM and 6 cores
h2o.init(max_mem_size = "500M", nthreads = 6) 

# Clean up the cluster just in case
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.6+8-LTS, mixed mode)
  Starting server from E:\Anaconda\envs\project1\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ASUSA4~1\AppData\Local\Temp\tmp3ce66rb3
  JVM stdout: C:\Users\ASUSA4~1\AppData\Local\Temp\tmp3ce66rb3\h2o_Rizky_Saputra_started_from_python.out
  JVM stderr: C:\Users\ASUSA4~1\AppData\Local\Temp\tmp3ce66rb3\h2o_Rizky_Saputra_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Asia/Bangkok
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,1 month
H2O_cluster_name:,H2O_from_python_Rizky_Saputra_u3exb0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,500 Mb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [33]:
class h2o_predict_proba_wrapper:
    # drf is the h2o distributed random forest object, the column_names is the
    # labels of the X values
    def __init__(self,model,column_names):
            
            self.model = model
            self.column_names = column_names
 
    def predict_proba(self,this_array):        
        # If we have just 1 row of data we need to reshape it
        shape_tuple = np.shape(this_array)        
        if len(shape_tuple) == 1:
            this_array = this_array.reshape(1, -1)
            
        # We convert the numpy array that Lime sends to a pandas dataframe and
        # convert the pandas dataframe to an h2o frame
        self.pandas_df = pd.DataFrame(data = this_array,columns = self.column_names)
        self.h2o_df = h2o.H2OFrame(self.pandas_df)
        
        # Predict with the h2o drf
        self.predictions = self.model.predict(self.h2o_df).as_data_frame()
        # the first column is the class labels, the rest are probabilities for
        # each class
        self.predictions = self.predictions.iloc[:,1:].as_data_frame()
        return self.predictions

In [34]:
iris = sklearn.datasets.load_iris()

# Get the text names for the features and the class labels
feature_names = iris.feature_names
class_labels = 'species'

In [35]:
# Generate a train test split and convert to pandas and h2o frames

train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(iris.data, iris.target, train_size=0.80)

train_h2o_df = h2o.H2OFrame(train)
train_h2o_df.set_names(iris.feature_names)
train_h2o_df['species'] = h2o.H2OFrame(iris.target_names[labels_train])
train_h2o_df['species'] = train_h2o_df['species'].asfactor()

test_h2o_df = h2o.H2OFrame(test)
test_h2o_df.set_names(iris.feature_names)
test_h2o_df['species'] = h2o.H2OFrame(iris.target_names[labels_test])
test_h2o_df['species'] = test_h2o_df['species'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [36]:
# rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
# rf.fit(train, labels_train)

iris_drf = H2ORandomForestEstimator(
        model_id="iris_drf",
        ntrees=500,
        stopping_rounds=2,
        score_each_iteration=True,
        seed=1000000,
        balance_classes=False,
        histogram_type="AUTO")

iris_drf.train(x=feature_names,
         y='species',
         training_frame=train_h2o_df)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [37]:
# sklearn.metrics.accuracy_score(labels_test, rf.predict(test))

iris_drf.model_performance(test_h2o_df)


ModelMetricsMultinomial: drf
** Reported on test data. **

MSE: 0.05142958262573867
RMSE: 0.22678091327476982
LogLoss: 1.2099415382279914
Mean Per-Class Error: 0.07692307692307693

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,setosa,versicolor,virginica,Error,Rate
0,11.0,0.0,0.0,0.0,0 / 11
1,0.0,6.0,0.0,0.0,0 / 6
2,0.0,3.0,10.0,0.230769,3 / 13
3,11.0,9.0,10.0,0.1,3 / 30



Top-3 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.9
1,2,1.0
2,3,1.0




In [38]:
train_pandas_df = train_h2o_df[feature_names].as_data_frame() 
train_numpy_array = train_pandas_df.to_numpy() 

test_pandas_df = test_h2o_df[feature_names].as_data_frame() 
test_numpy_array = test_pandas_df.to_numpy() 

In [39]:
#explainer = lime.lime_tabular.LimeTabularExplainer(train, feature_names=iris.feature_names, class_names=iris.target_names, discretize_continuous=True)

explainer = lime.lime_tabular.LimeTabularExplainer(train_numpy_array,
                                                   feature_names=feature_names,
                                                   class_names=iris.target_names,
                                                   discretize_continuous=True)

In [40]:
h2o_drf_wrapper = h2o_predict_proba_wrapper(iris_drf,feature_names) 

In [41]:
# i = np.random.randint(0, test_numpy_array.shape[0])

i = 27
# exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=2, top_labels=1)

exp = explainer.explain_instance(test_numpy_array[i], h2o_drf_wrapper.predict_proba, num_features=2, top_labels=1)

Parse progress: |█████████████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%


AttributeError: 'DataFrame' object has no attribute 'as_data_frame'