In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Echo the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Load the data set CSV into a pandas DataFrame.
train = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Preview the first rows of the loaded data.
train.head()

In [None]:
# Remove the "id" and "Unnamed: 32" columns before modelling.
# NOTE(review): presumably a row identifier and an empty trailing column --
# confirm against the train.head() output.
# drop(..., errors="ignore") is used instead of `del` so this cell can be
# re-run safely: a second run is a no-op rather than a KeyError on the
# already-removed columns.
train = train.drop(columns=["id", "Unnamed: 32"], errors="ignore")

In [None]:
# Count the columns currently present in train.
len(train.columns)

In [None]:
# Names of the 30 feature columns: each of the ten base measurements appears
# once per statistic suffix (_mean, _se, _worst), grouped by suffix.
X = [
    measurement + "_" + statistic
    for statistic in ("mean", "se", "worst")
    for measurement in (
        "radius", "texture", "perimeter", "area", "smoothness",
        "compactness", "concavity", "concave points", "symmetry",
        "fractal_dimension",
    )
]

In [None]:
# Summary statistics for the columns of train.
train.describe()

In [None]:
# Discard every row that contains at least one missing value, then
# summarise what is left.
train = train.dropna(axis="index", how="any")
train.describe()

In [None]:
# Print the per-column dtype and non-null count summary of train.
train.info()

In [None]:
# Show the column labels of train.
train.columns

In [None]:
import h2o
# Initialise H2O before any H2OFrame or estimator is created in later cells.
h2o.init()

In [None]:
"""
Converting Pandas DF to H2O Frame
"""
# hf is the H2O-side copy of train used by every model below.
hf = h2o.H2OFrame(python_obj=train)

In [None]:
# Column names at positions 1 through 30 of hf (the first column, index 0, is excluded).
hf.col_names[1:31]

In [None]:
"""
Isolation Forest
Build an unsupervised model using only the independent features, then score the same frame.
Parameters:
    seed - random seed value, fixed so that runs are repeatable.
    ntrees - number of trees to build in the forest.
    max_depth - maximum depth of each tree.
    x - independent feature names.
    training_frame - training data.
"""
seed = 9933
ntrees = 333
isoforest = h2o.estimators.H2OIsolationForestEstimator(ntrees=ntrees, seed=seed, max_depth=33)
# Select the features by name through X rather than by position
# (hf.col_names[1:31]): the model is then fitted on exactly the columns that
# later cells reuse (anomalies[X], x = X) and does not depend on column order.
isoforest.train(x=X, training_frame=hf)
# Score the same frame the model was fitted on; the result is displayed below.
predictions = isoforest.predict(hf)
predictions

In [None]:
# Dimensions of the prediction frame.
predictions.shape

In [None]:
# Histogram of the mean_length column of the predictions.
predictions["mean_length"].hist()

In [None]:
"""
Interpreting predictions
"""
# Correlation between the columns of the prediction frame.
predictions.cor()

In [None]:
# Rows whose mean_length falls below the cutoff of 8 are treated as anomalies.
is_anomalous = predictions["mean_length"] < 8
anomalies = hf[is_anomalous]
print("Number of Anomalies: {}".format(anomalies.nrow))

In [None]:
# Display the rows selected as anomalies.
anomalies

In [None]:
# Re-score only the anomalous rows and show their mean_length side by side
# with their feature columns.
isoforest.predict(anomalies)["mean_length"].cbind(anomalies[X])

In [None]:
# Take a full row/column slice of hf and attach the label column to that slice.
global_data = hf[:, :]
# Label a row "Yes" when its mean_length is below 8, otherwise "No".
below_cutoff = predictions["mean_length"] < 8
global_data["anomaly"] = below_cutoff.ifelse("Yes", "No")
# Tabulate how many rows received each label.
global_data["anomaly"].table()

In [None]:
from h2o.estimators import H2ORandomForestEstimator
# A random forest configured as one shallow tree: a single tree of depth 3,
# sample_rate of 1 and mtries equal to the number of features in X.
# It is fitted to predict the "anomaly" label from the features in X.
global_dt = H2ORandomForestEstimator(
    model_id="decision_tree.hex",
    ntrees=1,
    max_depth=3,
    sample_rate=1,
    mtries=len(X),
)
global_dt.train(x=X, y="anomaly", training_frame=global_data)

In [None]:
import os
import subprocess
from IPython.display import Image
def generateTreeImage(decision_tree, image_file_path):
    """Render tree 0 of a model as a PNG file and return it for display.

    The model's MOJO is downloaded together with h2o-genmodel.jar, the jar's
    PrintMojo tool writes a Graphviz file next to the MOJO, and Graphviz
    ``dot`` converts that file into ``image_file_path``.

    Args:
        decision_tree: Model object exposing
            ``download_mojo(get_genmodel_jar=True)`` that returns the path of
            the downloaded MOJO.
        image_file_path: Destination path of the PNG image.

    Returns:
        An ``IPython.display.Image`` built from ``image_file_path``.

    Raises:
        subprocess.CalledProcessError: If ``java`` or ``dot`` exits with a
            non-zero status.
        OSError: If the ``java`` or ``dot`` executable cannot be started.
    """
    # Download MOJO (and the genmodel jar alongside it).
    mojo_path = decision_tree.download_mojo(get_genmodel_jar=True)
    directory = os.path.dirname(mojo_path)
    # Assumes the jar is saved in the same directory as the MOJO under this name.
    h2o_jar_path = os.path.join(directory, "h2o-genmodel.jar")

    # Create Graphviz file.
    gv_file_path = os.path.join(directory, "decision_tree.gv")
    # Arguments are passed as a list (no shell) so paths containing spaces or
    # shell metacharacters are handled verbatim; check_call raises on failure
    # instead of silently continuing with a missing output file.
    subprocess.check_call([
        "java", "-cp", h2o_jar_path, "hex.genmodel.tools.PrintMojo",
        "--tree", "0", "-i", mojo_path, "-o", gv_file_path,
    ])

    # Convert the Graphviz description into a PNG.
    subprocess.check_call(["dot", "-Tpng", gv_file_path, "-o", image_file_path])

    return Image(image_file_path)

In [None]:
# Render the tree of global_dt to /kaggle/working/1.png and display the image.
generateTreeImage(global_dt, "/kaggle/working/1.png")