In [1]:
""" To run, please install skicit-multiflow, and Cython
    Commands to do this are as follows:
        - pip install Cython
        - pip install scikit-multiflow
"""
!pip install Cython --quiet
!pip install scikit-multiflow --quiet

import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from skmultiflow.data import SEAGenerator
from skmultiflow.trees import HoeffdingTreeClassifier
from sklearn.model_selection import train_test_split

def neg_pos(data):
    if data > 0:
        return 1
    else: 
        return 0

trainDf = pd.read_csv('/content/aps_failure_training_set_processed_8bit.csv')
testDf = pd.read_csv('/content/aps_failure_test_set_processed_8bit.csv')

trainDf['class'] = trainDf['class'].apply(neg_pos) # labels changed to either 0 or 1 based on its parity 
testDf['class'] = testDf['class'].apply(neg_pos)

X = trainDf.drop('class', axis = 1)
y = trainDf['class']

X_test = testDf.drop('class', axis = 1)
y_test = testDf['class']

stream = SEAGenerator(random_state=1)
ht = HoeffdingTreeClassifier(leaf_prediction = 'nb')

curr_samples = 0
tp = 0
fp = 0
max_samples = 10000

while curr_samples < max_samples and stream.has_more_samples():
    X_test, y_test = stream.next_sample()
    y_pred = ht.predict(X_test)
    if y_test[0] == y_pred[0]:
        tp += 1
    else:
        fp += 1
    ht = ht.partial_fit(X_test, y_test)
    curr_samples += 1
    print(ht.measure_tree_depth())

print('{} samples analyzed.'.format(curr_samples))
print('Hoeffding Tree accuracy: {}'.format(tp / curr_samples))
print(fp)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3

In [11]:
(round(1 - (0.9616), 3)) * 100

3.8

In [3]:
ht.get_params(deep = True)

{'binary_split': False,
 'grace_period': 200,
 'leaf_prediction': 'nb',
 'max_byte_size': 33554432,
 'memory_estimate_period': 1000000,
 'nb_threshold': 0,
 'no_preprune': False,
 'nominal_attributes': None,
 'remove_poor_atts': False,
 'split_confidence': 1e-07,
 'split_criterion': 'info_gain',
 'stop_mem_management': False,
 'tie_threshold': 0.05}

In [4]:
ht.measure_tree_depth()

3

In [5]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1]])

In [6]:
print(ht.get_model_description())

if Attribute 0 <= 4.549969620513424:
  if Attribute 1 <= 5.440182925299016:
    Leaf = Class 0 | {0: 1483.7701231655974, 1: 126.21043791003063}
  if Attribute 1 > 5.440182925299016:
    Leaf = Class 1 | {0: 226.2298768344026, 1: 1134.7895620899694}
if Attribute 0 > 4.549969620513424:
  if Attribute 1 <= 2.7458918258475:
    if Attribute 0 <= 7.025831689338791:
      Leaf = Class 0 | {0: 346.9957515289482, 1: 80.24731419124495}
    if Attribute 0 > 7.025831689338791:
      Leaf = Class 1 | {0: 22.00424847105179, 1: 462.752685808755}
  if Attribute 1 > 2.7458918258475:
    Leaf = Class 1 | {0: 19.802776649733953, 1: 2649.539139305971}



In [7]:
ht.get_model_measurements

{'Active leaf byte size estimate': 0.0,
 'Active learning nodes': 5,
 'Byte size estimate overhead': 1.0,
 'Inactive leaf byte size estimate': 0.0,
 'Tree depth': 3,
 'Tree size (leaves)': 5,
 'Tree size (nodes)': 9}

In [8]:
ht.get_model_rules()

[Rule(class_distribution=None, class_idx=0, drift_detector=None),
 Rule(class_distribution=None, class_idx=1, drift_detector=None),
 Rule(class_distribution=None, class_idx=0, drift_detector=None),
 Rule(class_distribution=None, class_idx=1, drift_detector=None),
 Rule(class_distribution=None, class_idx=1, drift_detector=None)]

In [9]:
ht.get_info()

"HoeffdingTreeClassifier(binary_split=False, grace_period=200,\n                        leaf_prediction='nb', max_byte_size=33554432,\n                        memory_estimate_period=1000000, nb_threshold=0,\n                        no_preprune=False, nominal_attributes=None,\n                        remove_poor_atts=False, split_confidence=1e-07,\n                        split_criterion='info_gain', stop_mem_management=False,\n                        tie_threshold=0.05)"

In [10]:
ht.new_split_node(X_test, X)

<skmultiflow.trees.nodes.split_node.SplitNode at 0x7f13d5bf4190>