Skip to content

Commit

Permalink
Add streamline regression attribute observer
Browse files Browse the repository at this point in the history
  • Loading branch information
smastelini committed Apr 22, 2020
1 parent e4ac207 commit b3ceddd
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 295 deletions.
4 changes: 1 addition & 3 deletions src/skmultiflow/trees/attribute_observer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
from .numeric_attribute_class_observer_binary_tree import NumericAttributeClassObserverBinaryTree
from .numeric_attribute_class_observer_gaussian import NumericAttributeClassObserverGaussian
from .numeric_attribute_regression_observer import NumericAttributeRegressionObserver
from .numeric_attribute_regression_observer_multi_target import NumericAttributeRegressionObserverMultiTarget

__all__ = ["AttributeClassObserver", "AttributeClassObserverNull", "NominalAttributeClassObserver",
"NominalAttributeRegressionObserver", "NumericAttributeClassObserverBinaryTree",
"NumericAttributeClassObserverGaussian", "NumericAttributeRegressionObserver",
"NumericAttributeRegressionObserverMultiTarget"]
"NumericAttributeClassObserverGaussian", "NumericAttributeRegressionObserver"]
Original file line number Diff line number Diff line change
@@ -1,182 +1,231 @@
from skmultiflow.trees.attribute_observer import NumericAttributeClassObserverBinaryTree
import math
import numpy as np
from skmultiflow.trees.attribute_observer import AttributeClassObserver
from skmultiflow.trees.attribute_test import NumericAttributeBinaryTest
from skmultiflow.trees.attribute_split_suggestion import AttributeSplitSuggestion


class NumericAttributeRegressionObserver(NumericAttributeClassObserverBinaryTree):
""" Class for observing the data distribution for a numeric attribute for regression.
class NumericAttributeRegressionObserver(AttributeClassObserver):
"""iSoup-Tree's Extended Binary Search Tree (E-BST)
This class implements the Extended Binary Search Tree (E-BST)
structure, using the variant employed by Osojnik et al. [1]_ in the
iSOUP-Tree algorithm. This structure is employed to observe the target
space distribution.
In this variant, only the left branch statistics are stored.
References
----------
.. [1] Osojnik, Aljaž. 2017. Structured output prediction on Data
Streams (Doctoral Dissertation). Retrieved from:
http://kt.ijs.si/theses/phd_aljaz_osojnik.pdf
"""

class Node:
def __init__(self, att_val, target, weight):
self.att_val = att_val

self.sum_weight = weight
self.sum_target = weight * target
self.sum_sq_target = weight * target * target

def __init__(self, val, label):
self._left_statistics = {}
self._right_statistics = {}
self._left = None
self._right = None
self._cut_point = val
try:
self._left_statistics[0] += 1
self._left_statistics[1] += label
self._left_statistics[2] += label * label
except KeyError:
self._left_statistics[0] = 1
self._left_statistics[1] = label
self._left_statistics[2] = label * label
"""
/**
* Insert a new value into the tree, updating both the sum of values and
* sum of squared values arrays
*/
"""

def insert_value(self, val, label):
if val == self._cut_point:

self._left_statistics[0] += 1
self._left_statistics[1] += label
self._left_statistics[2] += label * label

elif val < self._cut_point:

self._left_statistics[0] += 1
self._left_statistics[1] += label
self._left_statistics[2] += label * label
if self._left is None:
self._left = NumericAttributeRegressionObserver.Node(val, label)

# Incremental implementation of the insert method. Avoiding unecessary
# stack tracing must decrease memory costs
def insert_value(self, att_val, target, weight):
current = self
antecedent = None

while current is not None:
antecedent = current
if math.isclose(att_val, current.att_val):
current.sum_weight += weight
current.sum_target += weight * target
current.sum_sq_target += weight * target * target
return
elif att_val < current.att_val:
current.sum_weight += weight
current.sum_target += weight * target
current.sum_sq_target += weight * target * target

current = current._left
is_right = False
else:
self._left.insert_value(val, label)
current = current._right
is_right = True

# Value was not yet added to the tree
if is_right:
antecedent._right = NumericAttributeRegressionObserver.Node(
att_val, target, weight
)
else:
try:
self._right_statistics[0] += 1
self._right_statistics[1] += label
self._right_statistics[2] += label * label
except KeyError:
self._right_statistics[0] = 1
self._right_statistics[1] = label
self._right_statistics[2] = label * label

if self._right is None:
self._right = NumericAttributeRegressionObserver.Node(val, label)
else:
self._right.insert_value(val, label)

"""
end of class Node
"""
antecedent._left = NumericAttributeRegressionObserver.Node(
att_val, target, weight
)

def __init__(self):
super().__init__()
self._root = None
self._sum_total_left = 0.0
self._sum_total_right = 0.0
self._sum_sq_total_left = 0.0
self._sum_sq_total_right = 0.0
self._count_right_total = 0.0
self._count_left_total = 0.0

def observe_attribute_class(self, att_val, class_val, weight):

if att_val is None:
return

else:
if self._root is None:
self._root = NumericAttributeRegressionObserver.Node(att_val, class_val)
self._root = NumericAttributeRegressionObserver.Node(att_val, class_val, weight)
else:
self._root.insert_value(att_val, class_val)
self._root.insert_value(att_val, class_val, weight)

def probability_of_attribute_value_given_class(self, att_val, class_val):
return 0.0

def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
raise NotImplementedError

self._sum_total_left = 0.0
self._sum_total_right = pre_split_dist[1]
self._sum_sq_total_left = 0.0
self._sum_sq_total_right = pre_split_dist[2]
self._count_left_total = 0.0
self._count_right_total = pre_split_dist[0]
return self.search_for_best_split_option(self._root, None, criterion, att_idx)
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
att_idx, binary_only=True):
self._criterion = criterion
self._pre_split_dist = pre_split_dist
self._att_idx = att_idx

def search_for_best_split_option(self, current_node, current_best_option, criterion, att_idx):
if current_node is None or self._count_right_total == 0:
return current_best_option
if current_node._left is not None:
current_best_option = self.search_for_best_split_option(current_node._left, current_best_option,
criterion, att_idx)
self._sum_total_left += current_node._left_statistics[1]
self._sum_total_right -= current_node._left_statistics[1]
self._sum_sq_total_left += current_node._left_statistics[2]
self._sum_sq_total_right -= current_node._left_statistics[2]
self._count_right_total -= current_node._left_statistics[0]
self._count_left_total += current_node._left_statistics[0]

lhs_dist = {}
rhs_dist = {}
lhs_dist[0] = self._count_left_total
lhs_dist[1] = self._sum_total_left
lhs_dist[2] = self._sum_sq_total_left
rhs_dist[0] = self._count_right_total
rhs_dist[1] = self._sum_total_right
rhs_dist[2] = self._sum_sq_total_right
post_split_dists = [lhs_dist, rhs_dist]
pre_split_dist = [(self._count_left_total + self._count_right_total),
(self._sum_total_left + self._sum_total_right),
(self._sum_sq_total_left + self._sum_sq_total_right)]

merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)

if current_best_option is None or merit > current_best_option.merit:
num_att_binary_test = NumericAttributeBinaryTest(att_idx, current_node._cut_point, True)
current_best_option = AttributeSplitSuggestion(num_att_binary_test, post_split_dists, merit)

if current_node._right is not None:
current_best_option = self.search_for_best_split_option(current_node._right, current_best_option, criterion,
att_idx)

self._sum_total_left -= current_node._left_statistics.get(1)
self._sum_total_right += current_node._left_statistics.get(1)
self._sum_sq_total_left -= current_node._left_statistics.get(2)
self._sum_sq_total_right += current_node._left_statistics.get(2)
self._count_left_total -= current_node._left_statistics.get(0)
self._count_right_total += current_node._left_statistics.get(0)

return current_best_option

def remove_bad_splits(self, criterion, last_check_ratio, last_check_sdr, last_check_e):

self.remove_bad_split_nodes(criterion, self._root, last_check_ratio, last_check_sdr, last_check_e)

def remove_bad_split_nodes(self, criterion, current_node, last_check_ratio, last_check_sdr, last_check_e):
self._aux_sum_weight = 0

# Handles both single-target and multi-target tasks
if np.ndim(pre_split_dist[1]) == 0:
self._aux_sum = 0.0
self._aux_sum_sq = 0.0
else:
self._aux_sum = np.zeros_like(pre_split_dist[1])
self._aux_sum_sq = np.zeros_like(pre_split_dist[2])

candidate = AttributeSplitSuggestion(None, [{}], -float('inf'))

best_split = self._find_best_split(self._root, candidate)

# Reset auxiliary variables
self._criterion = None
self._pre_split_dist = None
self._att_idx = None
self._aux_sum_weight = None
self._aux_sum = None
self._aux_sum_sq = None

return best_split

def _find_best_split(self, node, candidate):
if node._left is not None:
candidate = self._find_best_split(node._left, candidate)
# Left post split distribution
left_dist = {}
left_dist[0] = node.sum_weight + self._aux_sum_weight
left_dist[1] = node.sum_target + self._aux_sum
left_dist[2] = node.sum_sq_target + self._aux_sum_sq

# The right split distribution is calculated as the difference
# between the total distribution (pre split distribution) and
# the left distribution
right_dist = {}
right_dist[0] = self._pre_split_dist[0] - left_dist[0]
right_dist[1] = self._pre_split_dist[1] - left_dist[1]
right_dist[2] = self._pre_split_dist[2] - left_dist[2]

post_split_dists = [left_dist, right_dist]

merit = self._criterion.get_merit_of_split(self._pre_split_dist,
post_split_dists)
if merit > candidate.merit:
num_att_binary_test = NumericAttributeBinaryTest(self._att_idx,
node.att_val,
True)
candidate = AttributeSplitSuggestion(num_att_binary_test,
post_split_dists, merit)

if node._right is not None:
self._aux_sum_weight += node.sum_weight
self._aux_sum += node.sum_target
self._aux_sum_sq += node.sum_sq_target

right_candidate = self._find_best_split(node._right, candidate)

if right_candidate.merit > candidate.merit:
candidate = right_candidate

self._aux_sum_weight -= node.sum_weight
self._aux_sum -= node.sum_target
self._aux_sum_sq -= node.sum_sq_target

return candidate

def remove_bad_splits(self, criterion, last_check_ratio, last_check_sdr, last_check_e,
pre_split_dist):

# Auxiliary variables
self._criterion = criterion
self._pre_split_dist = pre_split_dist
self._last_check_ratio = last_check_ratio
self._last_check_sdr = last_check_sdr
self._last_check_e = last_check_e

self._aux_sum_weight = 0

# Encompass both the single-target and multi-target cases
if np.ndim(pre_split_dist[1]) == 0:
self._aux_sum = 0.0
self._aux_sum_sq = 0.0
else:
self._aux_sum = np.zeros_like(pre_split_dist[1])
self._aux_sum_sq = np.zeros_like(pre_split_dist[2])

self._remove_bad_split_nodes(self._root)

# Reset auxiliary variables
self._criterion = None
self._pre_split_dist = None
self._last_check_ratio = None
self._last_check_sdr = None
self._last_check_e = None
self._aux_sum_weight = None
self._aux_sum = None
self._aux_sum_sq = None

def _remove_bad_split_nodes(self, current_node):
is_bad = False

if current_node is None:
return True

if current_node._left is not None:
is_bad = self.remove_bad_split_nodes(criterion, current_node._left, last_check_ratio,
last_check_sdr, last_check_e)
is_bad = self._remove_bad_split_nodes(current_node._left)

if is_bad:
current_node._left = None

if current_node._right is not None and is_bad:
is_bad = self.remove_bad_split_nodes(criterion, current_node._left, last_check_ratio,
last_check_sdr, last_check_e)
is_bad = self._remove_bad_split_nodes(current_node._right)

if is_bad:
current_node._right = None

if is_bad:
left_stat = [current_node._left_statistics[0], current_node._left_statistics[1],
current_node._left_statistics[2]]
right_stat = [current_node._right_statistics[0], current_node._right_statistics[1],
current_node._right_statistics[2]]

post_split_dists = [left_stat, right_stat]
pre_split_dist = [(current_node._left_statistics.get(0) + current_node._right_statistics.get(0)),
(current_node._left_statistics.get(1) + current_node._right_statistics.get(1)),
(current_node._left_statistics.get(2) + current_node._right_statistics.get(2))]
merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)
if (merit / last_check_sdr) < (last_check_ratio - (2 * last_check_e)):
current_node = None
# Left post split distribution
left_dist = {}
left_dist[0] = current_node.sum_weight + self._aux_sum_weight
left_dist[1] = current_node.sum_target + self._aux_sum
left_dist[2] = current_node.sum_sq_target + self._aux_sum_sq

# The right split distribution is calculated as the difference
# between the total distribution (pre split distribution) and
# the left distribution
right_dist = {}
right_dist[0] = self._pre_split_dist[0] - left_dist[0]
right_dist[1] = self._pre_split_dist[1] - left_dist[1]
right_dist[2] = self._pre_split_dist[2] - left_dist[2]

post_split_dists = [left_dist, right_dist]
merit = self._criterion.get_merit_of_split(self._pre_split_dist, post_split_dists)
if (merit / self._last_check_sdr) < (self._last_check_ratio -
(2 * self._last_check_e)):
return True

return False

0 comments on commit b3ceddd

Please sign in to comment.