-
Notifications
You must be signed in to change notification settings - Fork 182
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add streamline regression attribute observer
- Loading branch information
1 parent
e4ac207
commit b3ceddd
Showing
6 changed files
with
198 additions
and
295 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
333 changes: 191 additions & 142 deletions
333
src/skmultiflow/trees/attribute_observer/numeric_attribute_regression_observer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,182 +1,231 @@ | ||
from skmultiflow.trees.attribute_observer import NumericAttributeClassObserverBinaryTree | ||
import math | ||
import numpy as np | ||
from skmultiflow.trees.attribute_observer import AttributeClassObserver | ||
from skmultiflow.trees.attribute_test import NumericAttributeBinaryTest | ||
from skmultiflow.trees.attribute_split_suggestion import AttributeSplitSuggestion | ||
|
||
|
||
class NumericAttributeRegressionObserver(NumericAttributeClassObserverBinaryTree): | ||
""" Class for observing the data distribution for a numeric attribute for regression. | ||
class NumericAttributeRegressionObserver(AttributeClassObserver): | ||
"""iSoup-Tree's Extended Binary Search Tree (E-BST) | ||
This class implements the Extended Binary Search Tree (E-BST) | ||
structure, using the variant employed by Osojnik et al. [1]_ in the | ||
iSOUP-Tree algorithm. This structure is employed to observe the target | ||
space distribution. | ||
In this variant, only the left branch statistics are stored. | ||
References | ||
---------- | ||
.. [1] Osojnik, Aljaž. 2017. Structured output prediction on Data | ||
Streams (Doctoral Dissertation). Retrieved from: | ||
http://kt.ijs.si/theses/phd_aljaz_osojnik.pdf | ||
""" | ||
|
||
class Node: | ||
def __init__(self, att_val, target, weight): | ||
self.att_val = att_val | ||
|
||
self.sum_weight = weight | ||
self.sum_target = weight * target | ||
self.sum_sq_target = weight * target * target | ||
|
||
def __init__(self, val, label): | ||
self._left_statistics = {} | ||
self._right_statistics = {} | ||
self._left = None | ||
self._right = None | ||
self._cut_point = val | ||
try: | ||
self._left_statistics[0] += 1 | ||
self._left_statistics[1] += label | ||
self._left_statistics[2] += label * label | ||
except KeyError: | ||
self._left_statistics[0] = 1 | ||
self._left_statistics[1] = label | ||
self._left_statistics[2] = label * label | ||
""" | ||
/** | ||
* Insert a new value into the tree, updating both the sum of values and | ||
* sum of squared values arrays | ||
*/ | ||
""" | ||
|
||
def insert_value(self, val, label): | ||
if val == self._cut_point: | ||
|
||
self._left_statistics[0] += 1 | ||
self._left_statistics[1] += label | ||
self._left_statistics[2] += label * label | ||
|
||
elif val < self._cut_point: | ||
|
||
self._left_statistics[0] += 1 | ||
self._left_statistics[1] += label | ||
self._left_statistics[2] += label * label | ||
if self._left is None: | ||
self._left = NumericAttributeRegressionObserver.Node(val, label) | ||
|
||
# Incremental implementation of the insert method. Avoiding unecessary | ||
# stack tracing must decrease memory costs | ||
def insert_value(self, att_val, target, weight): | ||
current = self | ||
antecedent = None | ||
|
||
while current is not None: | ||
antecedent = current | ||
if math.isclose(att_val, current.att_val): | ||
current.sum_weight += weight | ||
current.sum_target += weight * target | ||
current.sum_sq_target += weight * target * target | ||
return | ||
elif att_val < current.att_val: | ||
current.sum_weight += weight | ||
current.sum_target += weight * target | ||
current.sum_sq_target += weight * target * target | ||
|
||
current = current._left | ||
is_right = False | ||
else: | ||
self._left.insert_value(val, label) | ||
current = current._right | ||
is_right = True | ||
|
||
# Value was not yet added to the tree | ||
if is_right: | ||
antecedent._right = NumericAttributeRegressionObserver.Node( | ||
att_val, target, weight | ||
) | ||
else: | ||
try: | ||
self._right_statistics[0] += 1 | ||
self._right_statistics[1] += label | ||
self._right_statistics[2] += label * label | ||
except KeyError: | ||
self._right_statistics[0] = 1 | ||
self._right_statistics[1] = label | ||
self._right_statistics[2] = label * label | ||
|
||
if self._right is None: | ||
self._right = NumericAttributeRegressionObserver.Node(val, label) | ||
else: | ||
self._right.insert_value(val, label) | ||
|
||
""" | ||
end of class Node | ||
""" | ||
antecedent._left = NumericAttributeRegressionObserver.Node( | ||
att_val, target, weight | ||
) | ||
|
||
def __init__(self): | ||
super().__init__() | ||
self._root = None | ||
self._sum_total_left = 0.0 | ||
self._sum_total_right = 0.0 | ||
self._sum_sq_total_left = 0.0 | ||
self._sum_sq_total_right = 0.0 | ||
self._count_right_total = 0.0 | ||
self._count_left_total = 0.0 | ||
|
||
def observe_attribute_class(self, att_val, class_val, weight): | ||
|
||
if att_val is None: | ||
return | ||
|
||
else: | ||
if self._root is None: | ||
self._root = NumericAttributeRegressionObserver.Node(att_val, class_val) | ||
self._root = NumericAttributeRegressionObserver.Node(att_val, class_val, weight) | ||
else: | ||
self._root.insert_value(att_val, class_val) | ||
self._root.insert_value(att_val, class_val, weight) | ||
|
||
def probability_of_attribute_value_given_class(self, att_val, class_val): | ||
return 0.0 | ||
|
||
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only): | ||
raise NotImplementedError | ||
|
||
self._sum_total_left = 0.0 | ||
self._sum_total_right = pre_split_dist[1] | ||
self._sum_sq_total_left = 0.0 | ||
self._sum_sq_total_right = pre_split_dist[2] | ||
self._count_left_total = 0.0 | ||
self._count_right_total = pre_split_dist[0] | ||
return self.search_for_best_split_option(self._root, None, criterion, att_idx) | ||
def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, | ||
att_idx, binary_only=True): | ||
self._criterion = criterion | ||
self._pre_split_dist = pre_split_dist | ||
self._att_idx = att_idx | ||
|
||
def search_for_best_split_option(self, current_node, current_best_option, criterion, att_idx): | ||
if current_node is None or self._count_right_total == 0: | ||
return current_best_option | ||
if current_node._left is not None: | ||
current_best_option = self.search_for_best_split_option(current_node._left, current_best_option, | ||
criterion, att_idx) | ||
self._sum_total_left += current_node._left_statistics[1] | ||
self._sum_total_right -= current_node._left_statistics[1] | ||
self._sum_sq_total_left += current_node._left_statistics[2] | ||
self._sum_sq_total_right -= current_node._left_statistics[2] | ||
self._count_right_total -= current_node._left_statistics[0] | ||
self._count_left_total += current_node._left_statistics[0] | ||
|
||
lhs_dist = {} | ||
rhs_dist = {} | ||
lhs_dist[0] = self._count_left_total | ||
lhs_dist[1] = self._sum_total_left | ||
lhs_dist[2] = self._sum_sq_total_left | ||
rhs_dist[0] = self._count_right_total | ||
rhs_dist[1] = self._sum_total_right | ||
rhs_dist[2] = self._sum_sq_total_right | ||
post_split_dists = [lhs_dist, rhs_dist] | ||
pre_split_dist = [(self._count_left_total + self._count_right_total), | ||
(self._sum_total_left + self._sum_total_right), | ||
(self._sum_sq_total_left + self._sum_sq_total_right)] | ||
|
||
merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists) | ||
|
||
if current_best_option is None or merit > current_best_option.merit: | ||
num_att_binary_test = NumericAttributeBinaryTest(att_idx, current_node._cut_point, True) | ||
current_best_option = AttributeSplitSuggestion(num_att_binary_test, post_split_dists, merit) | ||
|
||
if current_node._right is not None: | ||
current_best_option = self.search_for_best_split_option(current_node._right, current_best_option, criterion, | ||
att_idx) | ||
|
||
self._sum_total_left -= current_node._left_statistics.get(1) | ||
self._sum_total_right += current_node._left_statistics.get(1) | ||
self._sum_sq_total_left -= current_node._left_statistics.get(2) | ||
self._sum_sq_total_right += current_node._left_statistics.get(2) | ||
self._count_left_total -= current_node._left_statistics.get(0) | ||
self._count_right_total += current_node._left_statistics.get(0) | ||
|
||
return current_best_option | ||
|
||
def remove_bad_splits(self, criterion, last_check_ratio, last_check_sdr, last_check_e): | ||
|
||
self.remove_bad_split_nodes(criterion, self._root, last_check_ratio, last_check_sdr, last_check_e) | ||
|
||
def remove_bad_split_nodes(self, criterion, current_node, last_check_ratio, last_check_sdr, last_check_e): | ||
self._aux_sum_weight = 0 | ||
|
||
# Handles both single-target and multi-target tasks | ||
if np.ndim(pre_split_dist[1]) == 0: | ||
self._aux_sum = 0.0 | ||
self._aux_sum_sq = 0.0 | ||
else: | ||
self._aux_sum = np.zeros_like(pre_split_dist[1]) | ||
self._aux_sum_sq = np.zeros_like(pre_split_dist[2]) | ||
|
||
candidate = AttributeSplitSuggestion(None, [{}], -float('inf')) | ||
|
||
best_split = self._find_best_split(self._root, candidate) | ||
|
||
# Reset auxiliary variables | ||
self._criterion = None | ||
self._pre_split_dist = None | ||
self._att_idx = None | ||
self._aux_sum_weight = None | ||
self._aux_sum = None | ||
self._aux_sum_sq = None | ||
|
||
return best_split | ||
|
||
def _find_best_split(self, node, candidate): | ||
if node._left is not None: | ||
candidate = self._find_best_split(node._left, candidate) | ||
# Left post split distribution | ||
left_dist = {} | ||
left_dist[0] = node.sum_weight + self._aux_sum_weight | ||
left_dist[1] = node.sum_target + self._aux_sum | ||
left_dist[2] = node.sum_sq_target + self._aux_sum_sq | ||
|
||
# The right split distribution is calculated as the difference | ||
# between the total distribution (pre split distribution) and | ||
# the left distribution | ||
right_dist = {} | ||
right_dist[0] = self._pre_split_dist[0] - left_dist[0] | ||
right_dist[1] = self._pre_split_dist[1] - left_dist[1] | ||
right_dist[2] = self._pre_split_dist[2] - left_dist[2] | ||
|
||
post_split_dists = [left_dist, right_dist] | ||
|
||
merit = self._criterion.get_merit_of_split(self._pre_split_dist, | ||
post_split_dists) | ||
if merit > candidate.merit: | ||
num_att_binary_test = NumericAttributeBinaryTest(self._att_idx, | ||
node.att_val, | ||
True) | ||
candidate = AttributeSplitSuggestion(num_att_binary_test, | ||
post_split_dists, merit) | ||
|
||
if node._right is not None: | ||
self._aux_sum_weight += node.sum_weight | ||
self._aux_sum += node.sum_target | ||
self._aux_sum_sq += node.sum_sq_target | ||
|
||
right_candidate = self._find_best_split(node._right, candidate) | ||
|
||
if right_candidate.merit > candidate.merit: | ||
candidate = right_candidate | ||
|
||
self._aux_sum_weight -= node.sum_weight | ||
self._aux_sum -= node.sum_target | ||
self._aux_sum_sq -= node.sum_sq_target | ||
|
||
return candidate | ||
|
||
def remove_bad_splits(self, criterion, last_check_ratio, last_check_sdr, last_check_e, | ||
pre_split_dist): | ||
|
||
# Auxiliary variables | ||
self._criterion = criterion | ||
self._pre_split_dist = pre_split_dist | ||
self._last_check_ratio = last_check_ratio | ||
self._last_check_sdr = last_check_sdr | ||
self._last_check_e = last_check_e | ||
|
||
self._aux_sum_weight = 0 | ||
|
||
# Encompass both the single-target and multi-target cases | ||
if np.ndim(pre_split_dist[1]) == 0: | ||
self._aux_sum = 0.0 | ||
self._aux_sum_sq = 0.0 | ||
else: | ||
self._aux_sum = np.zeros_like(pre_split_dist[1]) | ||
self._aux_sum_sq = np.zeros_like(pre_split_dist[2]) | ||
|
||
self._remove_bad_split_nodes(self._root) | ||
|
||
# Reset auxiliary variables | ||
self._criterion = None | ||
self._pre_split_dist = None | ||
self._last_check_ratio = None | ||
self._last_check_sdr = None | ||
self._last_check_e = None | ||
self._aux_sum_weight = None | ||
self._aux_sum = None | ||
self._aux_sum_sq = None | ||
|
||
def _remove_bad_split_nodes(self, current_node): | ||
is_bad = False | ||
|
||
if current_node is None: | ||
return True | ||
|
||
if current_node._left is not None: | ||
is_bad = self.remove_bad_split_nodes(criterion, current_node._left, last_check_ratio, | ||
last_check_sdr, last_check_e) | ||
is_bad = self._remove_bad_split_nodes(current_node._left) | ||
|
||
if is_bad: | ||
current_node._left = None | ||
|
||
if current_node._right is not None and is_bad: | ||
is_bad = self.remove_bad_split_nodes(criterion, current_node._left, last_check_ratio, | ||
last_check_sdr, last_check_e) | ||
is_bad = self._remove_bad_split_nodes(current_node._right) | ||
|
||
if is_bad: | ||
current_node._right = None | ||
|
||
if is_bad: | ||
left_stat = [current_node._left_statistics[0], current_node._left_statistics[1], | ||
current_node._left_statistics[2]] | ||
right_stat = [current_node._right_statistics[0], current_node._right_statistics[1], | ||
current_node._right_statistics[2]] | ||
|
||
post_split_dists = [left_stat, right_stat] | ||
pre_split_dist = [(current_node._left_statistics.get(0) + current_node._right_statistics.get(0)), | ||
(current_node._left_statistics.get(1) + current_node._right_statistics.get(1)), | ||
(current_node._left_statistics.get(2) + current_node._right_statistics.get(2))] | ||
merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists) | ||
if (merit / last_check_sdr) < (last_check_ratio - (2 * last_check_e)): | ||
current_node = None | ||
# Left post split distribution | ||
left_dist = {} | ||
left_dist[0] = current_node.sum_weight + self._aux_sum_weight | ||
left_dist[1] = current_node.sum_target + self._aux_sum | ||
left_dist[2] = current_node.sum_sq_target + self._aux_sum_sq | ||
|
||
# The right split distribution is calculated as the difference | ||
# between the total distribution (pre split distribution) and | ||
# the left distribution | ||
right_dist = {} | ||
right_dist[0] = self._pre_split_dist[0] - left_dist[0] | ||
right_dist[1] = self._pre_split_dist[1] - left_dist[1] | ||
right_dist[2] = self._pre_split_dist[2] - left_dist[2] | ||
|
||
post_split_dists = [left_dist, right_dist] | ||
merit = self._criterion.get_merit_of_split(self._pre_split_dist, post_split_dists) | ||
if (merit / self._last_check_sdr) < (self._last_check_ratio - | ||
(2 * self._last_check_e)): | ||
return True | ||
|
||
return False |
Oops, something went wrong.