Add streamline regression attribute observer

scikit-multiflow · Apr 22, 2020 · b3ceddd · b3ceddd
1 parent e4ac207
commit b3ceddd
Show file tree

Hide file tree

Showing 6 changed files with 198 additions and 295 deletions.
diff --git a/src/skmultiflow/trees/attribute_observer/__init__.py b/src/skmultiflow/trees/attribute_observer/__init__.py
@@ -5,9 +5,7 @@
 from .numeric_attribute_class_observer_binary_tree import NumericAttributeClassObserverBinaryTree
 from .numeric_attribute_class_observer_gaussian import NumericAttributeClassObserverGaussian
 from .numeric_attribute_regression_observer import NumericAttributeRegressionObserver
-from .numeric_attribute_regression_observer_multi_target import NumericAttributeRegressionObserverMultiTarget
 
 __all__ = ["AttributeClassObserver", "AttributeClassObserverNull", "NominalAttributeClassObserver",
            "NominalAttributeRegressionObserver", "NumericAttributeClassObserverBinaryTree",
-           "NumericAttributeClassObserverGaussian", "NumericAttributeRegressionObserver",
-           "NumericAttributeRegressionObserverMultiTarget"]
+           "NumericAttributeClassObserverGaussian", "NumericAttributeRegressionObserver"]
diff --git a/src/skmultiflow/trees/attribute_observer/numeric_attribute_regression_observer.py b/src/skmultiflow/trees/attribute_observer/numeric_attribute_regression_observer.py
@@ -1,182 +1,231 @@
-from skmultiflow.trees.attribute_observer import NumericAttributeClassObserverBinaryTree
+import math
+import numpy as np
+from skmultiflow.trees.attribute_observer import AttributeClassObserver
 from skmultiflow.trees.attribute_test import NumericAttributeBinaryTest
 from skmultiflow.trees.attribute_split_suggestion import AttributeSplitSuggestion
 
 
-class NumericAttributeRegressionObserver(NumericAttributeClassObserverBinaryTree):
-    """ Class for observing the data distribution for a numeric attribute for regression.
+class NumericAttributeRegressionObserver(AttributeClassObserver):
+    """iSoup-Tree's Extended Binary Search Tree (E-BST)
 
+    This class implements the Extended Binary Search Tree (E-BST)
+    structure, using the variant employed by Osojnik et al. [1]_ in the
+    iSOUP-Tree algorithm. This structure is employed to observe the target
+    space distribution.
+
+    In this variant, only the left branch statistics are stored.
+
+    References
+    ----------
+    .. [1] Osojnik, Aljaž. 2017. Structured output prediction on Data
+       Streams (Doctoral Dissertation). Retrieved from:
+       http://kt.ijs.si/theses/phd_aljaz_osojnik.pdf
     """
 
     class Node:
+        def __init__(self, att_val, target, weight):
+            self.att_val = att_val
+
+            self.sum_weight = weight
+            self.sum_target = weight * target
+            self.sum_sq_target = weight * target * target
 
-        def __init__(self, val, label):
-            self._left_statistics = {}
-            self._right_statistics = {}
             self._left = None
             self._right = None
-            self._cut_point = val
-            try:
-                self._left_statistics[0] += 1
-                self._left_statistics[1] += label
-                self._left_statistics[2] += label * label
-            except KeyError:
-                self._left_statistics[0] = 1
-                self._left_statistics[1] = label
-                self._left_statistics[2] = label * label
-        """
-         /**
-         * Insert a new value into the tree, updating both the sum of values and
-         * sum of squared values arrays
-         */
-
-        """
-
-        def insert_value(self, val, label):
-            if val == self._cut_point:
-
-                self._left_statistics[0] += 1
-                self._left_statistics[1] += label
-                self._left_statistics[2] += label * label
-
-            elif val < self._cut_point:
-
-                self._left_statistics[0] += 1
-                self._left_statistics[1] += label
-                self._left_statistics[2] += label * label
-                if self._left is None:
-                    self._left = NumericAttributeRegressionObserver.Node(val, label)
+
+        # Incremental implementation of the insert method. Avoiding unecessary
+        # stack tracing must decrease memory costs
+        def insert_value(self, att_val, target, weight):
+            current = self
+            antecedent = None
+
+            while current is not None:
+                antecedent = current
+                if math.isclose(att_val, current.att_val):
+                    current.sum_weight += weight
+                    current.sum_target += weight * target
+                    current.sum_sq_target += weight * target * target
+                    return
+                elif att_val < current.att_val:
+                    current.sum_weight += weight
+                    current.sum_target += weight * target
+                    current.sum_sq_target += weight * target * target
+
+                    current = current._left
+                    is_right = False
                 else:
-                    self._left.insert_value(val, label)
+                    current = current._right
+                    is_right = True
+
+            # Value was not yet added to the tree
+            if is_right:
+                antecedent._right = NumericAttributeRegressionObserver.Node(
+                    att_val, target, weight
+                )
             else:
-                try:
-                    self._right_statistics[0] += 1
-                    self._right_statistics[1] += label
-                    self._right_statistics[2] += label * label
-                except KeyError:
-                    self._right_statistics[0] = 1
-                    self._right_statistics[1] = label
-                    self._right_statistics[2] = label * label
-
-                if self._right is None:
-                    self._right = NumericAttributeRegressionObserver.Node(val, label)
-                else:
-                    self._right.insert_value(val, label)
-
-    """
-    end of class Node
-
-    """
+                antecedent._left = NumericAttributeRegressionObserver.Node(
+                    att_val, target, weight
+                )
 
     def __init__(self):
         super().__init__()
         self._root = None
-        self._sum_total_left = 0.0
-        self._sum_total_right = 0.0
-        self._sum_sq_total_left = 0.0
-        self._sum_sq_total_right = 0.0
-        self._count_right_total = 0.0
-        self._count_left_total = 0.0
 
     def observe_attribute_class(self, att_val, class_val, weight):
-
         if att_val is None:
             return
-
         else:
             if self._root is None:
-                self._root = NumericAttributeRegressionObserver.Node(att_val, class_val)
+                self._root = NumericAttributeRegressionObserver.Node(att_val, class_val, weight)
             else:
-                self._root.insert_value(att_val, class_val)
+                self._root.insert_value(att_val, class_val, weight)
 
     def probability_of_attribute_value_given_class(self, att_val, class_val):
-        return 0.0
-
-    def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist, att_idx, binary_only):
+        raise NotImplementedError
 
-        self._sum_total_left = 0.0
-        self._sum_total_right = pre_split_dist[1]
-        self._sum_sq_total_left = 0.0
-        self._sum_sq_total_right = pre_split_dist[2]
-        self._count_left_total = 0.0
-        self._count_right_total = pre_split_dist[0]
-        return self.search_for_best_split_option(self._root, None, criterion, att_idx)
+    def get_best_evaluated_split_suggestion(self, criterion, pre_split_dist,
+                                            att_idx, binary_only=True):
+        self._criterion = criterion
+        self._pre_split_dist = pre_split_dist
+        self._att_idx = att_idx
 
-    def search_for_best_split_option(self, current_node, current_best_option, criterion, att_idx):
-        if current_node is None or self._count_right_total == 0:
-            return current_best_option
-        if current_node._left is not None:
-            current_best_option = self.search_for_best_split_option(current_node._left, current_best_option,
-                                                                    criterion, att_idx)
-        self._sum_total_left += current_node._left_statistics[1]
-        self._sum_total_right -= current_node._left_statistics[1]
-        self._sum_sq_total_left += current_node._left_statistics[2]
-        self._sum_sq_total_right -= current_node._left_statistics[2]
-        self._count_right_total -= current_node._left_statistics[0]
-        self._count_left_total += current_node._left_statistics[0]
-
-        lhs_dist = {}
-        rhs_dist = {}
-        lhs_dist[0] = self._count_left_total
-        lhs_dist[1] = self._sum_total_left
-        lhs_dist[2] = self._sum_sq_total_left
-        rhs_dist[0] = self._count_right_total
-        rhs_dist[1] = self._sum_total_right
-        rhs_dist[2] = self._sum_sq_total_right
-        post_split_dists = [lhs_dist, rhs_dist]
-        pre_split_dist = [(self._count_left_total + self._count_right_total),
-                          (self._sum_total_left + self._sum_total_right),
-                          (self._sum_sq_total_left + self._sum_sq_total_right)]
-
-        merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)
-
-        if current_best_option is None or merit > current_best_option.merit:
-            num_att_binary_test = NumericAttributeBinaryTest(att_idx, current_node._cut_point, True)
-            current_best_option = AttributeSplitSuggestion(num_att_binary_test, post_split_dists, merit)
-
-        if current_node._right is not None:
-            current_best_option = self.search_for_best_split_option(current_node._right, current_best_option, criterion,
-                                                                    att_idx)
-
-        self._sum_total_left -= current_node._left_statistics.get(1)
-        self._sum_total_right += current_node._left_statistics.get(1)
-        self._sum_sq_total_left -= current_node._left_statistics.get(2)
-        self._sum_sq_total_right += current_node._left_statistics.get(2)
-        self._count_left_total -= current_node._left_statistics.get(0)
-        self._count_right_total += current_node._left_statistics.get(0)
-
-        return current_best_option
-
-    def remove_bad_splits(self, criterion, last_check_ratio, last_check_sdr, last_check_e):
-
-        self.remove_bad_split_nodes(criterion, self._root, last_check_ratio, last_check_sdr, last_check_e)
-
-    def remove_bad_split_nodes(self, criterion, current_node, last_check_ratio, last_check_sdr, last_check_e):
+        self._aux_sum_weight = 0
 
+        # Handles both single-target and multi-target tasks
+        if np.ndim(pre_split_dist[1]) == 0:
+            self._aux_sum = 0.0
+            self._aux_sum_sq = 0.0
+        else:
+            self._aux_sum = np.zeros_like(pre_split_dist[1])
+            self._aux_sum_sq = np.zeros_like(pre_split_dist[2])
+
+        candidate = AttributeSplitSuggestion(None, [{}], -float('inf'))
+
+        best_split = self._find_best_split(self._root, candidate)
+
+        # Reset auxiliary variables
+        self._criterion = None
+        self._pre_split_dist = None
+        self._att_idx = None
+        self._aux_sum_weight = None
+        self._aux_sum = None
+        self._aux_sum_sq = None
+
+        return best_split
+
+    def _find_best_split(self, node, candidate):
+        if node._left is not None:
+            candidate = self._find_best_split(node._left, candidate)
+        # Left post split distribution
+        left_dist = {}
+        left_dist[0] = node.sum_weight + self._aux_sum_weight
+        left_dist[1] = node.sum_target + self._aux_sum
+        left_dist[2] = node.sum_sq_target + self._aux_sum_sq
+
+        # The right split distribution is calculated as the difference
+        # between the total distribution (pre split distribution) and
+        # the left distribution
+        right_dist = {}
+        right_dist[0] = self._pre_split_dist[0] - left_dist[0]
+        right_dist[1] = self._pre_split_dist[1] - left_dist[1]
+        right_dist[2] = self._pre_split_dist[2] - left_dist[2]
+
+        post_split_dists = [left_dist, right_dist]
+
+        merit = self._criterion.get_merit_of_split(self._pre_split_dist,
+                                                   post_split_dists)
+        if merit > candidate.merit:
+            num_att_binary_test = NumericAttributeBinaryTest(self._att_idx,
+                                                             node.att_val,
+                                                             True)
+            candidate = AttributeSplitSuggestion(num_att_binary_test,
+                                                 post_split_dists, merit)
+
+        if node._right is not None:
+            self._aux_sum_weight += node.sum_weight
+            self._aux_sum += node.sum_target
+            self._aux_sum_sq += node.sum_sq_target
+
+            right_candidate = self._find_best_split(node._right, candidate)
+
+            if right_candidate.merit > candidate.merit:
+                candidate = right_candidate
+
+            self._aux_sum_weight -= node.sum_weight
+            self._aux_sum -= node.sum_target
+            self._aux_sum_sq -= node.sum_sq_target
+
+        return candidate
+
+    def remove_bad_splits(self, criterion, last_check_ratio, last_check_sdr, last_check_e,
+                          pre_split_dist):
+
+        # Auxiliary variables
+        self._criterion = criterion
+        self._pre_split_dist = pre_split_dist
+        self._last_check_ratio = last_check_ratio
+        self._last_check_sdr = last_check_sdr
+        self._last_check_e = last_check_e
+
+        self._aux_sum_weight = 0
+
+        # Encompass both the single-target and multi-target cases
+        if np.ndim(pre_split_dist[1]) == 0:
+            self._aux_sum = 0.0
+            self._aux_sum_sq = 0.0
+        else:
+            self._aux_sum = np.zeros_like(pre_split_dist[1])
+            self._aux_sum_sq = np.zeros_like(pre_split_dist[2])
+
+        self._remove_bad_split_nodes(self._root)
+
+        # Reset auxiliary variables
+        self._criterion = None
+        self._pre_split_dist = None
+        self._last_check_ratio = None
+        self._last_check_sdr = None
+        self._last_check_e = None
+        self._aux_sum_weight = None
+        self._aux_sum = None
+        self._aux_sum_sq = None
+
+    def _remove_bad_split_nodes(self, current_node):
         is_bad = False
 
         if current_node is None:
             return True
 
         if current_node._left is not None:
-            is_bad = self.remove_bad_split_nodes(criterion, current_node._left, last_check_ratio,
-                                                 last_check_sdr, last_check_e)
+            is_bad = self._remove_bad_split_nodes(current_node._left)
+
+            if is_bad:
+                current_node._left = None
+
         if current_node._right is not None and is_bad:
-            is_bad = self.remove_bad_split_nodes(criterion, current_node._left, last_check_ratio,
-                                                 last_check_sdr, last_check_e)
+            is_bad = self._remove_bad_split_nodes(current_node._right)
+
+            if is_bad:
+                current_node._right = None
+
         if is_bad:
-            left_stat = [current_node._left_statistics[0], current_node._left_statistics[1],
-                         current_node._left_statistics[2]]
-            right_stat = [current_node._right_statistics[0], current_node._right_statistics[1],
-                          current_node._right_statistics[2]]
-
-            post_split_dists = [left_stat, right_stat]
-            pre_split_dist = [(current_node._left_statistics.get(0) + current_node._right_statistics.get(0)),
-                               (current_node._left_statistics.get(1) + current_node._right_statistics.get(1)),
-                               (current_node._left_statistics.get(2) + current_node._right_statistics.get(2))]
-            merit = criterion.get_merit_of_split(pre_split_dist, post_split_dists)
-            if (merit / last_check_sdr) < (last_check_ratio - (2 * last_check_e)):
-                current_node = None
+            # Left post split distribution
+            left_dist = {}
+            left_dist[0] = current_node.sum_weight + self._aux_sum_weight
+            left_dist[1] = current_node.sum_target + self._aux_sum
+            left_dist[2] = current_node.sum_sq_target + self._aux_sum_sq
+
+            # The right split distribution is calculated as the difference
+            # between the total distribution (pre split distribution) and
+            # the left distribution
+            right_dist = {}
+            right_dist[0] = self._pre_split_dist[0] - left_dist[0]
+            right_dist[1] = self._pre_split_dist[1] - left_dist[1]
+            right_dist[2] = self._pre_split_dist[2] - left_dist[2]
+
+            post_split_dists = [left_dist, right_dist]
+            merit = self._criterion.get_merit_of_split(self._pre_split_dist, post_split_dists)
+            if (merit / self._last_check_sdr) < (self._last_check_ratio -
+                                                 (2 * self._last_check_e)):
                 return True
 
         return False