from skmultiflow.classification.trees.hoeffding_tree import HoeffdingTree from skmultiflow.classification.core.split_criteria.multilabel_info_gain_split_criterion \ import MutltiLabelInfoGainSplitCriterion from operator import attrgetter from skmultiflow.core.utils.utils import * from skmultiflow.classification.core.attribute_class_observers.gaussian_numeric_attribute_class_observer \ import GaussianNumericAttributeClassObserver from skmultiflow.classification.core.attribute_class_observers.nominal_attribute_class_observer \ import NominalAttributeClassObserver MAJORITY_CLASS = 'mc' class MultilabelHF(HoeffdingTree): class MultilabelInactiveNode(HoeffdingTree.InactiveLearningNode): def __init__(self, initial_class_observations=None, n_observations=None): """ InactiveLearningNode class constructor. """ if n_observations is None: self.n_observations = 0 else: self.n_observations = n_observations super().__init__(initial_class_observations) def learn_from_instance(self, X, y, weight, ht): self.n_observations += 1 for l in range(len(y)): if y[l] == 1: try: self._observed_class_distribution[l] += weight except KeyError: self._observed_class_distribution[l] = weight else: try: self._observed_class_distribution[l] += 0 except KeyError: self._observed_class_distribution[l] = 0 class MultilabelLearningNode(HoeffdingTree.ActiveLearningNode): def __init__(self, initial_class_observations, n_observations): if n_observations is None: self.n_observations = 0 else: self.n_observations = n_observations super().__init__(initial_class_observations) def learn_from_instance(self, X, y, weight, ht): self.n_observations += 1 for l in range(len(y)): if y[l] == 1: try: self._observed_class_distribution[l] += weight except KeyError: self._observed_class_distribution[l] = weight else: try: self._observed_class_distribution[l] += 0 except KeyError: self._observed_class_distribution[l] = 0 for i in range(len(X)): try: obs = self._attribute_observers[i] except KeyError: if i in ht.nominal_attributes: obs = NominalAttributeClassObserver() else: obs = GaussianNumericAttributeClassObserver() self._attribute_observers[i] = obs for l in range(len(y)): obs.observe_attribute_class(X[i], l, weight if y[l] == 1 else 0) def get_weight_seen(self): """Calculate the total weight seen by the node. Returns ------- float Total weight seen. """ return self.n_observations def _new_learning_node(self, initial_class_observations=None, n_observations=None): """Create a new learning node. The type of learning node depends on the tree configuration.""" if initial_class_observations is None: initial_class_observations = {} if self._leaf_prediction == MAJORITY_CLASS: return self.MultilabelLearningNode(initial_class_observations, n_observations) def _deactivate_learning_node(self, to_deactivate: MultilabelLearningNode, parent: HoeffdingTree.SplitNode, parent_branch: int): """Deactivate a learning node. Parameters ---------- to_deactivate: ActiveLearningNode The node to deactivate. parent: SplitNode The node's parent. parent_branch: int Parent node's branch index. """ new_leaf = self.MultilabelInactiveNode(to_deactivate.get_observed_class_distribution()) if parent is None: self._tree_root = new_leaf else: parent.set_child(parent_branch, new_leaf) self._active_leaf_node_cnt -= 1 self._inactive_leaf_node_cnt += 1 def __init__(self, max_byte_size=33554432, memory_estimate_period=1000000, grace_period=200, split_criterion='info_gain', split_confidence=0.0000001, tie_threshold=0.05, binary_split=False, stop_mem_management=False, remove_poor_atts=False, no_preprune=False, leaf_prediction='mc', nb_threshold=0, nominal_attributes=None): super().__init__(max_byte_size, memory_estimate_period, grace_period, split_criterion, split_confidence, tie_threshold, binary_split, stop_mem_management, remove_poor_atts, no_preprune, leaf_prediction, nb_threshold, nominal_attributes) self.n_labels = 1 def _attempt_to_split(self, node: MultilabelLearningNode, parent: HoeffdingTree.SplitNode, parent_idx: int): """Attempt to split a node. If the samples seen so far are not from the same class then: 1. Find split candidates and select the top 2. 2. Compute the Hoeffding bound. 3. If the difference between the top 2 split candidates is larger than the Hoeffding bound: 3.1 Replace the leaf node by a split node. 3.2 Add a new leaf node on each branch of the new split node. 3.3 Update tree's metrics Optional: Disable poor attribute. Depends on the tree's configuration. Parameters ---------- node: ActiveLearningNode The node to evaluate. parent: SplitNode The node's parent. parent_idx: int Parent node's branch index. """ split_criterion = MutltiLabelInfoGainSplitCriterion() best_split_suggestions = node.get_best_split_suggestions(split_criterion, self) best_split_suggestions.sort(key=attrgetter('merit')) should_split = False if len(best_split_suggestions) < 2: should_split = len(best_split_suggestions) > 0 else: hoeffding_bound = self.compute_hoeffding_bound(np.log2(2), self.split_confidence, node.get_weight_seen()) best_suggestion = best_split_suggestions[-1] second_best_suggestion = best_split_suggestions[-2] if (best_suggestion.merit - second_best_suggestion.merit > hoeffding_bound or hoeffding_bound < self.tie_threshold): # best_suggestion.merit > 1e-10 and \ should_split = True if self.remove_poor_atts is not None and self.remove_poor_atts: poor_atts = set() # Scan 1 - add any poor attribute to set for i in range(len(best_split_suggestions)): if best_split_suggestions[i] is not None: split_atts = best_split_suggestions[i].split_test.get_atts_test_depends_on() if len(split_atts) == 1: if best_suggestion.merit - best_split_suggestions[i].merit > hoeffding_bound: poor_atts.add(int(split_atts[0])) # Scan 2 - remove good attributes from set for i in range(len(best_split_suggestions)): if best_split_suggestions[i] is not None: split_atts = best_split_suggestions[i].split_test.get_atts_test_depends_on() if len(split_atts) == 1: if best_suggestion.merit - best_split_suggestions[i].merit < hoeffding_bound: poor_atts.remove(int(split_atts[0])) for poor_att in poor_atts: node.disable_attribute(poor_att) if should_split: split_decision = best_split_suggestions[-1] if split_decision.split_test is None: # Preprune - null wins self._deactivate_learning_node(node, parent, parent_idx) else: new_split = self.new_split_node(split_decision.split_test, node.get_observed_class_distribution()) for i in range(split_decision.num_splits()): new_child = self._new_learning_node(split_decision.resulting_class_distribution_from_split(i), node.n_observations) new_split.set_child(i, new_child) self._active_leaf_node_cnt -= 1 self._decision_node_cnt += 1 self._active_leaf_node_cnt += split_decision.num_splits() if parent is None: self._tree_root = new_split else: parent.set_child(parent_idx, new_split) # Manage memory self.enforce_tracker_limit() def get_n_observations_for_instance(self, X): if self._tree_root is not None: found_node = self._tree_root.filter_instance_to_leaf(X, None, -1) leaf_node = found_node.node if leaf_node is None: leaf_node = found_node.parent return leaf_node.n_observations else: return 0 def predict_proba(self, X): """Predicts probabilities of all label of the X instance(s) Parameters ---------- X: numpy.ndarray of shape (n_samples, n_features) Samples for which we want to predict the labels. Returns ------- list Predicted the probabilities of all the labels for all instances in X. """ r, _ = get_dimensions(X) predictions = [] for i in range(r): votes = self.get_votes_for_instance(X[i]) if votes == {}: # Tree is empty, all target_values equal, default to zero predictions.append([0]*self.n_labels) else: y_proba = [] for l in range(self.n_labels): y_proba.append(votes[l]/self.get_n_observations_for_instance(X[i]) if self.get_n_observations_for_instance(X[i]) != 0 else 0) predictions.append(y_proba) return predictions def predict(self, X): pred = [] Y_proba = self.predict_proba(X) for y_proba in Y_proba: pred.append([1 if (1 - proba <= proba) else 0 for proba in y_proba]) return pred def partial_fit(self, X, y, classes=None, weight=None): """Incrementally trains the model. Train samples (instances) are compossed of X attributes and their corresponding targets y. Tasks performed before training: * Verify instance weight. if not provided, uniform weights (1.0) are assumed. * If more than one instance is passed, loop through X and pass instances one at a time. * Update weight seen by model. Training tasks: * If the tree is empty, create a leaf node as the root. * If the tree is already initialized, find the corresponding leaf for the instance and update the leaf node statistics. * If growth is allowed and the number of instances that the leaf has observed between split attempts exceed the grace period then attempt to split. Parameters ---------- X: numpy.ndarray of shape (n_samples, n_features) Instance attributes. y: array_like Classes (targets) for all samples in X. classes: Not used. weight: float or array-like Instance weight. If not provided, uniform weights are assumed. """ if y is not None: self.n_labels = len(y[0]) if weight is None: weight = np.array([1.0]) row_cnt, _ = get_dimensions(X) wrow_cnt, _ = get_dimensions(weight) if row_cnt != wrow_cnt: weight = [weight[0]] * row_cnt for i in range(row_cnt): if weight[i] != 0.0: self._train_weight_seen_by_model += weight[i] self._partial_fit(X[i], y[i], weight[i])