import copy as cp
import warnings
import numpy as np
from skmultiflow.core import BaseSKMObject, ClassifierMixin, MetaEstimatorMixin
from skmultiflow.lazy import KNNADWINClassifier
from skmultiflow.utils import check_random_state, get_dimensions
def OzaBagging(base_estimator=KNNADWINClassifier(), n_estimators=10,
random_state=None): # pragma: no cover
warnings.warn("'OzaBagging' has been renamed to 'OzaBaggingClassifier' in v0.5.0.\n"
"The old name will be removed in v0.7.0", category=FutureWarning)
return OzaBaggingClassifier(base_estimator=base_estimator,
n_estimators=n_estimators,
random_state=random_state)
class OzaBaggingClassifier(BaseSKMObject, ClassifierMixin, MetaEstimatorMixin):
""" Oza Bagging ensemble classifier.
Parameters
----------
base_estimator: skmultiflow.core.BaseSKMObject or sklearn.BaseEstimator
(default=KNNADWINClassifier) Each member of the ensemble is an instance of
the base estimator.
n_estimators: int (default=10)
The size of the ensemble, in other words, how many classifiers to train.
random_state: int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used by `np.random`.
Raises
------
ValueError: A ValueError is raised if the 'classes' parameter is
not passed in the first partial_fit call.
Notes
-----
Oza Bagging [1]_ is an ensemble learning method introduced by Oza and
Russell in 'Online Bagging and Boosting'. It is an adaptation of the
well-known batch Bagging ensemble method that can effectively handle
data streams.
In the traditional batch Bagging algorithm, M classifiers are trained on
M different datasets, each created by drawing N samples with replacement
from the N-sized training set.
In the online context there is no fixed training dataset, only a stream
of samples, so drawing samples with replacement cannot be performed
directly. The strategy adopted by the Online Bagging algorithm is to
simulate this by training each classifier on every arriving sample K
times, where K is drawn from a binomial distribution. Since the data
stream can be considered infinite, and with infinite samples the binomial
distribution tends to a Poisson(1) distribution, Oza and Russell use
Poisson(1) as a good approximation of 'drawing with replacement'.
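As a rough sketch of that sampling step (illustrative only; ``rng`` and ``k``
below are stand-in names, not attributes of this class), each ensemble member
replays an incoming sample ``k`` times, with ``k`` drawn from Poisson(1):
>>> import numpy as np
>>> rng = np.random.RandomState(42)   # stand-in for the internal random state
>>> for member in range(3):           # one independent draw per ensemble member
...     k = rng.poisson(lam=1.0)      # number of times this member trains on the sample
...     # the member would then call partial_fit on the same sample k times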
References
----------
.. [1] N. C. Oza, “Online Bagging and Boosting,” in 2005 IEEE International Conference
on Systems, Man and Cybernetics, 2005, vol. 3, no. 3, pp. 2340–2345.
Examples
--------
>>> # Imports
>>> from skmultiflow.meta import OzaBaggingClassifier
>>> from skmultiflow.lazy import KNNClassifier
>>> from skmultiflow.data import SEAGenerator
>>> # Setting up the stream
>>> stream = SEAGenerator(1, noise_percentage=0.07)
>>> # Setting up the OzaBagging classifier to work with KNN as base estimator
>>> clf = OzaBaggingClassifier(base_estimator=KNNClassifier(n_neighbors=8,
... max_window_size=2000, leaf_size=30), n_estimators=2)
>>> # Keeping track of sample count and correct prediction count
>>> sample_count = 0
>>> corrects = 0
>>> # Pre training the classifier with 200 samples
>>> X, y = stream.next_sample(200)
>>> clf = clf.partial_fit(X, y, classes=stream.target_values)
>>> for i in range(2000):
... X, y = stream.next_sample()
... pred = clf.predict(X)
... clf = clf.partial_fit(X, y)
... if pred is not None:
... if y[0] == pred[0]:
... corrects += 1
... sample_count += 1
>>>
>>> # Displaying the results
>>> print(str(sample_count) + ' samples analyzed.')
2000 samples analyzed.
>>> print('OzaBaggingClassifier performance: ' + str(corrects / sample_count))
OzaBaggingClassifier performance: 0.9095
"""
def __init__(self, base_estimator=KNNADWINClassifier(), n_estimators=10, random_state=None):
super().__init__()
# default values
self.ensemble = None
self.actual_n_estimators = None
self.classes = None
self._random_state = None # This is the actual random_state object used internally
self.base_estimator = base_estimator
self.n_estimators = n_estimators
self.random_state = random_state
self.__configure()
def __configure(self):
if hasattr(self.base_estimator, "reset"):
self.base_estimator.reset()
self.actual_n_estimators = self.n_estimators
self.ensemble = [cp.deepcopy(self.base_estimator) for _ in range(self.actual_n_estimators)]
self._random_state = check_random_state(self.random_state)
def reset(self):
self.__configure()
return self
def partial_fit(self, X, y, classes=None, sample_weight=None):
"""
Partially (incrementally) fit the model.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
The features to train the model.
y: numpy.ndarray of shape (n_samples)
An array-like with the class labels of all samples in X.
classes: numpy.ndarray, optional (default=None)
Array with all possible/known class labels. This is an optional parameter, except
for the first partial_fit call where it is compulsory.
sample_weight: numpy.ndarray of shape (n_samples), optional (default=None)
Sample weights. If not provided, uniform weights are assumed. Usage varies depending
on the base estimator.
Raises
------
ValueError
A ValueError is raised if the 'classes' parameter is not passed in the first
partial_fit call, or if it is passed in later calls but differs from the
classes passed initially.
Returns
-------
OzaBaggingClassifier
self
Notes
-----
Since this is an ensemble learner, if X and y contain more than one sample,
the algorithm partially fits the model one sample at a time.
Each classifier is trained on each sample a total of K times, where K
is drawn from a Poisson(1) distribution.
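A minimal sketch of a batch update (the names below, such as ``X_batch``, are
illustrative; the five samples are processed internally one at a time):
>>> import numpy as np
>>> from skmultiflow.meta import OzaBaggingClassifier
>>> from skmultiflow.lazy import KNNClassifier
>>> X_batch = np.random.rand(5, 3)            # 5 samples with 3 features each
>>> y_batch = np.random.randint(2, size=5)    # binary labels for the 5 samples
>>> clf = OzaBaggingClassifier(base_estimator=KNNClassifier(), n_estimators=3)
>>> clf = clf.partial_fit(X_batch, y_batch, classes=[0, 1])   # looped over sample by sample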
"""
if self.classes is None:
if classes is None:
raise ValueError("The first partial_fit call should pass all the classes.")
else:
self.classes = classes
if self.classes is not None and classes is not None:
if set(self.classes) == set(classes):
pass
else:
raise ValueError("The classes passed to the partial_fit function"
"differ from those passed earlier.")
self.__adjust_ensemble_size()
r, _ = get_dimensions(X)
for j in range(r):
for i in range(self.actual_n_estimators):
k = self._random_state.poisson()
if k > 0:
for b in range(k):
self.ensemble[i].partial_fit([X[j]], [y[j]], classes, sample_weight)
return self
def __adjust_ensemble_size(self):
if len(self.classes) != len(self.ensemble):
if len(self.classes) > len(self.ensemble):
for i in range(len(self.ensemble), len(self.classes)):
self.ensemble.append(cp.deepcopy(self.base_estimator))
self.actual_n_estimators += 1
def predict(self, X):
""" Predict classes for the passed data.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
The set of data samples to predict the class labels for.
Returns
-------
A numpy.ndarray with all the predictions for the samples in X.
Notes
-----
The predict function aggregates the class probability estimates from all
ensemble members (via predict_proba) and returns, for each sample in X,
the class label with the highest aggregated probability.
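As a rough sketch of that aggregation (the arrays below are made up for
illustration and are not this class's internals):
>>> import numpy as np
>>> member_proba = np.array([[0.6, 0.4],    # member 1: class probabilities for one sample
...                          [0.3, 0.7],    # member 2
...                          [0.7, 0.3]])   # member 3
>>> int(np.argmax(member_proba.sum(axis=0)))   # summed votes -> predicted class label
0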
"""
r, c = get_dimensions(X)
proba = self.predict_proba(X)
predictions = []
if proba is None:
return None
for i in range(r):
predictions.append(np.argmax(proba[i]))
return np.asarray(predictions)
def predict_proba(self, X):
""" Estimates the probability of each sample in X belonging to each of the class-labels.
Parameters
----------
X : numpy.ndarray of shape (n_samples, n_features)
The matrix of samples one wants to predict the class probabilities for.
Returns
-------
A numpy.ndarray of shape (n_samples, n_labels), in which each row is
associated with the X entry of the same index and contains one entry per
known class-label, representing the probability that the i-th sample of X
belongs to that class-label.
Raises
------
ValueError: A ValueError is raised if the number of classes in the base_estimator
learner is larger than the number of classes known to the ensemble.
"""
proba = []
r, c = get_dimensions(X)
try:
for i in range(self.actual_n_estimators):
partial_proba = self.ensemble[i].predict_proba(X)
if len(partial_proba[0]) > max(self.classes) + 1:
raise ValueError("The number of classes in the base learner "
"is larger than in the ensemble.")
if len(proba) < 1:
for n in range(r):
proba.append([0.0 for _ in partial_proba[n]])
for n in range(r):
for k in range(len(partial_proba[n])):
try:
proba[n][k] += partial_proba[n][k]
except IndexError:
proba[n].append(partial_proba[n][k])
except ValueError:
return np.zeros((r, 1))
except TypeError:
return np.zeros((r, 1))
# normalizing probabilities
sum_proba = []
for k in range(r):
sum_proba.append(np.sum(proba[k]))
aux = []
for i in range(len(proba)):
if sum_proba[i] > 0.:
aux.append([x / sum_proba[i] for x in proba[i]])
else:
aux.append(proba[i])
return np.asarray(aux)