scikit-learn-contrib · balvisio · Sep 18, 2022 · Sep 18, 2022 · Sep 18, 2022 · Sep 19, 2022
diff --git a/README.rst b/README.rst
@@ -185,6 +185,7 @@ Below is a list of the methods currently implemented in this module.
     7. ADASYN - Adaptive synthetic sampling approach for imbalanced learning [15]_
     8. KMeans-SMOTE [17]_
     9. ROSE - Random OverSampling Examples [19]_
+    10. MLSMOTE - Multilabel Synthetic Minority Over-sampling Technique [20]_
 
 * Over-sampling followed by under-sampling
     1. SMOTE + Tomek links [12]_
@@ -243,3 +244,5 @@ References:
 .. [18] : Seiffert, C., Khoshgoftaar, T. M., Van Hulse, J., & Napolitano, A. "RUSBoost: A hybrid approach to alleviating class imbalance." IEEE Transactions on Systems, Man, and Cybernetics-Part A: Systems and Humans 40.1 (2010): 185-197.
 
 .. [19] : Menardi, G., Torelli, N.: "Training and assessing classification rules with unbalanced data", Data Mining and Knowledge Discovery,  28, (2014): 92–122
+
+.. [20] : Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera, Francisco. (2015). MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019.
diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py
@@ -11,6 +11,7 @@
 from ._smote import SVMSMOTE
 from ._smote import SMOTENC
 from ._smote import SMOTEN
+from ._mlsmote import MLSMOTE
 
 __all__ = [
     "ADASYN",
@@ -21,4 +22,5 @@
     "SVMSMOTE",
     "SMOTENC",
     "SMOTEN",
+    "MLSMOTE",
 ]
diff --git a/imblearn/over_sampling/_mlsmote.py b/imblearn/over_sampling/_mlsmote.py
@@ -0,0 +1,365 @@
+"""Class to perfrom over-sampling using MLSMOTE."""
+
+from itertools import combinations
+import numpy as np
+from scipy import sparse
+
+from sklearn.utils import check_random_state
+
+
+class MLSMOTE:
+    """Over-sampling using MLSMOTE.
+
+    Parameters
+    ----------
+    sampling_strategy: 'ranking', 'union' or 'intersection' default: 'ranking'
+        Strategy to generate labelsets
+
+    k_neighbors : int or object, default=5
+        If ``int``, number of nearest neighbors used to construct synthetic
+        samples.
+
+    categorical_features : ndarray of shape (n_cat_features,) or (n_features,)
+        Specifies which features are categorical. Can either be:
+
+        - array of indices specifying the categorical features;
+        - mask array of shape (n_features, ) and ``bool`` dtype for which
+          ``True`` indicates the categorical features.
+
+    Notes
+    -----
+    The implementation is based on [1]_.
+
+    References
+    ----------
+    .. [1] Charte, F. & Rivera Rivas, Antonio & Del Jesus, María José & Herrera,
+           Francisco. (2015). "MLSMOTE: Approaching imbalanced multilabel learning
+           through synthetic instance generation."
+           Knowledge-Based Systems. -. 10.1016/j.knosys.2015.07.019.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_multilabel_classification
+    >>> from imblearn.over_sampling import MLSMOTE
+    >>> X, y = make_multilabel_classification(n_classes=5, n_features=20,
+    ... random_state=42)
+    >>> print("Original Dataset")
+    Original Dataset
+    >>> print(f"Samples: {X.shape[0]}")
+    Samples: 100
+    >>> for _class in range(y.shape[1]):
+    ...     print(f"Class {_class} count: {np.count_nonzero(y[:, _class])}")
+    Class 0 count: 30
+    Class 1 count: 54
+    Class 2 count: 48
+    Class 3 count: 33
+    Class 4 count: 14
+    >>> categorical_features = np.full((20,), True)
+    >>> mlsmote = MLSMOTE(categorical_features, random_state=42)
+    >>> X_res, y_res = mlsmote.fit_resample(X, y)
+    >>> print("Resampled Dataset")
+    Resampled Dataset
+    >>> print(f"Samples: {X_res.shape[0]}")
+    Samples: 114
+    >>> for _class in range(y_res.shape[1]):
+    ...     print(f"Class {_class} count: {np.count_nonzero(y_res[:, _class])}")
+    Class 0 count: 30
+    Class 1 count: 60
+    Class 2 count: 56
+    Class 3 count: 33
+    Class 4 count: 28
+    """
+
+    _required_parameters = ["categorical_features"]
+
+    INTERSECTION = "intersection"
+    RANKING = "ranking"
+    UNION = "union"
+    _sampling_strategies = [INTERSECTION, RANKING, UNION]
+
+    def __init__(
+        self,
+        categorical_features,
+        *,
+        sampling_strategy=RANKING,
+        random_state=None,
+        k_neighbors=5,
+    ):
+        if sampling_strategy not in MLSMOTE._sampling_strategies:
+            raise ValueError(
+                "Sampling Strategy can only be one of: 'ranking', 'union' or "
+                "'intersection'"
+            )
+
+        self.categorical_features = categorical_features
+        self.sampling_strategy_ = sampling_strategy
+        self.random_state = random_state
+        self.k_neighbors = k_neighbors
+
+    def _validate_estimator(self):
+        categorical_features = np.asarray(self.categorical_features)
+        if categorical_features.dtype.name == "bool":
+            self.categorical_features_ = np.flatnonzero(categorical_features)
+        else:
+            if any(
+                [cat not in np.arange(self.n_features_) for cat in categorical_features]
+            ):
+                raise ValueError(
+                    "Some of the categorical indices are out of range. Indices"
+                    f" should be between 0 and {self.n_features_}"
+                )
+            self.categorical_features_ = categorical_features
+        self.continuous_features_ = np.setdiff1d(
+            np.arange(self.n_features_), self.categorical_features_
+        )
+
+    def fit_resample(self, X, y):
+        """Resample the dataset.
+
+        Parameters
+        ----------
+        X : {array-like, dataframe, sparse matrix} of shape \
+                (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : {array-like, sparse matrix of shape \
+                (n_samples, n_labels) or a list of lists of labels.
+            See "sklearn.datasets.make_multilabel_classification" and \
+                the "return_indicate" input parameter for more \
+                information on possible label sets formats.
+
+            Corresponding label sets for each sample in X. Sparse matrix \
+                should be of CSR format.
+
+        Returns
+        -------
+        X_resampled : {array-like, dataframe, sparse matrix} of shape \
+                (n_samples_new, n_features)
+            The array containing the resampled data.
+
+        y_resampled : array-like of shape (n_samples_new, n_labels) \
+                or a list of lists of labels.
+            The corresponding label sets of `X_resampled`.
+        """
+        self.n_features_ = X.shape[1]
+
+        self._validate_estimator()
+        random_state = check_random_state(self.random_state)
+
+        X_resampled = X.copy()
+
+        unique_labels = None
+        # Convert 'y' to a numpy array
+        if type(y) == sparse._csr.csr_matrix:
+            y_resampled = y.toarray()
+        elif type(y) == np.ndarray:
+            y_resampled = np.copy(y)
+        elif type(y) == list:
+            unique_labels = self._collect_unique_labels(y)
+            y_resampled = np.zeros((len(y), len(unique_labels)))
+            for i, sample_labels in enumerate(y):
+                for label in sample_labels:
+                    y_resampled[i, np.where(unique_labels == label)] = 1
+        else:
+            raise TypeError(
+                "'y' can only be of type 'numpy.ndarray', "
+                "'scipy.sparse._csr.csr_matrix' or 'list'"
+            )
+
+        self.n_classes_ = y_resampled.shape[1]
+
+        """TODO: Handle the case where 'mean_ir' is infinity. Happens when one label has
+        no samples
+        """
+        mean_ir = self._get_mean_imbalance_ratio(y_resampled)
+
+        for label in range(self.n_classes_):
+            irlbl_num = self._get_imbalance_ratio_numerator(y_resampled)
+            irlbl = self._get_imbalance_ratio_per_label(label, irlbl_num, y_resampled)
+            if irlbl > mean_ir:
+                min_bag = self._get_all_instances_of_label(label, y_resampled)
+                if (
+                    len(min_bag) <= 1
+                ):  # If there is only one sample, the neighbor set will be empty
+                    continue
+                # Note: Only the distance for numeric attributes can be
+                # cached. The Value Difference Metric (VDM) distance for
+                # categorical/nominal attributes CANNOT be cached because VDMs
+                # are dependent on the total number of samples in the dataset
+                # that have specific values for the different attributes.
+                # Given that each synthetic sample is added to the dataset in
+                # the inner loop (line 17 of 'Algorithm 1' of the MLSMOTE,
+                # Charte, F. et al. paper), the VDM between samples has to be
+                # computed in every inner iteration.
+                euclidean_dist_cache = np.zeros((y_resampled.shape[0], y_resampled.shape[0]))
+                X_cont = X_resampled[:][:, self.continuous_features_]
+                pairs = list(combinations(min_bag, 2))
+                for m, n in pairs:
+                    distance = sum(self._get_euclidean_distance(
+                        X_cont[m, :], X_cont[n, :]
+                    ))
+                    euclidean_dist_cache[m, n] = distance
+                    euclidean_dist_cache[n, m] = distance
+                for sample_id in min_bag:
+                    distances = self._calc_distances(
+                        sample_id, min_bag, X_resampled, y_resampled, euclidean_dist_cache,
+                    )
+                    distances = np.sort(distances, order="distance")
+                    neighbors = distances[
+                        1 : self.k_neighbors + 1
+                    ]  # Remove 'sample' from neighbor set
+                    ref_neigh = random_state.choice(neighbors, 1)[0]
+                    X_new, y_new = self._create_new_sample(
+                        sample_id,
+                        ref_neigh[1],
+                        [x[1] for x in neighbors],
+                        X_resampled,
+                        y_resampled,
+                        random_state,
+                    )
+                    X_resampled = np.vstack((X_resampled, X_new))
+                    y_resampled = np.vstack((y_resampled, y_new))
+        return X_resampled, self._convert_to_input_type(
+            y_resampled, unique_labels, type(y)
+        )
+
+    def _create_new_sample(
+        self,
+        sample_id,
+        ref_neigh_id,
+        neighbor_ids,
+        X_resampled,
+        y_resampled,
+        random_state,
+    ):
+        sample = X_resampled[sample_id]
+        synth_sample = np.zeros_like(sample)
+        ref_neigh = X_resampled[ref_neigh_id]
+
+        for i in range(synth_sample.shape[0]):
+            if i in self.continuous_features_:
+                diff = ref_neigh[i] - sample[i]
+                offset = diff * random_state.uniform(0, 1)
+                synth_sample[i] = sample[i] + offset
+            elif i in self.categorical_features_:
+                synth_sample[i] = self._get_most_frequent_value(
+                    X_resampled[neighbor_ids, i]
+                )
+
+        neighbors_labels = y_resampled[neighbor_ids]
+        label_counts = np.squeeze(
+            np.asarray(y_resampled[sample_id] + neighbors_labels.sum(axis=0))
+        )
+        synth_sample_labels = np.zeros((1, self.n_classes_), dtype=int)
+        if self.sampling_strategy_ == MLSMOTE.RANKING:
+            # Note: Paper states "present in half or more of the instances considered"
+            # but pseudocode shows: "labels lblCounts > (k + 1)/2" instead of '>='. We
+            # follow the pseudocode for now.
+            quorum = int((len(neighbor_ids) + 1) / 2)
+            chosen_labels = label_counts > quorum
+        elif self.sampling_strategy_ == MLSMOTE.UNION:
+            chosen_labels = label_counts.nonzero()
+        elif self.sampling_strategy_ == MLSMOTE.INTERSECTION:
+            chosen_labels = label_counts == len(neighbor_ids) + 1
+
+        synth_sample_labels[0, chosen_labels] = 1
+
+        return synth_sample, synth_sample_labels
+
+    def _collect_unique_labels(self, y):
+        """A support function that flattens the labelsets and return one set of unique
+        labels
+        """
+        return np.unique(np.array([label for label_set in y for label in label_set]))
+
+    def _calc_distances(self, sample, min_bag, features, labels, euclidean_dist_cache):
+        def calc_dist(bag_sample):
+            nominal_distance = sum(
+                [
+                    self._get_vdm(
+                        features[sample, cat],
+                        features[bag_sample, cat],
+                        features,
+                        cat,
+                        c_instances,
+                    )
+                    for cat in self.categorical_features_
+                ]
+            )
+            ordinal_distance = euclidean_dist_cache[sample, bag_sample]
+            dist = nominal_distance + ordinal_distance
+            return (dist, bag_sample)
+
+        c_instances = [
+            self._get_all_instances_of_label(_class, labels) for _class in range(self.n_classes_)
+        ]
+        distances = [calc_dist(bag_sample) for bag_sample in min_bag]
+        dtype = np.dtype([("distance", float), ("index", int)])
+        return np.array(distances, dtype=dtype)
+
+    def _get_euclidean_distance(self, first, second):
+        """Since the inputs are of type 'float' the euclidean distance is just
+        the absolute value of their difference.
+        """
+        return abs(first - second)
+
+    def _get_vdm(self, x_attr_val, y_attr_val, features, category, c_instances):
+        """A support function to compute the Value Difference Metric(VDM) described in
+        https://arxiv.org/pdf/cs/9701101.pdf
+        """
+
+        def f_sparse(_class):
+            N_axc = np.count_nonzero(features[c_instances[_class], category] == x_attr_val)
+            N_ayc = np.count_nonzero(features[c_instances[_class], category] == y_attr_val)
+            p = abs((N_axc / N_ax) - (N_ayc / N_ay)) ** 2
+            return p
+
+        N_ax = np.count_nonzero(features[:, category] == x_attr_val)
+        N_ay = np.count_nonzero(features[:, category] == y_attr_val)
+        vdm = sum([f_sparse(_class) for _class in range(self.n_classes_)])
+        return vdm
+
+    def _get_all_instances_of_label(self, label, labels):
+        return np.nonzero(labels[:, label])[0]
+
+    def _get_mean_imbalance_ratio(self, labels):
+        sum_per_label = np.array(
+            [self._sum_h(label, labels) for label in range(self.n_classes_)]
+        )
+        irlbl_num = sum_per_label.max()
+        ratio_sum = np.sum(irlbl_num / sum_per_label)
+        return ratio_sum / self.n_classes_
+
+    def _get_imbalance_ratio_numerator(self, labels):
+        sum_array = np.array(
+            [self._sum_h(label, labels) for label in range(self.n_classes_)]
+        )
+        return sum_array.max()
+
+    def _get_imbalance_ratio_per_label(self, label, irlbl_numerator, labels):
+        return irlbl_numerator / self._sum_h(label, labels)
+
+    def _sum_h(self, label, labels):
+        return np.count_nonzero(labels[:, label])
+
+    def _get_most_frequent_value(self, values):
+        """A support function to get most frequent value if a list of values
+        TODO: We might want to randomize 'unique' and 'counts' to avoid always returning
+        the first occurrence when multiple occurrences of the maximum value.
+        """
+        uniques, counts = np.unique(values, return_counts=True)
+        return uniques[np.argmax(counts)]
+
+    def _convert_to_input_type(self, y_resampled, unique_labels, input_type):
+        """A support function that converts the labels back to its input format"""
+        if input_type == sparse._csr.csr_matrix:
+            return sparse.csr_matrix(y_resampled, dtype=int)
+        elif input_type == np.ndarray:
+            return y_resampled
+        elif input_type == list:
+            labels = [[] for _ in range(y_resampled.shape[0])]
+            rows, cols = y_resampled.nonzero()
+            for row, col in zip(rows, cols):
+                labels[row].append(unique_labels[col])
+            return labels