From f8eab9a9d5ecf83ad86675954d310ce3471b731c Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 4 Jun 2022 18:28:00 +0200 Subject: [PATCH 01/56] Add support for empty levels --- hiclass/HierarchicalClassifier.py | 32 +++++++++++--- tests/test_LocalClassifierPerParentNode.py | 49 ++++++++++++++++++++++ 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index bf5633bc..d4411737 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -6,7 +6,6 @@ import numpy as np from sklearn.base import BaseEstimator from sklearn.linear_model import LogisticRegression -from sklearn.utils.validation import check_X_y class HierarchicalClassifier(abc.ABC): @@ -85,8 +84,11 @@ def fit(self, X, y): def _pre_fit(self, X, y): # Check that X and y have correct shape # and convert them to np.ndarray if need be + + leveled_y = self._make_leveled(y) + self.X_, self.y_ = self._validate_data( - X, y, multi_output=True, accept_sparse="csr" + X, leveled_y, multi_output=True, accept_sparse="csr" ) # Create and configure logger @@ -115,6 +117,22 @@ def _pre_fit(self, X, y): # Initialize local classifiers in DAG self._initialize_local_classifiers() + def _make_leveled(self, y): + # Add empty columns if column length differs + depth = 0 + for row in y: + try: + depth = max(depth, len(row)) + except TypeError: + return y + leveled_y = [] + for row in y: + new_row = [i for i in row] + while len(new_row) < depth: + new_row.append("") + leveled_y.append(new_row) + return leveled_y + def _create_logger(self): # Create logger self.logger_ = logging.getLogger(self.classifier_abbreviation) @@ -167,9 +185,13 @@ def _create_digraph(self): self.logger_.info(f"Creating digraph from {rows} 2D labels") for row in range(rows): for column in range(columns - 1): - self.hierarchy_.add_edge( - self.y_[row, column], self.y_[row, column + 1] - ) + # Only add edge if both parent and child are not empty + parent = self.y_[row, column].split(self.separator_)[-1] + child = self.y_[row, column + 1].split(self.separator_)[-1] + if parent != "" and child != "": + self.hierarchy_.add_edge( + self.y_[row, column], self.y_[row, column + 1] + ) elif self.y_.ndim == 1: # 1D labels diff --git a/tests/test_LocalClassifierPerParentNode.py b/tests/test_LocalClassifierPerParentNode.py index 4cf6e459..0f0cb757 100644 --- a/tests/test_LocalClassifierPerParentNode.py +++ b/tests/test_LocalClassifierPerParentNode.py @@ -183,3 +183,52 @@ def test_fit_predict(): lcppn.fit(x, y) predictions = lcppn.predict(x) assert_array_equal(y, predictions) + + +@pytest.fixture +def empty_levels(): + X = [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + y = [ + ["A", "1"], + ["A", "2"], + ["A", "C", "5"], + ["A", "C", "6"], + ["B", "3"], + ["B", "4"], + ] + return X, y + + +def test_empty_levels(empty_levels): + lcppn = LocalClassifierPerParentNode() + X, y = empty_levels + lcppn.fit(X, y) + predictions = lcppn.predict(X) + ground_truth = [ + ["A", "1", ""], + ["A", "2", ""], + ["A", "C", "5"], + ["A", "C", "6"], + ["B", "3", ""], + ["B", "4", ""], + ] + assert list(lcppn.hierarchy_.nodes) == [ + "A", + "A" + lcppn.separator_ + "1", + "A" + lcppn.separator_ + "2", + "A" + lcppn.separator_ + "C", + "A" + lcppn.separator_ + "C" + lcppn.separator_ + "5", + "A" + lcppn.separator_ + "C" + lcppn.separator_ + "6", + "B", + "B" + lcppn.separator_ + "3", + "B" + lcppn.separator_ + "4", + lcppn.root_, + ] + assert_array_equal(ground_truth, predictions) From 8d3092109312338cdc751ac4a84a2250e13b5898 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Sun, 5 Jun 2022 23:37:44 +0200 Subject: [PATCH 02/56] Update pypi download stats --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f2896271..54a7c2a8 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ HiClass is an open-source Python library for hierarchical classification compatible with scikit-learn. -[![Deploy PyPI](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml/badge.svg?event=push)](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml) [![Documentation Status](https://readthedocs.org/projects/hiclass/badge/?version=latest)](https://hiclass.readthedocs.io/en/latest/?badge=latest) [![codecov](https://codecov.io/gh/mirand863/hiclass/branch/main/graph/badge.svg?token=PR8VLBMMNR)](https://codecov.io/gh/mirand863/hiclass) [![Downloads Conda](https://img.shields.io/conda/dn/conda-forge/hiclass?label=conda)](https://anaconda.org/conda-forge/hiclass) [![Downloads pypi](https://img.shields.io/pypi/dm/hiclass?label=pypi)](https://pypi.org/project/hiclass/) [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![Deploy PyPI](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml/badge.svg?event=push)](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml) [![Documentation Status](https://readthedocs.org/projects/hiclass/badge/?version=latest)](https://hiclass.readthedocs.io/en/latest/?badge=latest) [![codecov](https://codecov.io/gh/mirand863/hiclass/branch/main/graph/badge.svg?token=PR8VLBMMNR)](https://codecov.io/gh/mirand863/hiclass) [![Downloads Conda](https://img.shields.io/conda/dn/conda-forge/hiclass?label=conda)](https://anaconda.org/conda-forge/hiclass) [![Downloads PyPI](https://static.pepy.tech/personalized-badge/hiclass?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=pypi)](https://pypi.org/project/hiclass/) [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) ✨ Here is a **demo** that shows HiClass in action on hierarchical data: From 353c7433346ed78374c7ea21c3e06f7baeabf037 Mon Sep 17 00:00:00 2001 From: Fabio Date: Mon, 6 Jun 2022 02:02:29 +0200 Subject: [PATCH 03/56] Add test for empty levels to local classifier per node --- tests/test_LocalClassifierPerLevel.py | 49 +++++++++++++++++++++++++++ tests/test_LocalClassifierPerNode.py | 49 +++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py index 5089d86a..1187151b 100644 --- a/tests/test_LocalClassifierPerLevel.py +++ b/tests/test_LocalClassifierPerLevel.py @@ -133,3 +133,52 @@ def test_fit_predict(): pytest.fail(repr(e)) predictions = lcpl.predict(x) assert_array_equal(y, predictions) + + +@pytest.fixture +def empty_levels(): + X = [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + y = [ + ["A", "1"], + ["A", "2"], + ["A", "C", "5"], + ["A", "C", "6"], + ["B", "3"], + ["B", "4"], + ] + return X, y + + +# def test_empty_levels(empty_levels): +# lcppn = LocalClassifierPerLevel() +# X, y = empty_levels +# lcppn.fit(X, y) +# predictions = lcppn.predict(X) +# ground_truth = [ +# ["A", "1", ""], +# ["A", "2", ""], +# ["A", "C", "5"], +# ["A", "C", "6"], +# ["B", "3", ""], +# ["B", "4", ""], +# ] +# assert list(lcppn.hierarchy_.nodes) == [ +# "A", +# "A" + lcppn.separator_ + "1", +# "A" + lcppn.separator_ + "2", +# "A" + lcppn.separator_ + "C", +# "A" + lcppn.separator_ + "C" + lcppn.separator_ + "5", +# "A" + lcppn.separator_ + "C" + lcppn.separator_ + "6", +# "B", +# "B" + lcppn.separator_ + "3", +# "B" + lcppn.separator_ + "4", +# lcppn.root_, +# ] +# assert_array_equal(ground_truth, predictions) diff --git a/tests/test_LocalClassifierPerNode.py b/tests/test_LocalClassifierPerNode.py index 2eb21ea1..2ed37b0f 100644 --- a/tests/test_LocalClassifierPerNode.py +++ b/tests/test_LocalClassifierPerNode.py @@ -202,3 +202,52 @@ def test_fit_predict(): lcpn.fit(x, y) predictions = lcpn.predict(x) assert_array_equal(y, predictions) + + +@pytest.fixture +def empty_levels(): + X = [ + [1, 2], + [3, 4], + [5, 6], + [7, 8], + [9, 10], + [11, 12], + ] + y = [ + ["A", "1"], + ["A", "2"], + ["A", "C", "5"], + ["A", "C", "6"], + ["B", "3"], + ["B", "4"], + ] + return X, y + + +def test_empty_levels(empty_levels): + lcppn = LocalClassifierPerNode() + X, y = empty_levels + lcppn.fit(X, y) + predictions = lcppn.predict(X) + ground_truth = [ + ["A", "1", ""], + ["A", "2", ""], + ["A", "C", "5"], + ["A", "C", "6"], + ["B", "3", ""], + ["B", "4", ""], + ] + assert list(lcppn.hierarchy_.nodes) == [ + "A", + "A" + lcppn.separator_ + "1", + "A" + lcppn.separator_ + "2", + "A" + lcppn.separator_ + "C", + "A" + lcppn.separator_ + "C" + lcppn.separator_ + "5", + "A" + lcppn.separator_ + "C" + lcppn.separator_ + "6", + "B", + "B" + lcppn.separator_ + "3", + "B" + lcppn.separator_ + "4", + lcppn.root_, + ] + assert_array_equal(ground_truth, predictions) From 8c1c776c4170cf6e1416b9fdc1e1f76f0da194c0 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Mon, 6 Jun 2022 15:44:01 +0200 Subject: [PATCH 04/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 54a7c2a8..b8171abd 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ HiClass is an open-source Python library for hierarchical classification compatible with scikit-learn. -[![Deploy PyPI](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml/badge.svg?event=push)](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml) [![Documentation Status](https://readthedocs.org/projects/hiclass/badge/?version=latest)](https://hiclass.readthedocs.io/en/latest/?badge=latest) [![codecov](https://codecov.io/gh/mirand863/hiclass/branch/main/graph/badge.svg?token=PR8VLBMMNR)](https://codecov.io/gh/mirand863/hiclass) [![Downloads Conda](https://img.shields.io/conda/dn/conda-forge/hiclass?label=conda)](https://anaconda.org/conda-forge/hiclass) [![Downloads PyPI](https://static.pepy.tech/personalized-badge/hiclass?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=pypi)](https://pypi.org/project/hiclass/) [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![Deploy PyPI](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml/badge.svg?event=push)](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml) [![Documentation Status](https://readthedocs.org/projects/hiclass/badge/?version=latest)](https://hiclass.readthedocs.io/en/latest/?badge=latest) [![codecov](https://codecov.io/gh/mirand863/hiclass/branch/main/graph/badge.svg?token=PR8VLBMMNR)](https://codecov.io/gh/mirand863/hiclass) [![Downloads PyPI](https://static.pepy.tech/personalized-badge/hiclass?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=pypi)](https://pypi.org/project/hiclass/) [![Downloads Conda](https://img.shields.io/conda/dn/conda-forge/hiclass?label=conda)](https://anaconda.org/conda-forge/hiclass) [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) ✨ Here is a **demo** that shows HiClass in action on hierarchical data: From ebc5e0666d436a1947479a308eae15ce584dceb0 Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 7 Jun 2022 17:21:54 +0200 Subject: [PATCH 05/56] Replace for loops with list comprehension --- hiclass/HierarchicalClassifier.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index d4411737..c8c61b27 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -119,18 +119,11 @@ def _pre_fit(self, X, y): def _make_leveled(self, y): # Add empty columns if column length differs - depth = 0 - for row in y: - try: - depth = max(depth, len(row)) - except TypeError: - return y - leveled_y = [] - for row in y: - new_row = [i for i in row] - while len(new_row) < depth: - new_row.append("") - leveled_y.append(new_row) + try: + depth = max([len(row) for row in y]) + except TypeError: + return y + leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] return leveled_y def _create_logger(self): From 433ef86f8563a34e81cca4cbe4bd3cbab7788976 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 15 Jun 2022 14:40:28 +0200 Subject: [PATCH 06/56] Add docstring to _make_leveled function --- hiclass/HierarchicalClassifier.py | 32 +- tests/test_HierarchicalClassifier.py | 33 +- tests/test_LocalClassifierPerLevel.py | 362 +++++++-------- tests/test_LocalClassifierPerNode.py | 506 ++++++++++----------- tests/test_LocalClassifierPerParentNode.py | 468 +++++++++---------- 5 files changed, 723 insertions(+), 678 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index c8c61b27..ccecd671 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -8,6 +8,29 @@ from sklearn.linear_model import LogisticRegression +def _make_leveled(y): + """ + Add empty columns if column length differs. + If rows are not iterable, returns the current y without modifications. + + Parameters + ---------- + y : array-like of shape (n_samples, n_levels) + The target values, i.e., hierarchical class labels for classification. + + Returns + ------- + leveled_y : array-like of shape (n_samples, n_levels) + The leveled target values, i.e., hierarchical class labels for classification. + """ + try: + depth = max([len(row) for row in y]) + except TypeError: + return y + leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] + return leveled_y + + class HierarchicalClassifier(abc.ABC): """Abstract class for the local hierarchical classifiers. @@ -117,15 +140,6 @@ def _pre_fit(self, X, y): # Initialize local classifiers in DAG self._initialize_local_classifiers() - def _make_leveled(self, y): - # Add empty columns if column length differs - try: - depth = max([len(row) for row in y]) - except TypeError: - return y - leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] - return leveled_y - def _create_logger(self): # Create logger self.logger_ = logging.getLogger(self.classifier_abbreviation) diff --git a/tests/test_HierarchicalClassifier.py b/tests/test_HierarchicalClassifier.py index 241e2443..4d8377d8 100644 --- a/tests/test_HierarchicalClassifier.py +++ b/tests/test_HierarchicalClassifier.py @@ -1,12 +1,14 @@ import logging +import tempfile + import networkx as nx import numpy as np import pytest -import tempfile from numpy.testing import assert_array_equal from sklearn.linear_model import LogisticRegression from hiclass.HierarchicalClassifier import HierarchicalClassifier +from hiclass.HierarchicalClassifier import _make_leveled @pytest.fixture @@ -175,3 +177,32 @@ def test_clean_up(digraph_multiple_roots): assert digraph_multiple_roots.X_ is None with pytest.raises(AttributeError): assert digraph_multiple_roots.y_ is None + + +@pytest.fixture +def empty_levels(): + y = [ + ["a"], + ["b", "c"], + ["d", "e", "f"], + ] + return y + + +def test_make_leveled(empty_levels): + ground_truth = [ + ["a", "", ""], + ["b", "c", ""], + ["d", "e", "f"], + ] + assert ground_truth == _make_leveled(empty_levels) + + +@pytest.fixture +def noniterable_y(): + y = [1, 2, 3] + return y + + +def test_make_leveled_non_iterable_y(noniterable_y): + assert noniterable_y == _make_leveled(noniterable_y) diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py index 1187151b..779032c6 100644 --- a/tests/test_LocalClassifierPerLevel.py +++ b/tests/test_LocalClassifierPerLevel.py @@ -1,184 +1,184 @@ -import logging - -import networkx as nx -import numpy as np -import pytest -from numpy.testing import assert_array_equal -from scipy.sparse import csr_matrix -from sklearn.exceptions import NotFittedError -from sklearn.linear_model import LogisticRegression -from sklearn.utils.estimator_checks import parametrize_with_checks -from sklearn.utils.validation import check_is_fitted - -from hiclass import LocalClassifierPerLevel - - -@parametrize_with_checks([LocalClassifierPerLevel()]) -def test_sklearn_compatible_estimator(estimator, check): - check(estimator) - - -@pytest.fixture -def digraph_logistic_regression(): - digraph = LocalClassifierPerLevel(local_classifier=LogisticRegression()) - digraph.hierarchy_ = nx.DiGraph([("a", "b"), ("a", "c")]) - digraph.y_ = np.array([["a", "b"], ["a", "c"]]) - digraph.X_ = np.array([[1, 2], [3, 4]]) - digraph.logger_ = logging.getLogger("LCPL") - digraph.root_ = "a" - digraph.separator_ = "::HiClass::Separator::" - return digraph - - -def test_initialize_local_classifiers(digraph_logistic_regression): - digraph_logistic_regression._initialize_local_classifiers() - for classifier in digraph_logistic_regression.local_classifiers_: - assert isinstance( - classifier, - LogisticRegression, - ) - - -def test_fit_digraph(digraph_logistic_regression): - classifiers = [ - LogisticRegression(), - LogisticRegression(), - ] - digraph_logistic_regression.local_classifiers_ = classifiers - digraph_logistic_regression._fit_digraph() - for classifier in digraph_logistic_regression.local_classifiers_: - try: - check_is_fitted(classifier) - except NotFittedError as e: - pytest.fail(repr(e)) - assert 1 - - -def test_fit_digraph_parallel(digraph_logistic_regression): - classifiers = [ - LogisticRegression(), - LogisticRegression(), - ] - digraph_logistic_regression.n_jobs = 2 - digraph_logistic_regression.local_classifiers_ = classifiers - digraph_logistic_regression._fit_digraph_parallel(local_mode=True) - for classifier in digraph_logistic_regression.local_classifiers_: - try: - check_is_fitted(classifier) - except NotFittedError as e: - pytest.fail(repr(e)) - assert 1 - - -def test_fit_1_class(): - lcpl = LocalClassifierPerLevel(local_classifier=LogisticRegression(), n_jobs=2) - y = np.array([["1", "2"]]) - X = np.array([[1, 2]]) - ground_truth = np.array([["1", "2"]]) - lcpl.fit(X, y) - prediction = lcpl.predict(X) - assert_array_equal(ground_truth, prediction) - - -@pytest.fixture -def fitted_logistic_regression(): - digraph = LocalClassifierPerLevel(local_classifier=LogisticRegression()) - digraph.hierarchy_ = nx.DiGraph( - [("r", "1"), ("r", "2"), ("1", "1.1"), ("1", "1.2"), ("2", "2.1"), ("2", "2.2")] - ) - digraph.y_ = np.array([["1", "1.1"], ["1", "1.2"], ["2", "2.1"], ["2", "2.2"]]) - digraph.X_ = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - digraph.logger_ = logging.getLogger("LCPL") - digraph.max_levels_ = 2 - digraph.dtype_ = " Date: Wed, 15 Jun 2022 16:41:44 +0200 Subject: [PATCH 07/56] Filter empty leaves --- hiclass/HierarchicalClassifier.py | 23 +- hiclass/LocalClassifierPerLevel.py | 10 +- tests/test_LocalClassifierPerLevel.py | 368 +++++++-------- tests/test_LocalClassifierPerNode.py | 506 ++++++++++----------- tests/test_LocalClassifierPerParentNode.py | 468 +++++++++---------- 5 files changed, 698 insertions(+), 677 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index ccecd671..3a5897ec 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -10,8 +10,7 @@ def _make_leveled(y): """ - Add empty columns if column length differs. - If rows are not iterable, returns the current y without modifications. + Add empty cells if columns' length differs. Parameters ---------- @@ -22,6 +21,18 @@ def _make_leveled(y): ------- leveled_y : array-like of shape (n_samples, n_levels) The leveled target values, i.e., hierarchical class labels for classification. + + Notes + ----- + If rows are not iterable, returns the current y without modifications. + + Examples + -------- + >>> from hiclass.HierarchicalClassifier import _make_leveled + >>> y = [['a'], ['b', 'c']] + >>> leveled_y = _make_leveled(y) + >>> print(leveled_y) + >>> [['a', ''], ['b', 'c']] """ try: depth = max([len(row) for row in y]) @@ -108,7 +119,7 @@ def _pre_fit(self, X, y): # Check that X and y have correct shape # and convert them to np.ndarray if need be - leveled_y = self._make_leveled(y) + leveled_y = _make_leveled(y) self.X_, self.y_ = self._validate_data( X, leveled_y, multi_output=True, accept_sparse="csr" @@ -165,9 +176,11 @@ def _disambiguate(self): if self.y_.ndim == 2: new_y = [] for i in range(self.y_.shape[0]): - row = [self.y_[i, 0]] + row = [str(self.y_[i, 0])] for j in range(1, self.y_.shape[1]): - row.append(str(row[-1]) + self.separator_ + str(self.y_[i, j])) + parent = str(row[-1]) + child = str(self.y_[i, j]) + row.append(parent + self.separator_ + child) new_y.append(np.asarray(row, dtype=np.str_)) self.y_ = np.array(new_y) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 6c896afd..1361df00 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -8,7 +8,6 @@ import numpy as np import ray from sklearn.base import BaseEstimator -from sklearn.metrics import euclidean_distances from sklearn.utils.validation import check_array, check_is_fitted from hiclass.ConstantClassifier import ConstantClassifier @@ -207,6 +206,15 @@ def _fit_digraph(self): ) X = self.X_ y = self.y_[:, level] + + # Detect empty leaf nodes + leaves = np.array([str(i).split(self.separator_)[-1] for i in y]) + mask = leaves != "" + + # Remove rows with empty leaf nodes + X = X[mask] + y = y[mask] + unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: self.logger_.warning( diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py index 779032c6..90348a6a 100644 --- a/tests/test_LocalClassifierPerLevel.py +++ b/tests/test_LocalClassifierPerLevel.py @@ -1,184 +1,184 @@ -# import logging -# -# import networkx as nx -# import numpy as np -# import pytest -# from numpy.testing import assert_array_equal -# from scipy.sparse import csr_matrix -# from sklearn.exceptions import NotFittedError -# from sklearn.linear_model import LogisticRegression -# from sklearn.utils.estimator_checks import parametrize_with_checks -# from sklearn.utils.validation import check_is_fitted -# -# from hiclass import LocalClassifierPerLevel -# -# -# @parametrize_with_checks([LocalClassifierPerLevel()]) -# def test_sklearn_compatible_estimator(estimator, check): -# check(estimator) -# -# -# @pytest.fixture -# def digraph_logistic_regression(): -# digraph = LocalClassifierPerLevel(local_classifier=LogisticRegression()) -# digraph.hierarchy_ = nx.DiGraph([("a", "b"), ("a", "c")]) -# digraph.y_ = np.array([["a", "b"], ["a", "c"]]) -# digraph.X_ = np.array([[1, 2], [3, 4]]) -# digraph.logger_ = logging.getLogger("LCPL") -# digraph.root_ = "a" -# digraph.separator_ = "::HiClass::Separator::" -# return digraph -# -# -# def test_initialize_local_classifiers(digraph_logistic_regression): -# digraph_logistic_regression._initialize_local_classifiers() -# for classifier in digraph_logistic_regression.local_classifiers_: -# assert isinstance( -# classifier, -# LogisticRegression, -# ) -# -# -# def test_fit_digraph(digraph_logistic_regression): -# classifiers = [ -# LogisticRegression(), -# LogisticRegression(), -# ] -# digraph_logistic_regression.local_classifiers_ = classifiers -# digraph_logistic_regression._fit_digraph() -# for classifier in digraph_logistic_regression.local_classifiers_: -# try: -# check_is_fitted(classifier) -# except NotFittedError as e: -# pytest.fail(repr(e)) -# assert 1 -# -# -# def test_fit_digraph_parallel(digraph_logistic_regression): -# classifiers = [ -# LogisticRegression(), -# LogisticRegression(), -# ] -# digraph_logistic_regression.n_jobs = 2 -# digraph_logistic_regression.local_classifiers_ = classifiers -# digraph_logistic_regression._fit_digraph_parallel(local_mode=True) -# for classifier in digraph_logistic_regression.local_classifiers_: -# try: -# check_is_fitted(classifier) -# except NotFittedError as e: -# pytest.fail(repr(e)) -# assert 1 -# -# -# def test_fit_1_class(): -# lcpl = LocalClassifierPerLevel(local_classifier=LogisticRegression(), n_jobs=2) -# y = np.array([["1", "2"]]) -# X = np.array([[1, 2]]) -# ground_truth = np.array([["1", "2"]]) -# lcpl.fit(X, y) -# prediction = lcpl.predict(X) -# assert_array_equal(ground_truth, prediction) -# -# -# @pytest.fixture -# def fitted_logistic_regression(): -# digraph = LocalClassifierPerLevel(local_classifier=LogisticRegression()) -# digraph.hierarchy_ = nx.DiGraph( -# [("r", "1"), ("r", "2"), ("1", "1.1"), ("1", "1.2"), ("2", "2.1"), ("2", "2.2")] -# ) -# digraph.y_ = np.array([["1", "1.1"], ["1", "1.2"], ["2", "2.1"], ["2", "2.2"]]) -# digraph.X_ = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) -# digraph.logger_ = logging.getLogger("LCPL") -# digraph.max_levels_ = 2 -# digraph.dtype_ = " Date: Wed, 15 Jun 2022 17:18:45 +0200 Subject: [PATCH 08/56] Fix some tests --- hiclass/LocalClassifierPerLevel.py | 9 +++++++-- tests/test_LocalClassifierPerLevel.py | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 1361df00..62f45d8a 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -164,6 +164,7 @@ def predict(self, X): highest_probabilities = [ np.argmax(probabilities[i], axis=0) for i in range(len(probabilities)) + if len(probabilities[i] > 0) ] classes = np.array( [ @@ -172,13 +173,13 @@ def predict(self, X): ], dtype=object, ) - predictions = np.array( + classes = classes[self.masks_[level]] + y[self.masks_[level], level] = np.array( [ classes[i][highest_probabilities[i]] for i in range(len(highest_probabilities)) ] ) - y[:, level] = predictions # Convert back to 1D if there is only 1 column to pass all sklearn's checks if self.max_levels_ == 1: @@ -197,6 +198,7 @@ def _initialize_local_classifiers(self): self.local_classifiers_ = [ deepcopy(self.local_classifier_) for _ in range(self.y_.shape[1]) ] + self.masks_ = [None for _ in range(self.y_.shape[1])] def _fit_digraph(self): self.logger_.info("Fitting local classifiers") @@ -215,6 +217,9 @@ def _fit_digraph(self): X = X[mask] y = y[mask] + # Store mask for current level + self.masks_[level] = mask + unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: self.logger_.warning( diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py index 90348a6a..b1aa78fa 100644 --- a/tests/test_LocalClassifierPerLevel.py +++ b/tests/test_LocalClassifierPerLevel.py @@ -27,6 +27,10 @@ def digraph_logistic_regression(): digraph.logger_ = logging.getLogger("LCPL") digraph.root_ = "a" digraph.separator_ = "::HiClass::Separator::" + digraph.masks_ = [ + [True, True], + [True, True], + ] return digraph @@ -93,6 +97,10 @@ def fitted_logistic_regression(): digraph.dtype_ = " Date: Wed, 15 Jun 2022 17:33:31 +0200 Subject: [PATCH 09/56] Fix last failing test --- hiclass/LocalClassifierPerLevel.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 62f45d8a..0a24cdfa 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -15,15 +15,21 @@ @ray.remote -def _parallel_fit(lcpl, level): +def _parallel_fit(lcpl, level, separator): classifier = lcpl.local_classifiers_[level] X = lcpl.X_ y = lcpl.y_[:, level] + # Detect empty leaf nodes + leaves = np.array([str(i).split(separator)[-1] for i in y]) + mask = leaves != "" + # Remove rows with empty leaf nodes + X = X[mask] + y = y[mask] unique_y = np.unique(y) if len(unique_y) == 1 and lcpl.replace_classifiers: classifier = ConstantClassifier() classifier.fit(X, y) - return classifier + return mask, classifier class LocalClassifierPerLevel(BaseEstimator, HierarchicalClassifier): @@ -234,9 +240,10 @@ def _fit_digraph_parallel(self, local_mode: bool = False): ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) lcpl = ray.put(self) results = [ - _parallel_fit.remote(lcpl, level) + _parallel_fit.remote(lcpl, level, self.separator_) for level in range(len(self.local_classifiers_)) ] classifiers = ray.get(results) - for level, classifier in enumerate(classifiers): + for level, (mask, classifier) in enumerate(classifiers): + self.masks_[level] = mask self.local_classifiers_[level] = classifier From 90686c6baebfd25ab19e981f54706515a809332b Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 15:57:31 +0200 Subject: [PATCH 10/56] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b8171abd..4f9ac04f 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,8 @@ Please reach out to fabio.malchermiranda@hpi.de. We are a small team on a mission to democratize hierarchical classification, and we'll take all the help we can get! If you'd like to get involved, here's information on [contribution guidelines and how to test the code locally](https://github.com/mirand863/hiclass/blob/main/CONTRIBUTING.md). +You can contribute in multiple ways, e.g., reporting bugs, writing or translating documentation, reviewing or refactoring code, implementing features, etc. + ## Getting the latest updates If you'd like to get updates when we release new versions, please click on the "Watch" button on the top and select "Releases only". Github will then send you notifications along with a changelog with each new release. From 7ec06aff4ccc3b7f914cb08aadfa819bd63f05a0 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 15:59:06 +0200 Subject: [PATCH 11/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4f9ac04f..7dd57b53 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ Please reach out to fabio.malchermiranda@hpi.de. We are a small team on a mission to democratize hierarchical classification, and we'll take all the help we can get! If you'd like to get involved, here's information on [contribution guidelines and how to test the code locally](https://github.com/mirand863/hiclass/blob/main/CONTRIBUTING.md). -You can contribute in multiple ways, e.g., reporting bugs, writing or translating documentation, reviewing or refactoring code, implementing features, etc. +You can contribute in multiple ways, e.g., reporting bugs, writing or translating documentation, reviewing or refactoring code, requesting or implementing new features, etc. ## Getting the latest updates From dd5556b4b6116ad0a1e026512a97a7b7c6a7388a Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 16:00:56 +0200 Subject: [PATCH 12/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7dd57b53..c0da0493 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ Please reach out to fabio.malchermiranda@hpi.de. ## Contributing -We are a small team on a mission to democratize hierarchical classification, and we'll take all the help we can get! If you'd like to get involved, here's information on [contribution guidelines and how to test the code locally](https://github.com/mirand863/hiclass/blob/main/CONTRIBUTING.md). +We are a small team on a mission to democratize hierarchical classification, and we will take all the help we can get! If you would like to get involved, here is information on [contribution guidelines and how to test the code locally](https://github.com/mirand863/hiclass/blob/main/CONTRIBUTING.md). You can contribute in multiple ways, e.g., reporting bugs, writing or translating documentation, reviewing or refactoring code, requesting or implementing new features, etc. From 2873861d15d7e096b4cb9c63bce2c34dd6f7f8c8 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 16:03:49 +0200 Subject: [PATCH 13/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c0da0493..14cc9c7b 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ HiClass is an open-source Python library for hierarchical classification compati - **Pandas Series and DataFrames:** If you prefer to use pandas, that is not an issue as HiClass also works with Pandas. - **Sparse matrices:** HiClass also supports features (X_train and X_test) built with sparse matrices, both for training and predicting, which can save you heaps of memory. - **Parallel training:** Training can be performed in parallel on the hierarchical classifiers, which allows parallelization regardless of the implementations available on scikit-learn. -- **Build pipelines:** Since the hierarchical classifiers inherit from the BaseEstimator of scikit-learn, pipelines can be built to automate machine learning workflows. +- **Building pipelines:** Since the hierarchical classifiers inherit from the BaseEstimator of scikit-learn, pipelines can be built to automate machine learning workflows. - **Hierarchical metrics:** HiClass supports the computation of hierarchical precision, recall and f-score, which are more appropriate for hierarchical data than traditional metrics. - **Compatible with pickle:** Easily store trained models on disk for future use. From f829e79cad436951c0e0ae258eb75f5615fee059 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 16:04:22 +0200 Subject: [PATCH 14/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 14cc9c7b..c0da0493 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ HiClass is an open-source Python library for hierarchical classification compati - **Pandas Series and DataFrames:** If you prefer to use pandas, that is not an issue as HiClass also works with Pandas. - **Sparse matrices:** HiClass also supports features (X_train and X_test) built with sparse matrices, both for training and predicting, which can save you heaps of memory. - **Parallel training:** Training can be performed in parallel on the hierarchical classifiers, which allows parallelization regardless of the implementations available on scikit-learn. -- **Building pipelines:** Since the hierarchical classifiers inherit from the BaseEstimator of scikit-learn, pipelines can be built to automate machine learning workflows. +- **Build pipelines:** Since the hierarchical classifiers inherit from the BaseEstimator of scikit-learn, pipelines can be built to automate machine learning workflows. - **Hierarchical metrics:** HiClass supports the computation of hierarchical precision, recall and f-score, which are more appropriate for hierarchical data than traditional metrics. - **Compatible with pickle:** Easily store trained models on disk for future use. From 3a534100ad837485efbe9ff3ec9814699319c52a Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 16:06:01 +0200 Subject: [PATCH 15/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c0da0493..3336f1e2 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ HiClass is an open-source Python library for hierarchical classification compati - **Hierarchical metrics:** HiClass supports the computation of hierarchical precision, recall and f-score, which are more appropriate for hierarchical data than traditional metrics. - **Compatible with pickle:** Easily store trained models on disk for future use. -**Don't see a feature on this list?** Search our [issue tracker](https://github.com/mirand863/hiclass/issues) if someone has already requested it and add a comment to it explaining your use-case, or open a new issue if not. We prioritize our roadmap based on user feedback, so we'd love to hear from you. +**Any feature missing on this list?** Search our [issue tracker](https://github.com/mirand863/hiclass/issues) to see if someone has already requested it and add a comment to it explaining your use-case. Otherwise, please open a new issue describing the requested feature and possible use-case scenario. We prioritize our roadmap based on user feedback, so we would love to hear from you. ## Benchmarks From 20afce319e5e3432ad858e5ab5ff831116aac203 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 16 Jun 2022 16:07:49 +0200 Subject: [PATCH 16/56] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3336f1e2..976bc7eb 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ We would love to benchmark with larger datasets, if we can find them in the publ Here is our public roadmap: https://github.com/mirand863/hiclass/projects/1. -We do Just-In-Time planning, and we tend to reprioritize based on your feedback. Hence, items you see on this roadmap are subject to change. We prioritize features based on the number of people asking for it, features/fixes that are small enough and can be addressed while we work on other related features, features/fixes that help improve stability & relevance and features that address interesting use cases that excite us! If you'd like to have a request prioritized, we ask that you add a detailed use-case for it, either as a comment on an existing issue (besides a thumbs-up) or in a new issue. The detailed context helps. +We do Just-In-Time planning, and we tend to reprioritize based on your feedback. Hence, items you see on this roadmap are subject to change. We prioritize features based on the number of people asking for it, features/fixes that are small enough and can be addressed while we work on other related features, features/fixes that help improve stability & relevance and features that address interesting use cases that excite us! If you would like to have a request prioritized, we ask that you add a detailed use-case for it, either as a comment on an existing issue (besides a thumbs-up) or in a new issue. The detailed context helps. ## Who is using HiClass? From 5dafba03aa304f92e0b8d34e10a121e23fecbce0 Mon Sep 17 00:00:00 2001 From: Fabio Date: Fri, 24 Jun 2022 23:01:17 +0200 Subject: [PATCH 17/56] Refactor create digraph method --- hiclass/HierarchicalClassifier.py | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 3a5897ec..7ceb1162 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -191,43 +191,45 @@ def _create_digraph(self): # Save dtype of y_ self.dtype_ = self.y_.dtype - # 1D disguised as 2D + self._create_digraph_1d() + + self._create_digraph_2d() + + if self.y_.ndim > 2: + # Unsuported dimension + self.logger_.error(f"y with {self.y_.ndim} dimensions detected") + raise ValueError( + f"Creating graph from y with {self.y_.ndim} dimensions is not supported" + ) + + def _create_digraph_1d(self): + # Flatten 1D disguised as 2D if self.y_.ndim == 2 and self.y_.shape[1] == 1: self.logger_.info("Converting y to 1D") self.y_ = self.y_.flatten() + if self.y_.ndim == 1: + # Create max_levels_ variable + self.max_levels_ = 1 + self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels") + for label in self.y_: + self.hierarchy_.add_node(label) - # Check dimension of labels + def _create_digraph_2d(self): if self.y_.ndim == 2: - # 2D labels # Create max_levels variable self.max_levels_ = self.y_.shape[1] rows, columns = self.y_.shape self.logger_.info(f"Creating digraph from {rows} 2D labels") for row in range(rows): for column in range(columns - 1): - # Only add edge if both parent and child are not empty parent = self.y_[row, column].split(self.separator_)[-1] child = self.y_[row, column + 1].split(self.separator_)[-1] if parent != "" and child != "": + # Only add edge if both parent and child are not empty self.hierarchy_.add_edge( self.y_[row, column], self.y_[row, column + 1] ) - elif self.y_.ndim == 1: - # 1D labels - # Create max_levels_ variable - self.max_levels_ = 1 - self.logger_.info(f"Creating digraph from {self.y_.size} 1D labels") - for label in self.y_: - self.hierarchy_.add_node(label) - - else: - # Unsuported dimension - self.logger_.error(f"y with {self.y_.ndim} dimensions detected") - raise ValueError( - f"Creating graph from y with {self.y_.ndim} dimensions is not supported" - ) - def _export_digraph(self): # Check if edge_list is set if self.edge_list: From 42a76073658578910fbc85a849ebea4df96efe9b Mon Sep 17 00:00:00 2001 From: Fabio Date: Fri, 24 Jun 2022 23:06:10 +0200 Subject: [PATCH 18/56] Remove comments --- hiclass/LocalClassifierPerLevel.py | 2 -- hiclass/LocalClassifierPerNode.py | 2 -- hiclass/LocalClassifierPerParentNode.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index dc744798..414b0d97 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -107,8 +107,6 @@ def fit(self, X, y): # TODO: Add parameter to receive hierarchy as parameter in constructor - # TODO: Add support to empty labels in some levels - # Return the classifier return self diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index 78a5a3f9..eb28964b 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -109,8 +109,6 @@ def fit(self, X, y): # TODO: Add parameter to receive hierarchy as parameter in constructor - # TODO: Add support to empty labels in some levels - # Return the classifier return self diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 72ac398e..f1b38721 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -102,8 +102,6 @@ def fit(self, X, y): # TODO: Add parameter to receive hierarchy as parameter in constructor - # TODO: Add support to empty labels in some levels - # Return the classifier return self From a153cfe5c8bc5a3e2e98c30350d6969b908ad2f3 Mon Sep 17 00:00:00 2001 From: Fabio Date: Fri, 24 Jun 2022 23:52:38 +0200 Subject: [PATCH 19/56] Fix bug --- hiclass/HierarchicalClassifier.py | 8 ++++---- tests/test_HierarchicalClassifier.py | 15 +++++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 7ceb1162..e4bc8be3 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -39,7 +39,7 @@ def _make_leveled(y): except TypeError: return y leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] - return leveled_y + return np.array(leveled_y) class HierarchicalClassifier(abc.ABC): @@ -119,12 +119,12 @@ def _pre_fit(self, X, y): # Check that X and y have correct shape # and convert them to np.ndarray if need be - leveled_y = _make_leveled(y) - self.X_, self.y_ = self._validate_data( - X, leveled_y, multi_output=True, accept_sparse="csr" + X, y, multi_output=True, accept_sparse="csr" ) + self.y_ = _make_leveled(self.y_) + # Create and configure logger self._create_logger() diff --git a/tests/test_HierarchicalClassifier.py b/tests/test_HierarchicalClassifier.py index 4d8377d8..e381b07c 100644 --- a/tests/test_HierarchicalClassifier.py +++ b/tests/test_HierarchicalClassifier.py @@ -190,12 +190,15 @@ def empty_levels(): def test_make_leveled(empty_levels): - ground_truth = [ - ["a", "", ""], - ["b", "c", ""], - ["d", "e", "f"], - ] - assert ground_truth == _make_leveled(empty_levels) + ground_truth = np.array( + [ + ["a", "", ""], + ["b", "c", ""], + ["d", "e", "f"], + ] + ) + result = _make_leveled(empty_levels) + assert_array_equal(ground_truth, result) @pytest.fixture From 5fd0253265cc5e86c2dd6a8bf5fdb2a0970445f6 Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 00:04:07 +0200 Subject: [PATCH 20/56] Simplify example --- docs/examples/plot_parallel_training.py | 26 ++++++------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/docs/examples/plot_parallel_training.py b/docs/examples/plot_parallel_training.py index e90712af..25ad13db 100644 --- a/docs/examples/plot_parallel_training.py +++ b/docs/examples/plot_parallel_training.py @@ -25,29 +25,15 @@ from hiclass import LocalClassifierPerParentNode -def download(url: str, path: str) -> None: - """ - Download a file from the internet. - - Parameters - ---------- - url : str - The address of the file to be downloaded. - path : str - The path to store the downloaded file. - """ - response = requests.get(url) - with open(path, "wb") as file: - file.write(response.content) - - # Download training data -training_data_url = "https://zenodo.org/record/6657410/files/train_40k.csv?download=1" -training_data_path = "train_40k.csv" -download(training_data_url, training_data_path) +url = "https://zenodo.org/record/6657410/files/train_40k.csv?download=1" +path = "train_40k.csv" +response = requests.get(url) +with open(path, "wb") as file: + file.write(response.content) # Load training data into pandas dataframe -training_data = pd.read_csv(training_data_path).fillna(" ") +training_data = pd.read_csv(path).fillna(" ") # We will use logistic regression classifiers for every parent node lr = LogisticRegression(max_iter=1000) From b5d1cd035c87e5f5ee0dc8b87372dd257ebfe30c Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 00:13:40 +0200 Subject: [PATCH 21/56] Add n_jobs to introduction --- docs/examples/plot_parallel_training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/examples/plot_parallel_training.py b/docs/examples/plot_parallel_training.py index 25ad13db..39b25d34 100644 --- a/docs/examples/plot_parallel_training.py +++ b/docs/examples/plot_parallel_training.py @@ -8,7 +8,8 @@ While by default the models in HiClass are trained using a single core, it is possible to train each local classifier in parallel by leveraging the library Ray [1]_. In this example, we demonstrate how to train a hierarchical classifier in parallel, -using all the cores available, on a mock dataset from Kaggle [2]_. +setting the parameter :literal:`n_jobs` to use all the cores available. Training +is performed on a mock dataset from Kaggle [2]_. .. [1] https://www.ray.io/ .. [2] https://www.kaggle.com/datasets/kashnitsky/hierarchical-text-classification From 13c11e91a5b461b703af3bef5ec9dd32382a4945 Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 00:23:55 +0200 Subject: [PATCH 22/56] Remove comma --- docs/examples/plot_parallel_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/examples/plot_parallel_training.py b/docs/examples/plot_parallel_training.py index 39b25d34..6ecbab09 100644 --- a/docs/examples/plot_parallel_training.py +++ b/docs/examples/plot_parallel_training.py @@ -7,7 +7,7 @@ Larger datasets require more time for training. While by default the models in HiClass are trained using a single core, it is possible to train each local classifier in parallel by leveraging the library Ray [1]_. -In this example, we demonstrate how to train a hierarchical classifier in parallel, +In this example, we demonstrate how to train a hierarchical classifier in parallel by setting the parameter :literal:`n_jobs` to use all the cores available. Training is performed on a mock dataset from Kaggle [2]_. From 4affeb756227e7da76e44e5d6f7f9e1b4763326e Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 00:38:54 +0200 Subject: [PATCH 23/56] Expand table of contents --- docs/examples/README.rst | 5 ++++- docs/source/index.rst | 9 +++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/examples/README.rst b/docs/examples/README.rst index 90ce2230..7cc24de4 100644 --- a/docs/examples/README.rst +++ b/docs/examples/README.rst @@ -1,4 +1,7 @@ Gallery of Examples =================== -These examples illustrate the main features of HiClass. \ No newline at end of file +These examples illustrate the main features of HiClass. + +.. toctree:: + :hidden: diff --git a/docs/source/index.rst b/docs/source/index.rst index 443dd6a2..79cd177e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,14 +31,11 @@ Welcome to hiclass' documentation! :alt: License .. toctree:: - :titlesonly: + :includehidden: + :maxdepth: 3 introduction/index get_started/index auto_examples/index algorithms/index - -.. toctree:: - :maxdepth: 3 - - api/index + api/index From 31abb680b4ca6bf92dd7da91ff1d444b74a52f2a Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 01:05:10 +0200 Subject: [PATCH 24/56] Add example for empty levels --- docs/examples/plot_empty_levels.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 docs/examples/plot_empty_levels.py diff --git a/docs/examples/plot_empty_levels.py b/docs/examples/plot_empty_levels.py new file mode 100644 index 00000000..6c8ce6ad --- /dev/null +++ b/docs/examples/plot_empty_levels.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +""" +========================== +Different Number of Levels +========================== + +HiClass supports different number of levels in the hierarchy. +For this example, we will train a local classifier per node +with the following hierarchy: + +.. figure:: ../algorithms/local_classifier_per_node.svg + :align: center +""" +from sklearn.linear_model import LogisticRegression + +from hiclass import LocalClassifierPerNode + +# Define data +X_train = [[1], [2], [3], [4]] +X_test = [[4], [3], [2], [1]] +Y_train = [ + ["Reptile", "Snake"], + ["Reptile", "Lizard"], + ["Mammal", "Cat"], + ["Mammal", "Wolf", "Dog"], +] + +# Use random forest classifiers for every node +rf = LogisticRegression() +classifier = LocalClassifierPerNode(local_classifier=rf) + +# Train local classifier per node +classifier.fit(X_train, Y_train) + +# Predict +predictions = classifier.predict(X_test) +print(predictions) From b06068325378f8e55acf57400bb52786216e66c3 Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 01:29:39 +0200 Subject: [PATCH 25/56] Update tests to cover 1 column only --- tests/test_LocalClassifierPerLevel.py | 42 ++++++++-------------- tests/test_LocalClassifierPerNode.py | 42 ++++++++-------------- tests/test_LocalClassifierPerParentNode.py | 42 ++++++++-------------- 3 files changed, 45 insertions(+), 81 deletions(-) diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py index b1aa78fa..43f5aabe 100644 --- a/tests/test_LocalClassifierPerLevel.py +++ b/tests/test_LocalClassifierPerLevel.py @@ -146,20 +146,14 @@ def test_fit_predict(): @pytest.fixture def empty_levels(): X = [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [1], + [2], + [3], ] y = [ - ["A", "1"], - ["A", "2"], - ["A", "C", "5"], - ["A", "C", "6"], - ["B", "3"], - ["B", "4"], + ["1"], + ["2", "2.1"], + ["3", "3.1", "3.1.2"], ] return X, y @@ -170,23 +164,17 @@ def test_empty_levels(empty_levels): lcppn.fit(X, y) predictions = lcppn.predict(X) ground_truth = [ - ["A", "1", ""], - ["A", "2", ""], - ["A", "C", "5"], - ["A", "C", "6"], - ["B", "3", ""], - ["B", "4", ""], + ["1", "", ""], + ["2", "2.1", ""], + ["3", "3.1", "3.1.2"], ] assert list(lcppn.hierarchy_.nodes) == [ - "A", - "A" + lcppn.separator_ + "1", - "A" + lcppn.separator_ + "2", - "A" + lcppn.separator_ + "C", - "A" + lcppn.separator_ + "C" + lcppn.separator_ + "5", - "A" + lcppn.separator_ + "C" + lcppn.separator_ + "6", - "B", - "B" + lcppn.separator_ + "3", - "B" + lcppn.separator_ + "4", + "1", + "2", + "2" + lcppn.separator_ + "2.1", + "3", + "3" + lcppn.separator_ + "3.1", + "3" + lcppn.separator_ + "3.1" + lcppn.separator_ + "3.1.2", lcppn.root_, ] assert_array_equal(ground_truth, predictions) diff --git a/tests/test_LocalClassifierPerNode.py b/tests/test_LocalClassifierPerNode.py index 2ed37b0f..9a51deeb 100644 --- a/tests/test_LocalClassifierPerNode.py +++ b/tests/test_LocalClassifierPerNode.py @@ -207,20 +207,14 @@ def test_fit_predict(): @pytest.fixture def empty_levels(): X = [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [1], + [2], + [3], ] y = [ - ["A", "1"], - ["A", "2"], - ["A", "C", "5"], - ["A", "C", "6"], - ["B", "3"], - ["B", "4"], + ["1"], + ["2", "2.1"], + ["3", "3.1", "3.1.2"], ] return X, y @@ -231,23 +225,17 @@ def test_empty_levels(empty_levels): lcppn.fit(X, y) predictions = lcppn.predict(X) ground_truth = [ - ["A", "1", ""], - ["A", "2", ""], - ["A", "C", "5"], - ["A", "C", "6"], - ["B", "3", ""], - ["B", "4", ""], + ["1", "", ""], + ["2", "2.1", ""], + ["3", "3.1", "3.1.2"], ] assert list(lcppn.hierarchy_.nodes) == [ - "A", - "A" + lcppn.separator_ + "1", - "A" + lcppn.separator_ + "2", - "A" + lcppn.separator_ + "C", - "A" + lcppn.separator_ + "C" + lcppn.separator_ + "5", - "A" + lcppn.separator_ + "C" + lcppn.separator_ + "6", - "B", - "B" + lcppn.separator_ + "3", - "B" + lcppn.separator_ + "4", + "1", + "2", + "2" + lcppn.separator_ + "2.1", + "3", + "3" + lcppn.separator_ + "3.1", + "3" + lcppn.separator_ + "3.1" + lcppn.separator_ + "3.1.2", lcppn.root_, ] assert_array_equal(ground_truth, predictions) diff --git a/tests/test_LocalClassifierPerParentNode.py b/tests/test_LocalClassifierPerParentNode.py index 0f0cb757..80837f95 100644 --- a/tests/test_LocalClassifierPerParentNode.py +++ b/tests/test_LocalClassifierPerParentNode.py @@ -188,20 +188,14 @@ def test_fit_predict(): @pytest.fixture def empty_levels(): X = [ - [1, 2], - [3, 4], - [5, 6], - [7, 8], - [9, 10], - [11, 12], + [1], + [2], + [3], ] y = [ - ["A", "1"], - ["A", "2"], - ["A", "C", "5"], - ["A", "C", "6"], - ["B", "3"], - ["B", "4"], + ["1"], + ["2", "2.1"], + ["3", "3.1", "3.1.2"], ] return X, y @@ -212,23 +206,17 @@ def test_empty_levels(empty_levels): lcppn.fit(X, y) predictions = lcppn.predict(X) ground_truth = [ - ["A", "1", ""], - ["A", "2", ""], - ["A", "C", "5"], - ["A", "C", "6"], - ["B", "3", ""], - ["B", "4", ""], + ["1", "", ""], + ["2", "2.1", ""], + ["3", "3.1", "3.1.2"], ] assert list(lcppn.hierarchy_.nodes) == [ - "A", - "A" + lcppn.separator_ + "1", - "A" + lcppn.separator_ + "2", - "A" + lcppn.separator_ + "C", - "A" + lcppn.separator_ + "C" + lcppn.separator_ + "5", - "A" + lcppn.separator_ + "C" + lcppn.separator_ + "6", - "B", - "B" + lcppn.separator_ + "3", - "B" + lcppn.separator_ + "4", + "1", + "2", + "2" + lcppn.separator_ + "2.1", + "3", + "3" + lcppn.separator_ + "3.1", + "3" + lcppn.separator_ + "3.1" + lcppn.separator_ + "3.1.2", lcppn.root_, ] assert_array_equal(ground_truth, predictions) From 175786982e13282168f728c5fcfb80fccb949a87 Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 01:30:43 +0200 Subject: [PATCH 26/56] Update tests to cover 1 column only --- hiclass/HierarchicalClassifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index e4bc8be3..62fa7294 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -222,6 +222,7 @@ def _create_digraph_2d(self): self.logger_.info(f"Creating digraph from {rows} 2D labels") for row in range(rows): for column in range(columns - 1): + print(column) parent = self.y_[row, column].split(self.separator_)[-1] child = self.y_[row, column + 1].split(self.separator_)[-1] if parent != "" and child != "": From 234483198761814e29d33fda4e30bac09008eddf Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 01:36:45 +0200 Subject: [PATCH 27/56] Fix lcpn and lcppn --- hiclass/HierarchicalClassifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 62fa7294..142f3574 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -222,7 +222,6 @@ def _create_digraph_2d(self): self.logger_.info(f"Creating digraph from {rows} 2D labels") for row in range(rows): for column in range(columns - 1): - print(column) parent = self.y_[row, column].split(self.separator_)[-1] child = self.y_[row, column + 1].split(self.separator_)[-1] if parent != "" and child != "": @@ -230,6 +229,8 @@ def _create_digraph_2d(self): self.hierarchy_.add_edge( self.y_[row, column], self.y_[row, column + 1] ) + elif parent != "" and column == 0: + self.hierarchy_.add_node(parent) def _export_digraph(self): # Check if edge_list is set From 11f515b21b4f1ec9f5da3a0ffbcd5b65106e327f Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 01:45:27 +0200 Subject: [PATCH 28/56] Update example --- docs/examples/plot_empty_levels.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/examples/plot_empty_levels.py b/docs/examples/plot_empty_levels.py index 6c8ce6ad..89bdcd43 100644 --- a/docs/examples/plot_empty_levels.py +++ b/docs/examples/plot_empty_levels.py @@ -5,20 +5,21 @@ ========================== HiClass supports different number of levels in the hierarchy. -For this example, we will train a local classifier per node -with the following hierarchy: +For this example, we will train a local classifier per parent node +with a hierarchy similar to the following image: -.. figure:: ../algorithms/local_classifier_per_node.svg +.. figure:: ../algorithms/local_classifier_per_parent_node.svg :align: center """ from sklearn.linear_model import LogisticRegression -from hiclass import LocalClassifierPerNode +from hiclass import LocalClassifierPerParentNode # Define data -X_train = [[1], [2], [3], [4]] -X_test = [[4], [3], [2], [1]] +X_train = [[1], [2], [3], [4], [5]] +X_test = [[5], [4], [3], [2], [1]] Y_train = [ + ["Bird"], ["Reptile", "Snake"], ["Reptile", "Lizard"], ["Mammal", "Cat"], @@ -27,7 +28,7 @@ # Use random forest classifiers for every node rf = LogisticRegression() -classifier = LocalClassifierPerNode(local_classifier=rf) +classifier = LocalClassifierPerParentNode(local_classifier=rf) # Train local classifier per node classifier.fit(X_train, Y_train) From df8f451670a5276a4059de3dec8859349df52930 Mon Sep 17 00:00:00 2001 From: Fabio Date: Sat, 25 Jun 2022 01:52:02 +0200 Subject: [PATCH 29/56] Update example --- docs/examples/plot_empty_levels.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/examples/plot_empty_levels.py b/docs/examples/plot_empty_levels.py index 89bdcd43..f3af3469 100644 --- a/docs/examples/plot_empty_levels.py +++ b/docs/examples/plot_empty_levels.py @@ -5,19 +5,19 @@ ========================== HiClass supports different number of levels in the hierarchy. -For this example, we will train a local classifier per parent node +For this example, we will train a local classifier per node with a hierarchy similar to the following image: -.. figure:: ../algorithms/local_classifier_per_parent_node.svg +.. figure:: ../algorithms/local_classifier_per_node.svg :align: center """ from sklearn.linear_model import LogisticRegression -from hiclass import LocalClassifierPerParentNode +from hiclass import LocalClassifierPerNode # Define data -X_train = [[1], [2], [3], [4], [5]] -X_test = [[5], [4], [3], [2], [1]] +X_train = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] +X_test = [[9, 10], [7, 8], [5, 6], [3, 4], [1, 2]] Y_train = [ ["Bird"], ["Reptile", "Snake"], @@ -28,7 +28,7 @@ # Use random forest classifiers for every node rf = LogisticRegression() -classifier = LocalClassifierPerParentNode(local_classifier=rf) +classifier = LocalClassifierPerNode(local_classifier=rf) # Train local classifier per node classifier.fit(X_train, Y_train) From 77639937902d76c8577157fc9b9dac61c267cbe2 Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 18:14:22 +0200 Subject: [PATCH 30/56] Remove addopts --- setup.cfg | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 777b5950..df142dd7 100755 --- a/setup.cfg +++ b/setup.cfg @@ -1,11 +1,7 @@ [tool:pytest] testpaths=hiclass tests -addopts = --flake8 - --pydocstyle - --cov=hiclass - --cov-fail-under=90 - --cov-report html - --disable-warnings +addopts = --disable-warnings + --color=yes --ignore=hiclass/_version.py, [flake8] From c9621b7a9eb93998fca81dbde144ba42c38af10d Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 18:15:52 +0200 Subject: [PATCH 31/56] Add pytest options --- .github/workflows/deploy-pypi.yml | 2 +- .github/workflows/test-pr.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-pypi.yml b/.github/workflows/deploy-pypi.yml index 43a49706..65e11e95 100644 --- a/.github/workflows/deploy-pypi.yml +++ b/.github/workflows/deploy-pypi.yml @@ -30,7 +30,7 @@ jobs: python -m pip install . - name: Test with pytest run: | - pytest -v + pytest -v --flake8 --pydocstyle --cov=hiclass --cov-fail-under=90 --cov-report html coverage xml - name: Upload Coverage to Codecov if: matrix.os == 'ubuntu-latest' diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml index 221cc142..03c0231f 100644 --- a/.github/workflows/test-pr.yml +++ b/.github/workflows/test-pr.yml @@ -29,4 +29,4 @@ jobs: python -m pip install . - name: Test with pytest run: | - pytest -v + pytest -v --flake8 --pydocstyle --cov=hiclass --cov-fail-under=90 --cov-report html From fe3f05925825cba00e841e46babed7ae22875505 Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 18:35:12 +0200 Subject: [PATCH 32/56] Refactor _fit_digraph --- hiclass/HierarchicalClassifier.py | 13 +++++++++++++ hiclass/LocalClassifierPerNode.py | 9 +-------- hiclass/LocalClassifierPerParentNode.py | 9 +-------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 142f3574..e42e2ba4 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -7,6 +7,8 @@ from sklearn.base import BaseEstimator from sklearn.linear_model import LogisticRegression +from hiclass.ConstantClassifier import ConstantClassifier + def _make_leveled(y): """ @@ -276,6 +278,17 @@ def _initialize_local_classifiers(self): else: self.local_classifier_ = self.local_classifier + def _replace_constant_classifier(self, y, node, classifier): + unique_y = np.unique(y) + if len(unique_y) == 1 and self.replace_classifiers: + node_name = str(node).split(self.separator_)[-1] + self.logger_.warning( + f"Fitting ConstantClassifier for node '{node_name}'" + ) + self.hierarchy_.nodes[node]["classifier"] = ConstantClassifier() + classifier = self.hierarchy_.nodes[node]["classifier"] + return classifier + def _clean_up(self): self.logger_.info("Cleaning up variables that can take a lot of disk space") del self.X_ diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index eb28964b..05c8bf15 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -236,14 +236,7 @@ def _fit_digraph(self): ) classifier = self.hierarchy_.nodes[node]["classifier"] X, y = self.binary_policy_.get_binary_examples(node) - unique_y = np.unique(y) - if len(unique_y) == 1 and self.replace_classifiers: - node_name = str(node).split(self.separator_)[-1] - self.logger_.warning( - f"Fitting ConstantClassifier for node '{node_name}'" - ) - self.hierarchy_.nodes[node]["classifier"] = ConstantClassifier() - classifier = self.hierarchy_.nodes[node]["classifier"] + classifier = self._replace_constant_classifier(y, node, classifier) classifier.fit(X, y) def _clean_up(self): diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index f1b38721..af31fd6f 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -216,12 +216,5 @@ def _fit_digraph(self): classifier = self.hierarchy_.nodes[node]["classifier"] # get children examples X, y = self._get_successors(node) - unique_y = np.unique(y) - if len(unique_y) == 1 and self.replace_classifiers: - node_name = str(node).split(self.separator_)[-1] - self.logger_.warning( - f"Fitting ConstantClassifier for node '{node_name}'" - ) - self.hierarchy_.nodes[node]["classifier"] = ConstantClassifier() - classifier = self.hierarchy_.nodes[node]["classifier"] + classifier = self._replace_constant_classifier(y, node, classifier) classifier.fit(X, y) From 840b21e4177368c1d1028aca6c9f1eb331876c6e Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 20:53:32 +0200 Subject: [PATCH 33/56] Refactor predict method for local classifier per parent node --- hiclass/HierarchicalClassifier.py | 14 +++++++ hiclass/LocalClassifierPerLevel.py | 12 ++---- hiclass/LocalClassifierPerNode.py | 12 ++---- hiclass/LocalClassifierPerParentNode.py | 51 +++++++++++-------------- 4 files changed, 43 insertions(+), 46 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index e42e2ba4..29f39666 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -289,6 +289,20 @@ def _replace_constant_classifier(self, y, node, classifier): classifier = self.hierarchy_.nodes[node]["classifier"] return classifier + def _convert_to_1d(self, y): + # Convert back to 1D if there is only 1 column to pass all sklearn's checks + if self.max_levels_ == 1: + y = y.flatten() + return y + + def _remove_separator(self, y): + # Remove separator from predictions + if y.ndim == 2: + for i in range(y.shape[0]): + for j in range(1, y.shape[1]): + y[i, j] = y[i, j].split(self.separator_)[-1] + return y + def _clean_up(self): self.logger_.info("Cleaning up variables that can take a lot of disk space") del self.X_ diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 414b0d97..2f375b48 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -185,15 +185,9 @@ def predict(self, X): ] ) - # Convert back to 1D if there is only 1 column to pass all sklearn's checks - if self.max_levels_ == 1: - y = y.flatten() - - # Remove separator from predictions - if y.ndim == 2: - for i in range(y.shape[0]): - for j in range(1, y.shape[1]): - y[i, j] = y[i, j].split(self.separator_)[-1] + y = self._convert_to_1d(y) + + y = self._remove_separator(y) return y diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index 05c8bf15..e7765128 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -170,15 +170,9 @@ def predict(self, X): prediction = np.array(prediction) y[mask, level] = prediction - # Convert back to 1D if there is only 1 column to pass all sklearn's checks - if self.max_levels_ == 1: - y = y.flatten() - - # Remove separator from predictions - if y.ndim == 2: - for i in range(y.shape[0]): - for j in range(1, y.shape[1]): - y[i, j] = y[i, j].split(self.separator_)[-1] + y = self._convert_to_1d(y) + + y = self._remove_separator(y) return y diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index af31fd6f..7d99a96a 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -128,43 +128,38 @@ def predict(self, X): # Input validation X = check_array(X, accept_sparse="csr") + # Initialize array with predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) # TODO: Add threshold to stop prediction halfway if need be - bfs = nx.bfs_successors(self.hierarchy_, source=self.root_) - self.logger_.info("Predicting") - for predecessor, successors in bfs: - if predecessor == self.root_: - mask = [True] * X.shape[0] - subset_x = X[mask] - else: - mask = np.isin(y, predecessor).any(axis=1) - subset_x = X[mask] - if subset_x.shape[0] > 0: - classifier = self.hierarchy_.nodes[predecessor]["classifier"] - prediction = classifier.predict(subset_x) - level = nx.shortest_path_length( - self.hierarchy_, self.root_, predecessor - ) - if prediction.ndim == 2 and prediction.shape[1] == 1: - prediction = prediction.flatten() - y[mask, level] = prediction - - # Convert back to 1D if there is only 1 column to pass all sklearn's checks - if self.max_levels_ == 1: - y = y.flatten() - - # Remove separator from predictions - if y.ndim == 2: - for i in range(y.shape[0]): - for j in range(1, y.shape[1]): - y[i, j] = y[i, j].split(self.separator_)[-1] + # Predict first level + classifier = self.hierarchy_.nodes[self.root_]["classifier"] + y[:, 0] = classifier.predict(X) + + self._predict_remaining_levels(X, y) + + y = self._convert_to_1d(y) + + y = self._remove_separator(y) return y + def _predict_remaining_levels(self, X, y): + for level in range(1, y.shape[1]): + predecessors = set(y[:, level - 1]) + predecessors.discard("") + for predecessor in predecessors: + mask = np.isin(y, predecessor).any(axis=1) + predecessor_x = X[mask] + if predecessor_x.shape[0] > 0: + successors = list(self.hierarchy_.successors(predecessor)) + if len(successors) > 0: + classifier = self.hierarchy_.nodes[predecessor]["classifier"] + y[mask, level] = classifier.predict(predecessor_x).flatten() + def _initialize_local_classifiers(self): super()._initialize_local_classifiers() local_classifiers = {} From 4f2e695167c1c9fb61983db154b91bda3ef40480 Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 21:00:38 +0200 Subject: [PATCH 34/56] Enforce finding successors only on previous level --- hiclass/LocalClassifierPerParentNode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 7d99a96a..52ccaf5f 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -128,7 +128,7 @@ def predict(self, X): # Input validation X = check_array(X, accept_sparse="csr") - # Initialize array with predictions + # Initialize array that holds predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) # TODO: Add threshold to stop prediction halfway if need be @@ -152,7 +152,7 @@ def _predict_remaining_levels(self, X, y): predecessors = set(y[:, level - 1]) predecessors.discard("") for predecessor in predecessors: - mask = np.isin(y, predecessor).any(axis=1) + mask = np.isin(y[:, level - 1], predecessor) predecessor_x = X[mask] if predecessor_x.shape[0] > 0: successors = list(self.hierarchy_.successors(predecessor)) From 67d03c0a1fbb05e7aac97d958a5d82caa25c82a5 Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 21:11:02 +0200 Subject: [PATCH 35/56] Remove redundant return statement --- hiclass/HierarchicalClassifier.py | 1 - hiclass/LocalClassifierPerLevel.py | 2 +- hiclass/LocalClassifierPerNode.py | 2 +- hiclass/LocalClassifierPerParentNode.py | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 29f39666..f8e5207c 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -301,7 +301,6 @@ def _remove_separator(self, y): for i in range(y.shape[0]): for j in range(1, y.shape[1]): y[i, j] = y[i, j].split(self.separator_)[-1] - return y def _clean_up(self): self.logger_.info("Cleaning up variables that can take a lot of disk space") diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 2f375b48..a6c4af49 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -187,7 +187,7 @@ def predict(self, X): y = self._convert_to_1d(y) - y = self._remove_separator(y) + self._remove_separator(y) return y diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index e7765128..ffefc17c 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -172,7 +172,7 @@ def predict(self, X): y = self._convert_to_1d(y) - y = self._remove_separator(y) + self._remove_separator(y) return y diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 52ccaf5f..b4484569 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -143,7 +143,7 @@ def predict(self, X): y = self._convert_to_1d(y) - y = self._remove_separator(y) + self._remove_separator(y) return y From 223c139e25fecfb8bfa258e9c0ef092823551839 Mon Sep 17 00:00:00 2001 From: Fabio Date: Tue, 28 Jun 2022 22:04:33 +0200 Subject: [PATCH 36/56] Update comment --- hiclass/HierarchicalClassifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index f8e5207c..2a9c2c08 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -290,7 +290,7 @@ def _replace_constant_classifier(self, y, node, classifier): return classifier def _convert_to_1d(self, y): - # Convert back to 1D if there is only 1 column to pass all sklearn's checks + # Convert predictions to 1D if there is only 1 column if self.max_levels_ == 1: y = y.flatten() return y From 4de0dbd7d16bbf248552799dcc9b4f086672ad25 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 00:14:54 +0200 Subject: [PATCH 37/56] Refactor predict() method for local classifier per level --- hiclass/HierarchicalClassifier.py | 4 +- hiclass/LocalClassifierPerLevel.py | 95 ++++++++++++------------- hiclass/LocalClassifierPerParentNode.py | 2 +- 3 files changed, 46 insertions(+), 55 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 2a9c2c08..a4a7f64e 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -282,9 +282,7 @@ def _replace_constant_classifier(self, y, node, classifier): unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: node_name = str(node).split(self.separator_)[-1] - self.logger_.warning( - f"Fitting ConstantClassifier for node '{node_name}'" - ) + self.logger_.warning(f"Fitting ConstantClassifier for node '{node_name}'") self.hierarchy_.nodes[node]["classifier"] = ConstantClassifier() classifier = self.hierarchy_.nodes[node]["classifier"] return classifier diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index a6c4af49..145b49a0 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -29,7 +29,15 @@ def _parallel_fit(lcpl, level, separator): if len(unique_y) == 1 and lcpl.replace_classifiers: classifier = ConstantClassifier() classifier.fit(X, y) - return mask, classifier + return classifier + + +def _get_successors_probability(probabilities_dict, successors): + successors_probability = [ + np.array([probabilities_dict[i][successor] for successor in successors_list]) + for i, successors_list in enumerate(successors) + ] + return successors_probability class LocalClassifierPerLevel(BaseEstimator, HierarchicalClassifier): @@ -139,51 +147,11 @@ def predict(self, X): self.logger_.info("Predicting") - for level, classifier in enumerate(self.local_classifiers_): - self.logger_.info(f"Predicting level {level}") - if level == 0: - y[:, level] = classifier.predict(X).flatten() - else: - all_probabilities = classifier.predict_proba(X) - successors = np.array( - [ - list(self.hierarchy_.successors(node)) - for node in y[:, level - 1] - ], - dtype=object, - ) - classes_masks = np.array( - [ - np.isin(classifier.classes_, successors[i]) - for i in range(len(successors)) - ] - ) - probabilities = np.array( - [ - all_probabilities[i, classes_masks[i]] - for i in range(len(classes_masks)) - ], - dtype=object, - ) - highest_probabilities = [ - np.argmax(probabilities[i], axis=0) - for i in range(len(probabilities)) - if len(probabilities[i] > 0) - ] - classes = np.array( - [ - classifier.classes_[classes_masks[i]] - for i in range(len(classes_masks)) - ], - dtype=object, - ) - classes = classes[self.masks_[level]] - y[self.masks_[level], level] = np.array( - [ - classes[i][highest_probabilities[i]] - for i in range(len(highest_probabilities)) - ] - ) + # Predict first level + classifier = self.local_classifiers_[0] + y[:, 0] = classifier.predict(X).flatten() + + self._predict_remaining_levels(X, y) y = self._convert_to_1d(y) @@ -191,6 +159,35 @@ def predict(self, X): return y + def _predict_remaining_levels(self, X, y): + for level in range(1, y.shape[1]): + classifier = self.local_classifiers_[level] + probabilities = classifier.predict_proba(X) + classes = self.local_classifiers_[level].classes_ + probabilities_dict = [dict(zip(classes, prob)) for prob in probabilities] + successors = self._get_successors(y[:, level - 1]) + successors_prob = _get_successors_probability( + probabilities_dict, successors + ) + max_probability = [ + np.argmax(prob) if len(prob) > 0 else None for prob in successors_prob + ] + y[:, level] = [ + successors_list[max_probability[i]] + if max_probability[i] is not None + else "" + for i, successors_list in enumerate(successors) + ] + + def _get_successors(self, level): + successors = [ + list(self.hierarchy_.successors(node)) + if self.hierarchy_.has_node(node) + else [] + for node in level + ] + return successors + def _initialize_local_classifiers(self): super()._initialize_local_classifiers() self.local_classifiers_ = [ @@ -215,9 +212,6 @@ def _fit_digraph(self): X = X[mask] y = y[mask] - # Store mask for current level - self.masks_[level] = mask - unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: self.logger_.warning( @@ -236,6 +230,5 @@ def _fit_digraph_parallel(self, local_mode: bool = False): for level in range(len(self.local_classifiers_)) ] classifiers = ray.get(results) - for level, (mask, classifier) in enumerate(classifiers): - self.masks_[level] = mask + for level, classifier in enumerate(classifiers): self.local_classifiers_[level] = classifier diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index b4484569..4e66964d 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -137,7 +137,7 @@ def predict(self, X): # Predict first level classifier = self.hierarchy_.nodes[self.root_]["classifier"] - y[:, 0] = classifier.predict(X) + y[:, 0] = classifier.predict(X).flatten() self._predict_remaining_levels(X, y) From 0daedf9e754b7d8d7020858eb319ce52978f9b53 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 00:18:57 +0200 Subject: [PATCH 38/56] Refactor variable --- hiclass/LocalClassifierPerLevel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 145b49a0..ed3f1417 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -169,12 +169,12 @@ def _predict_remaining_levels(self, X, y): successors_prob = _get_successors_probability( probabilities_dict, successors ) - max_probability = [ + index_max_probability = [ np.argmax(prob) if len(prob) > 0 else None for prob in successors_prob ] y[:, level] = [ - successors_list[max_probability[i]] - if max_probability[i] is not None + successors_list[index_max_probability[i]] + if index_max_probability[i] is not None else "" for i, successors_list in enumerate(successors) ] From e5e49c195bfe99b723990d4ed908356c94fc7677 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 00:28:49 +0200 Subject: [PATCH 39/56] Refactor repeated code --- hiclass/LocalClassifierPerLevel.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index ed3f1417..8be52dca 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -19,12 +19,9 @@ def _parallel_fit(lcpl, level, separator): classifier = lcpl.local_classifiers_[level] X = lcpl.X_ y = lcpl.y_[:, level] - # Detect empty leaf nodes - leaves = np.array([str(i).split(separator)[-1] for i in y]) - mask = leaves != "" - # Remove rows with empty leaf nodes - X = X[mask] - y = y[mask] + + X, y = _remove_empty_leaves(separator, X, y) + unique_y = np.unique(y) if len(unique_y) == 1 and lcpl.replace_classifiers: classifier = ConstantClassifier() @@ -32,6 +29,13 @@ def _parallel_fit(lcpl, level, separator): return classifier +def _remove_empty_leaves(separator, X, y): + # Detect rows where leaves are not empty + leaves = np.array([str(i).split(separator)[-1] for i in y]) + mask = leaves != "" + return X[mask], y[mask] + + def _get_successors_probability(probabilities_dict, successors): successors_probability = [ np.array([probabilities_dict[i][successor] for successor in successors_list]) @@ -204,13 +208,7 @@ def _fit_digraph(self): X = self.X_ y = self.y_[:, level] - # Detect empty leaf nodes - leaves = np.array([str(i).split(self.separator_)[-1] for i in y]) - mask = leaves != "" - - # Remove rows with empty leaf nodes - X = X[mask] - y = y[mask] + X, y = _remove_empty_leaves(self.separator_, X, y) unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: From 468a775f920f75dd06274332a9a5b00b3b5212a5 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 00:30:30 +0200 Subject: [PATCH 40/56] Remove useless variables --- hiclass/LocalClassifierPerLevel.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 8be52dca..6d1e22e3 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -17,10 +17,8 @@ @ray.remote def _parallel_fit(lcpl, level, separator): classifier = lcpl.local_classifiers_[level] - X = lcpl.X_ - y = lcpl.y_[:, level] - X, y = _remove_empty_leaves(separator, X, y) + X, y = _remove_empty_leaves(separator, lcpl.X_, lcpl.y_[:, level]) unique_y = np.unique(y) if len(unique_y) == 1 and lcpl.replace_classifiers: @@ -205,10 +203,8 @@ def _fit_digraph(self): self.logger_.info( f"Fitting local classifier for level '{level + 1}' ({level + 1}/{len(self.local_classifiers_)})" ) - X = self.X_ - y = self.y_[:, level] - X, y = _remove_empty_leaves(self.separator_, X, y) + X, y = _remove_empty_leaves(self.separator_, self.X_, self.y_[:, level]) unique_y = np.unique(y) if len(unique_y) == 1 and self.replace_classifiers: From 58babf312b9c08b59995554de374e95fc076e97c Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 13:18:14 +0200 Subject: [PATCH 41/56] Add examples --- hiclass/LocalClassifierPerLevel.py | 11 +++++++++++ hiclass/LocalClassifierPerNode.py | 11 +++++++++++ hiclass/LocalClassifierPerParentNode.py | 11 +++++++++++ 3 files changed, 33 insertions(+) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 6d1e22e3..3414dcac 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -48,6 +48,17 @@ class LocalClassifierPerLevel(BaseEstimator, HierarchicalClassifier): A local classifier per level is a local hierarchical classifier that fits one local multi-class classifier for each level of the class hierarchy, except for the root node. + + Examples + -------- + >>> from hiclass import LocalClassifierPerLevel + >>> y = [['1', '1.1'], ['2', '2.1']] + >>> X = [[1, 2], [3, 4]] + >>> lcpl = LocalClassifierPerLevel() + >>> lcpl.fit(X, y) + >>> lcpl.predict(X) + array([['1', '1.1'], + ['2', '2.1']]) """ def __init__( diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index ffefc17c..cb029859 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -33,6 +33,17 @@ class LocalClassifierPerNode(BaseEstimator, HierarchicalClassifier): A local classifier per node is a local hierarchical classifier that fits one local binary classifier for each node of the class hierarchy, except for the root node. + + Examples + -------- + >>> from hiclass import LocalClassifierPerNode + >>> y = [['1', '1.1'], ['2', '2.1']] + >>> X = [[1, 2], [3, 4]] + >>> lcpn = LocalClassifierPerNode() + >>> lcpn.fit(X, y) + >>> lcpn.predict(X) + array([['1', '1.1'], + ['2', '2.1']]) """ def __init__( diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 4e66964d..a60582c3 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -33,6 +33,17 @@ class LocalClassifierPerParentNode(BaseEstimator, HierarchicalClassifier): A local classifier per parent node is a local hierarchical classifier that fits one multi-class classifier for each parent node of the class hierarchy. + + Examples + -------- + >>> from hiclass import LocalClassifierPerParentNode + >>> y = [['1', '1.1'], ['2', '2.1']] + >>> X = [[1, 2], [3, 4]] + >>> lcppn = LocalClassifierPerParentNode() + >>> lcppn.fit(X, y) + >>> lcppn.predict(X) + array([['1', '1.1'], + ['2', '2.1']]) """ def __init__( From aa294dcb7676697be7e8f3cb2975beb2456a4c9d Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 14:04:34 +0200 Subject: [PATCH 42/56] Add comments --- hiclass/LocalClassifierPerLevel.py | 1 + hiclass/LocalClassifierPerNode.py | 1 + 2 files changed, 2 insertions(+) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 3414dcac..cf6dbd78 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -154,6 +154,7 @@ def predict(self, X): # Input validation X = check_array(X, accept_sparse="csr") + # Initialize array that holds predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) # TODO: Add threshold to stop prediction halfway if need be diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index cb029859..ca4c017d 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -146,6 +146,7 @@ def predict(self, X): # Input validation X = check_array(X, accept_sparse="csr") + # Initialize array that holds predictions y = np.empty((X.shape[0], self.max_levels_), dtype=self.dtype_) # TODO: Add threshold to stop prediction halfway if need be From 8ef74d16636ca7dc182b2c7c48f0f724feca6efb Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 14:23:03 +0200 Subject: [PATCH 43/56] Remove useless variables --- hiclass/HierarchicalClassifier.py | 14 +------- hiclass/LocalClassifierPerLevel.py | 41 +++++++--------------- hiclass/LocalClassifierPerNode.py | 32 ++++++----------- hiclass/LocalClassifierPerParentNode.py | 31 ++++++---------- tests/test_LocalClassifierPerLevel.py | 17 +-------- tests/test_LocalClassifierPerNode.py | 21 +---------- tests/test_LocalClassifierPerParentNode.py | 20 +---------- 7 files changed, 37 insertions(+), 139 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index a4a7f64e..a1c3e929 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -109,10 +109,7 @@ def fit(self, X, y): Fitted estimator. """ # Fit local classifiers in DAG - if self.n_jobs > 1: - self._fit_digraph_parallel() - else: - self._fit_digraph() + self._fit_digraph() # Delete unnecessary variables self._clean_up() @@ -278,15 +275,6 @@ def _initialize_local_classifiers(self): else: self.local_classifier_ = self.local_classifier - def _replace_constant_classifier(self, y, node, classifier): - unique_y = np.unique(y) - if len(unique_y) == 1 and self.replace_classifiers: - node_name = str(node).split(self.separator_)[-1] - self.logger_.warning(f"Fitting ConstantClassifier for node '{node_name}'") - self.hierarchy_.nodes[node]["classifier"] = ConstantClassifier() - classifier = self.hierarchy_.nodes[node]["classifier"] - return classifier - def _convert_to_1d(self, y): # Convert predictions to 1D if there is only 1 column if self.max_levels_ == 1: diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index cf6dbd78..6ae5cab2 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -14,8 +14,7 @@ from hiclass.HierarchicalClassifier import HierarchicalClassifier -@ray.remote -def _parallel_fit(lcpl, level, separator): +def _fit_classifier(lcpl, level, separator): classifier = lcpl.local_classifiers_[level] X, y = _remove_empty_leaves(separator, lcpl.X_, lcpl.y_[:, level]) @@ -209,32 +208,18 @@ def _initialize_local_classifiers(self): ] self.masks_ = [None for _ in range(self.y_.shape[1])] - def _fit_digraph(self): + def _fit_digraph(self, local_mode: bool = False): self.logger_.info("Fitting local classifiers") - for level, classifier in enumerate(self.local_classifiers_): - self.logger_.info( - f"Fitting local classifier for level '{level + 1}' ({level + 1}/{len(self.local_classifiers_)})" - ) - - X, y = _remove_empty_leaves(self.separator_, self.X_, self.y_[:, level]) - - unique_y = np.unique(y) - if len(unique_y) == 1 and self.replace_classifiers: - self.logger_.warning( - f"Fitting ConstantClassifier for level '{level + 1}'" - ) - self.local_classifiers_[level] = ConstantClassifier() - classifier = self.local_classifiers_[level] - classifier.fit(X, y) - - def _fit_digraph_parallel(self, local_mode: bool = False): - self.logger_.info("Fitting local classifiers") - ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) - lcpl = ray.put(self) - results = [ - _parallel_fit.remote(lcpl, level, self.separator_) - for level in range(len(self.local_classifiers_)) - ] - classifiers = ray.get(results) + if self.n_jobs > 1: + ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) + lcpl = ray.put(self) + _parallel_fit = ray.remote(_fit_classifier) + results = [ + _parallel_fit.remote(lcpl, level, self.separator_) + for level in range(len(self.local_classifiers_)) + ] + classifiers = ray.get(results) + else: + classifiers = [_fit_classifier(self, level, self.separator_) for level in range(len(self.local_classifiers_))] for level, classifier in enumerate(classifiers): self.local_classifiers_[level] = classifier diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index ca4c017d..99a41760 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -16,8 +16,7 @@ from hiclass.HierarchicalClassifier import HierarchicalClassifier -@ray.remote -def _parallel_fit(lcpn, node): +def _fit_classifier(lcpn, node): classifier = lcpn.hierarchy_.nodes[node]["classifier"] X, y = lcpn.binary_policy_.get_binary_examples(node) unique_y = np.unique(y) @@ -218,33 +217,22 @@ def _initialize_local_classifiers(self): } nx.set_node_attributes(self.hierarchy_, local_classifiers) - def _fit_digraph_parallel(self, local_mode: bool = False): + def _fit_digraph(self, local_mode: bool = False): self.logger_.info("Fitting local classifiers") - ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) nodes = list(self.hierarchy_.nodes) # Remove root because it does not need to be fitted nodes.remove(self.root_) - lcpn = ray.put(self) - results = [_parallel_fit.remote(lcpn, node) for node in nodes] - classifiers = ray.get(results) + if self.n_jobs > 1: + ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) + lcpn = ray.put(self) + _parallel_fit = ray.remote(_fit_classifier) + results = [_parallel_fit.remote(lcpn, node) for node in nodes] + classifiers = ray.get(results) + else: + classifiers = [_fit_classifier(self, node) for node in nodes] for classifier, node in zip(classifiers, nodes): self.hierarchy_.nodes[node]["classifier"] = classifier - def _fit_digraph(self): - self.logger_.info("Fitting local classifiers") - nodes = list(self.hierarchy_.nodes) - # Remove root because it does not need to be fitted - nodes.remove(self.root_) - for index, node in enumerate(nodes): - node_name = str(node).split(self.separator_)[-1] - self.logger_.info( - f"Fitting local classifier for node '{node_name}' ({index + 1}/{len(nodes)})" - ) - classifier = self.hierarchy_.nodes[node]["classifier"] - X, y = self.binary_policy_.get_binary_examples(node) - classifier = self._replace_constant_classifier(y, node, classifier) - classifier.fit(X, y) - def _clean_up(self): super()._clean_up() del self.binary_policy_ diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index a60582c3..54214a8a 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -15,8 +15,7 @@ from hiclass.HierarchicalClassifier import HierarchicalClassifier -@ray.remote -def _parallel_fit(lcppn, node): +def _fit_classifier(lcppn, node): classifier = lcppn.hierarchy_.nodes[node]["classifier"] # get children examples X, y = lcppn._get_successors(node) @@ -201,26 +200,16 @@ def _get_successors(self, node): y = np.array(y) return X, y - def _fit_digraph_parallel(self, local_mode: bool = False): + def _fit_digraph(self, local_mode: bool = False): self.logger_.info("Fitting local classifiers") - ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) nodes = self._get_parents() - lcppn = ray.put(self) - results = [_parallel_fit.remote(lcppn, node) for node in nodes] - classifiers = ray.get(results) + if self.n_jobs > 1: + ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) + lcppn = ray.put(self) + _parallel_fit = ray.remote(_fit_classifier) + results = [_parallel_fit.remote(lcppn, node) for node in nodes] + classifiers = ray.get(results) + else: + classifiers = [_fit_classifier(self, node) for node in nodes] for classifier, node in zip(classifiers, nodes): self.hierarchy_.nodes[node]["classifier"] = classifier - - def _fit_digraph(self): - self.logger_.info("Fitting local classifiers") - nodes = self._get_parents() - for index, node in enumerate(nodes): - node_name = str(node).split(self.separator_)[-1] - self.logger_.info( - f"Fitting local classifier for node '{node_name}' ({index + 1}/{len(nodes)})" - ) - classifier = self.hierarchy_.nodes[node]["classifier"] - # get children examples - X, y = self._get_successors(node) - classifier = self._replace_constant_classifier(y, node, classifier) - classifier.fit(X, y) diff --git a/tests/test_LocalClassifierPerLevel.py b/tests/test_LocalClassifierPerLevel.py index 43f5aabe..0184e9a3 100644 --- a/tests/test_LocalClassifierPerLevel.py +++ b/tests/test_LocalClassifierPerLevel.py @@ -44,28 +44,13 @@ def test_initialize_local_classifiers(digraph_logistic_regression): def test_fit_digraph(digraph_logistic_regression): - classifiers = [ - LogisticRegression(), - LogisticRegression(), - ] - digraph_logistic_regression.local_classifiers_ = classifiers - digraph_logistic_regression._fit_digraph() - for classifier in digraph_logistic_regression.local_classifiers_: - try: - check_is_fitted(classifier) - except NotFittedError as e: - pytest.fail(repr(e)) - assert 1 - - -def test_fit_digraph_parallel(digraph_logistic_regression): classifiers = [ LogisticRegression(), LogisticRegression(), ] digraph_logistic_regression.n_jobs = 2 digraph_logistic_regression.local_classifiers_ = classifiers - digraph_logistic_regression._fit_digraph_parallel(local_mode=True) + digraph_logistic_regression._fit_digraph(local_mode=True) for classifier in digraph_logistic_regression.local_classifiers_: try: check_is_fitted(classifier) diff --git a/tests/test_LocalClassifierPerNode.py b/tests/test_LocalClassifierPerNode.py index 9a51deeb..0a199d0b 100644 --- a/tests/test_LocalClassifierPerNode.py +++ b/tests/test_LocalClassifierPerNode.py @@ -92,32 +92,13 @@ def test_initialize_local_classifiers(digraph_logistic_regression): def test_fit_digraph(digraph_logistic_regression): - classifiers = { - "b": {"classifier": LogisticRegression()}, - "c": {"classifier": LogisticRegression()}, - } - nx.set_node_attributes(digraph_logistic_regression.hierarchy_, classifiers) - digraph_logistic_regression._fit_digraph() - with pytest.raises(KeyError): - check_is_fitted(digraph_logistic_regression.hierarchy_.nodes["a"]["classifier"]) - for node in ["b", "c"]: - try: - check_is_fitted( - digraph_logistic_regression.hierarchy_.nodes[node]["classifier"] - ) - except NotFittedError as e: - pytest.fail(repr(e)) - assert 1 - - -def test_fit_digraph_parallel(digraph_logistic_regression): classifiers = { "b": {"classifier": LogisticRegression()}, "c": {"classifier": LogisticRegression()}, } digraph_logistic_regression.n_jobs = 2 nx.set_node_attributes(digraph_logistic_regression.hierarchy_, classifiers) - digraph_logistic_regression._fit_digraph_parallel(local_mode=True) + digraph_logistic_regression._fit_digraph(local_mode=True) with pytest.raises(KeyError): check_is_fitted(digraph_logistic_regression.hierarchy_.nodes["a"]["classifier"]) for node in ["b", "c"]: diff --git a/tests/test_LocalClassifierPerParentNode.py b/tests/test_LocalClassifierPerParentNode.py index 80837f95..9ded1d6d 100644 --- a/tests/test_LocalClassifierPerParentNode.py +++ b/tests/test_LocalClassifierPerParentNode.py @@ -48,30 +48,12 @@ def test_initialize_local_classifiers(digraph_logistic_regression): def test_fit_digraph(digraph_logistic_regression): - classifiers = { - "a": {"classifier": LogisticRegression()}, - } - nx.set_node_attributes(digraph_logistic_regression.hierarchy_, classifiers) - digraph_logistic_regression._fit_digraph() - try: - check_is_fitted(digraph_logistic_regression.hierarchy_.nodes["a"]["classifier"]) - except NotFittedError as e: - pytest.fail(repr(e)) - for node in ["b", "c"]: - with pytest.raises(KeyError): - check_is_fitted( - digraph_logistic_regression.hierarchy_.nodes[node]["classifier"] - ) - assert 1 - - -def test_fit_digraph_parallel(digraph_logistic_regression): classifiers = { "a": {"classifier": LogisticRegression()}, } digraph_logistic_regression.n_jobs = 2 nx.set_node_attributes(digraph_logistic_regression.hierarchy_, classifiers) - digraph_logistic_regression._fit_digraph_parallel(local_mode=True) + digraph_logistic_regression._fit_digraph(local_mode=True) try: check_is_fitted(digraph_logistic_regression.hierarchy_.nodes["a"]["classifier"]) except NotFittedError as e: From 342f69d5b26611ce6346ab01e1250aac4b52ba61 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 14:30:58 +0200 Subject: [PATCH 44/56] Apply black --- hiclass/LocalClassifierPerLevel.py | 9 +++++++-- hiclass/LocalClassifierPerNode.py | 4 +++- hiclass/LocalClassifierPerParentNode.py | 4 +++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index 6ae5cab2..fb29a5b5 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -211,7 +211,9 @@ def _initialize_local_classifiers(self): def _fit_digraph(self, local_mode: bool = False): self.logger_.info("Fitting local classifiers") if self.n_jobs > 1: - ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) + ray.init( + num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True + ) lcpl = ray.put(self) _parallel_fit = ray.remote(_fit_classifier) results = [ @@ -220,6 +222,9 @@ def _fit_digraph(self, local_mode: bool = False): ] classifiers = ray.get(results) else: - classifiers = [_fit_classifier(self, level, self.separator_) for level in range(len(self.local_classifiers_))] + classifiers = [ + _fit_classifier(self, level, self.separator_) + for level in range(len(self.local_classifiers_)) + ] for level, classifier in enumerate(classifiers): self.local_classifiers_[level] = classifier diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index 99a41760..699437ab 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -223,7 +223,9 @@ def _fit_digraph(self, local_mode: bool = False): # Remove root because it does not need to be fitted nodes.remove(self.root_) if self.n_jobs > 1: - ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) + ray.init( + num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True + ) lcpn = ray.put(self) _parallel_fit = ray.remote(_fit_classifier) results = [_parallel_fit.remote(lcpn, node) for node in nodes] diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index 54214a8a..f1873ddc 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -204,7 +204,9 @@ def _fit_digraph(self, local_mode: bool = False): self.logger_.info("Fitting local classifiers") nodes = self._get_parents() if self.n_jobs > 1: - ray.init(num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True) + ray.init( + num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True + ) lcppn = ray.put(self) _parallel_fit = ray.remote(_fit_classifier) results = [_parallel_fit.remote(lcppn, node) for node in nodes] From ee396129accc7750e2d48764de2963b9eaa80c61 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 15:02:25 +0200 Subject: [PATCH 45/56] More refactoring --- hiclass/HierarchicalClassifier.py | 83 ++++++++++++++----------- hiclass/LocalClassifierPerLevel.py | 60 +++++++++--------- hiclass/LocalClassifierPerNode.py | 34 ++++------ hiclass/LocalClassifierPerParentNode.py | 36 ++++------- tests/test_HierarchicalClassifier.py | 5 +- 5 files changed, 101 insertions(+), 117 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index a1c3e929..6777d677 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -4,45 +4,10 @@ import networkx as nx import numpy as np +import ray from sklearn.base import BaseEstimator from sklearn.linear_model import LogisticRegression -from hiclass.ConstantClassifier import ConstantClassifier - - -def _make_leveled(y): - """ - Add empty cells if columns' length differs. - - Parameters - ---------- - y : array-like of shape (n_samples, n_levels) - The target values, i.e., hierarchical class labels for classification. - - Returns - ------- - leveled_y : array-like of shape (n_samples, n_levels) - The leveled target values, i.e., hierarchical class labels for classification. - - Notes - ----- - If rows are not iterable, returns the current y without modifications. - - Examples - -------- - >>> from hiclass.HierarchicalClassifier import _make_leveled - >>> y = [['a'], ['b', 'c']] - >>> leveled_y = _make_leveled(y) - >>> print(leveled_y) - >>> [['a', ''], ['b', 'c']] - """ - try: - depth = max([len(row) for row in y]) - except TypeError: - return y - leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] - return np.array(leveled_y) - class HierarchicalClassifier(abc.ABC): """Abstract class for the local hierarchical classifiers. @@ -122,7 +87,7 @@ def _pre_fit(self, X, y): X, y, multi_output=True, accept_sparse="csr" ) - self.y_ = _make_leveled(self.y_) + self.y_ = self._make_leveled(self.y_) # Create and configure logger self._create_logger() @@ -288,6 +253,50 @@ def _remove_separator(self, y): for j in range(1, y.shape[1]): y[i, j] = y[i, j].split(self.separator_)[-1] + @staticmethod + def _make_leveled(y): + """ + Add empty cells if columns' length differs. + + Parameters + ---------- + y : array-like of shape (n_samples, n_levels) + The target values, i.e., hierarchical class labels for classification. + + Returns + ------- + leveled_y : array-like of shape (n_samples, n_levels) + The leveled target values, i.e., hierarchical class labels for classification. + + Notes + ----- + If rows are not iterable, returns the current y without modifications. + """ + try: + depth = max([len(row) for row in y]) + except TypeError: + return y + leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] + return np.array(leveled_y) + + def _fit_node_classifier(self, nodes, local_mode): + if self.n_jobs > 1: + ray.init( + num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True + ) + lcppn = ray.put(self) + _parallel_fit = ray.remote(self._fit_classifier) + results = [_parallel_fit.remote(lcppn, node) for node in nodes] + classifiers = ray.get(results) + else: + classifiers = [self._fit_classifier(self, node) for node in nodes] + for classifier, node in zip(classifiers, nodes): + self.hierarchy_.nodes[node]["classifier"] = classifier + + @staticmethod + def _fit_classifier(self, node): + raise NotImplementedError("Method should be implemented in the LCPN and LCPPN") + def _clean_up(self): self.logger_.info("Cleaning up variables that can take a lot of disk space") del self.X_ diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index fb29a5b5..eac5bd6a 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -14,33 +14,6 @@ from hiclass.HierarchicalClassifier import HierarchicalClassifier -def _fit_classifier(lcpl, level, separator): - classifier = lcpl.local_classifiers_[level] - - X, y = _remove_empty_leaves(separator, lcpl.X_, lcpl.y_[:, level]) - - unique_y = np.unique(y) - if len(unique_y) == 1 and lcpl.replace_classifiers: - classifier = ConstantClassifier() - classifier.fit(X, y) - return classifier - - -def _remove_empty_leaves(separator, X, y): - # Detect rows where leaves are not empty - leaves = np.array([str(i).split(separator)[-1] for i in y]) - mask = leaves != "" - return X[mask], y[mask] - - -def _get_successors_probability(probabilities_dict, successors): - successors_probability = [ - np.array([probabilities_dict[i][successor] for successor in successors_list]) - for i, successors_list in enumerate(successors) - ] - return successors_probability - - class LocalClassifierPerLevel(BaseEstimator, HierarchicalClassifier): """ Assign local classifiers to each level of the hierarchy, except the root node. @@ -179,7 +152,7 @@ def _predict_remaining_levels(self, X, y): classes = self.local_classifiers_[level].classes_ probabilities_dict = [dict(zip(classes, prob)) for prob in probabilities] successors = self._get_successors(y[:, level - 1]) - successors_prob = _get_successors_probability( + successors_prob = self._get_successors_probability( probabilities_dict, successors ) index_max_probability = [ @@ -192,6 +165,14 @@ def _predict_remaining_levels(self, X, y): for i, successors_list in enumerate(successors) ] + @staticmethod + def _get_successors_probability(probabilities_dict, successors): + successors_probability = [ + np.array([probabilities_dict[i][successor] for successor in successors_list]) + for i, successors_list in enumerate(successors) + ] + return successors_probability + def _get_successors(self, level): successors = [ list(self.hierarchy_.successors(node)) @@ -215,7 +196,7 @@ def _fit_digraph(self, local_mode: bool = False): num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True ) lcpl = ray.put(self) - _parallel_fit = ray.remote(_fit_classifier) + _parallel_fit = ray.remote(self._fit_classifier) results = [ _parallel_fit.remote(lcpl, level, self.separator_) for level in range(len(self.local_classifiers_)) @@ -223,8 +204,27 @@ def _fit_digraph(self, local_mode: bool = False): classifiers = ray.get(results) else: classifiers = [ - _fit_classifier(self, level, self.separator_) + self._fit_classifier(self, level, self.separator_) for level in range(len(self.local_classifiers_)) ] for level, classifier in enumerate(classifiers): self.local_classifiers_[level] = classifier + + @staticmethod + def _fit_classifier(self, level, separator): + classifier = self.local_classifiers_[level] + + X, y = self._remove_empty_leaves(separator, self.X_, self.y_[:, level]) + + unique_y = np.unique(y) + if len(unique_y) == 1 and self.replace_classifiers: + classifier = ConstantClassifier() + classifier.fit(X, y) + return classifier + + @staticmethod + def _remove_empty_leaves(separator, X, y): + # Detect rows where leaves are not empty + leaves = np.array([str(i).split(separator)[-1] for i in y]) + mask = leaves != "" + return X[mask], y[mask] diff --git a/hiclass/LocalClassifierPerNode.py b/hiclass/LocalClassifierPerNode.py index 699437ab..1bdcf705 100644 --- a/hiclass/LocalClassifierPerNode.py +++ b/hiclass/LocalClassifierPerNode.py @@ -7,7 +7,6 @@ import networkx as nx import numpy as np -import ray from sklearn.base import BaseEstimator from sklearn.utils.validation import check_array, check_is_fitted @@ -16,16 +15,6 @@ from hiclass.HierarchicalClassifier import HierarchicalClassifier -def _fit_classifier(lcpn, node): - classifier = lcpn.hierarchy_.nodes[node]["classifier"] - X, y = lcpn.binary_policy_.get_binary_examples(node) - unique_y = np.unique(y) - if len(unique_y) == 1 and lcpn.replace_classifiers: - classifier = ConstantClassifier() - classifier.fit(X, y) - return classifier - - class LocalClassifierPerNode(BaseEstimator, HierarchicalClassifier): """ Assign local classifiers to each node of the graph, except the root node. @@ -222,18 +211,17 @@ def _fit_digraph(self, local_mode: bool = False): nodes = list(self.hierarchy_.nodes) # Remove root because it does not need to be fitted nodes.remove(self.root_) - if self.n_jobs > 1: - ray.init( - num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True - ) - lcpn = ray.put(self) - _parallel_fit = ray.remote(_fit_classifier) - results = [_parallel_fit.remote(lcpn, node) for node in nodes] - classifiers = ray.get(results) - else: - classifiers = [_fit_classifier(self, node) for node in nodes] - for classifier, node in zip(classifiers, nodes): - self.hierarchy_.nodes[node]["classifier"] = classifier + self._fit_node_classifier(nodes, local_mode) + + @staticmethod + def _fit_classifier(self, node): + classifier = self.hierarchy_.nodes[node]["classifier"] + X, y = self.binary_policy_.get_binary_examples(node) + unique_y = np.unique(y) + if len(unique_y) == 1 and self.replace_classifiers: + classifier = ConstantClassifier() + classifier.fit(X, y) + return classifier def _clean_up(self): super()._clean_up() diff --git a/hiclass/LocalClassifierPerParentNode.py b/hiclass/LocalClassifierPerParentNode.py index f1873ddc..b7a8e5f8 100644 --- a/hiclass/LocalClassifierPerParentNode.py +++ b/hiclass/LocalClassifierPerParentNode.py @@ -7,7 +7,6 @@ import networkx as nx import numpy as np -import ray from sklearn.base import BaseEstimator from sklearn.utils.validation import check_array, check_is_fitted @@ -15,17 +14,6 @@ from hiclass.HierarchicalClassifier import HierarchicalClassifier -def _fit_classifier(lcppn, node): - classifier = lcppn.hierarchy_.nodes[node]["classifier"] - # get children examples - X, y = lcppn._get_successors(node) - unique_y = np.unique(y) - if len(unique_y) == 1 and lcppn.replace_classifiers: - classifier = ConstantClassifier() - classifier.fit(X, y) - return classifier - - class LocalClassifierPerParentNode(BaseEstimator, HierarchicalClassifier): """ Assign local classifiers to each parent node of the graph. @@ -200,18 +188,18 @@ def _get_successors(self, node): y = np.array(y) return X, y + @staticmethod + def _fit_classifier(self, node): + classifier = self.hierarchy_.nodes[node]["classifier"] + # get children examples + X, y = self._get_successors(node) + unique_y = np.unique(y) + if len(unique_y) == 1 and self.replace_classifiers: + classifier = ConstantClassifier() + classifier.fit(X, y) + return classifier + def _fit_digraph(self, local_mode: bool = False): self.logger_.info("Fitting local classifiers") nodes = self._get_parents() - if self.n_jobs > 1: - ray.init( - num_cpus=self.n_jobs, local_mode=local_mode, ignore_reinit_error=True - ) - lcppn = ray.put(self) - _parallel_fit = ray.remote(_fit_classifier) - results = [_parallel_fit.remote(lcppn, node) for node in nodes] - classifiers = ray.get(results) - else: - classifiers = [_fit_classifier(self, node) for node in nodes] - for classifier, node in zip(classifiers, nodes): - self.hierarchy_.nodes[node]["classifier"] = classifier + self._fit_node_classifier(nodes, local_mode) diff --git a/tests/test_HierarchicalClassifier.py b/tests/test_HierarchicalClassifier.py index e381b07c..80df330c 100644 --- a/tests/test_HierarchicalClassifier.py +++ b/tests/test_HierarchicalClassifier.py @@ -8,7 +8,6 @@ from sklearn.linear_model import LogisticRegression from hiclass.HierarchicalClassifier import HierarchicalClassifier -from hiclass.HierarchicalClassifier import _make_leveled @pytest.fixture @@ -197,7 +196,7 @@ def test_make_leveled(empty_levels): ["d", "e", "f"], ] ) - result = _make_leveled(empty_levels) + result = HierarchicalClassifier._make_leveled(empty_levels) assert_array_equal(ground_truth, result) @@ -208,4 +207,4 @@ def noniterable_y(): def test_make_leveled_non_iterable_y(noniterable_y): - assert noniterable_y == _make_leveled(noniterable_y) + assert noniterable_y == HierarchicalClassifier._make_leveled(noniterable_y) From d960ef6581e0ef6af260b8e54da6facfd02466bb Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 15:04:32 +0200 Subject: [PATCH 46/56] Add test for _fit_classifier --- tests/test_HierarchicalClassifier.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_HierarchicalClassifier.py b/tests/test_HierarchicalClassifier.py index 80df330c..3f11c66b 100644 --- a/tests/test_HierarchicalClassifier.py +++ b/tests/test_HierarchicalClassifier.py @@ -208,3 +208,8 @@ def noniterable_y(): def test_make_leveled_non_iterable_y(noniterable_y): assert noniterable_y == HierarchicalClassifier._make_leveled(noniterable_y) + + +def test_fit_classifier(): + with pytest.raises(NotImplementedError): + HierarchicalClassifier._fit_classifier(None, None) From e3f9ed6c9f5e77ec53a378e4eda906f911fd9410 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 15:23:53 +0200 Subject: [PATCH 47/56] Add black linting --- .github/workflows/black.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 00000000..b04fb15c --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,10 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable From 5d3f0fd0979befec0cc6e3f02d38d54985387f3c Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 15:25:54 +0200 Subject: [PATCH 48/56] Move black linting --- .github/workflows/black.yml | 10 ---------- .github/workflows/test-pr.yml | 7 ++++++- hiclass/LocalClassifierPerLevel.py | 4 +++- 3 files changed, 9 insertions(+), 12 deletions(-) delete mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml deleted file mode 100644 index b04fb15c..00000000 --- a/.github/workflows/black.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: Lint - -on: [push, pull_request] - -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: psf/black@stable diff --git a/.github/workflows/test-pr.yml b/.github/workflows/test-pr.yml index 03c0231f..8ffd1e1f 100644 --- a/.github/workflows/test-pr.yml +++ b/.github/workflows/test-pr.yml @@ -6,7 +6,12 @@ on: - main jobs: - build: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable + test: runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/hiclass/LocalClassifierPerLevel.py b/hiclass/LocalClassifierPerLevel.py index eac5bd6a..6db1ab94 100644 --- a/hiclass/LocalClassifierPerLevel.py +++ b/hiclass/LocalClassifierPerLevel.py @@ -168,7 +168,9 @@ def _predict_remaining_levels(self, X, y): @staticmethod def _get_successors_probability(probabilities_dict, successors): successors_probability = [ - np.array([probabilities_dict[i][successor] for successor in successors_list]) + np.array( + [probabilities_dict[i][successor] for successor in successors_list] + ) for i, successors_list in enumerate(successors) ] return successors_probability From c4d1732b50ffd82dc536c337e22462c50d9f7e56 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 15:28:00 +0200 Subject: [PATCH 49/56] Apply black --- docs/source/conf.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index b39f35cd..aecd85fd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,17 +13,18 @@ # import os import sys -sys.path.insert(0, os.path.abspath('./../..')) -sys.path.insert(0, os.path.abspath('./../../hiclass')) + +sys.path.insert(0, os.path.abspath("./../..")) +sys.path.insert(0, os.path.abspath("./../../hiclass")) print(sys.path) import sphinx_code_tabs # -- Project information ----------------------------------------------------- -project = 'hiclass' -copyright = '2022, Fabio Malcher Miranda, Niklas Köhnecke' -author = 'Fabio Malcher Miranda, Niklas Köhnecke' +project = "hiclass" +copyright = "2022, Fabio Malcher Miranda, Niklas Köhnecke" +author = "Fabio Malcher Miranda, Niklas Köhnecke" # -- General configuration --------------------------------------------------- @@ -32,15 +33,15 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.autosectionlabel', - 'sphinx_code_tabs', - 'sphinx_gallery.gen_gallery', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.autosectionlabel", + "sphinx_code_tabs", + "sphinx_gallery.gen_gallery", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -55,12 +56,13 @@ use_rtd_scheme = False try: import sphinx_rtd_theme + extensions.extend(["sphinx_rtd_theme"]) use_rtd_scheme = True except ImportError: print("sphinx_rtd_theme was not installed, using alabaster as fallback!") -html_theme = 'sphinx_rtd_theme' if use_rtd_scheme else 'alabaster' +html_theme = "sphinx_rtd_theme" if use_rtd_scheme else "alabaster" # Add any paths that contain custom static files (such as style sheets) here, @@ -76,6 +78,6 @@ html_theme_options["sidebar_width"] = "230px" sphinx_gallery_conf = { - 'examples_dirs': '../examples', - 'gallery_dirs': 'auto_examples', -} \ No newline at end of file + "examples_dirs": "../examples", + "gallery_dirs": "auto_examples", +} From da220fc7b2142734a389ff57113dc38a50909da5 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 15:30:44 +0200 Subject: [PATCH 50/56] Add black badges --- README.md | 2 +- docs/source/index.rst | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0971ab68..d47e6325 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ HiClass is an open-source Python library for hierarchical classification compatible with scikit-learn. -[![Deploy PyPI](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml/badge.svg?event=push)](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml) [![Documentation Status](https://readthedocs.org/projects/hiclass/badge/?version=latest)](https://hiclass.readthedocs.io/en/latest/?badge=latest) [![codecov](https://codecov.io/gh/mirand863/hiclass/branch/main/graph/badge.svg?token=PR8VLBMMNR)](https://codecov.io/gh/mirand863/hiclass) [![Downloads PyPI](https://static.pepy.tech/personalized-badge/hiclass?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=pypi)](https://pypi.org/project/hiclass/) [![Downloads Conda](https://img.shields.io/conda/dn/conda-forge/hiclass?label=conda)](https://anaconda.org/conda-forge/hiclass) [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![Deploy PyPI](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml/badge.svg?event=push)](https://github.com/mirand863/hiclass/actions/workflows/deploy-pypi.yml) [![Documentation Status](https://readthedocs.org/projects/hiclass/badge/?version=latest)](https://hiclass.readthedocs.io/en/latest/?badge=latest) [![codecov](https://codecov.io/gh/mirand863/hiclass/branch/main/graph/badge.svg?token=PR8VLBMMNR)](https://codecov.io/gh/mirand863/hiclass) [![Downloads PyPI](https://static.pepy.tech/personalized-badge/hiclass?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=pypi)](https://pypi.org/project/hiclass/) [![Downloads Conda](https://img.shields.io/conda/dn/conda-forge/hiclass?label=conda)](https://anaconda.org/conda-forge/hiclass) [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) ✨ Here is a **demo** that shows HiClass in action on hierarchical data: diff --git a/docs/source/index.rst b/docs/source/index.rst index 79cd177e..ccfe48cb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -30,6 +30,9 @@ Welcome to hiclass' documentation! :target: https://opensource.org/licenses/BSD-3-Clause :alt: License +.. image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/psf/black + .. toctree:: :includehidden: :maxdepth: 3 From 825009d0df3ded410b557818c42913e9e632bb3c Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 19:19:13 +0200 Subject: [PATCH 51/56] Add support for empty levels to hierarchical metrics --- hiclass/metrics.py | 39 ++++++++++++++++++++++++++------------- tests/test_metrics.py | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/hiclass/metrics.py b/hiclass/metrics.py index 46efb9af..e91ea148 100644 --- a/hiclass/metrics.py +++ b/hiclass/metrics.py @@ -2,6 +2,17 @@ import numpy as np from sklearn.utils import check_array +from hiclass.HierarchicalClassifier import HierarchicalClassifier + + +def _validate_input(y_true, y_pred): + y_pred = HierarchicalClassifier()._make_leveled(y_pred) + y_true = HierarchicalClassifier()._make_leveled(y_true) + assert len(y_true) == len(y_pred) + y_true = check_array(y_true, dtype=None) + y_pred = check_array(y_pred, dtype=None) + return y_true, y_pred + def precision(y_true: np.ndarray, y_pred: np.ndarray): r""" @@ -24,17 +35,19 @@ def precision(y_true: np.ndarray, y_pred: np.ndarray): precision : float What proportion of positive identifications was actually correct? """ - assert len(y_true) == len(y_pred) - y_true = check_array(y_true, dtype=None) - y_pred = check_array(y_pred, dtype=None) + y_true, y_pred = _validate_input(y_true, y_pred) sum_intersection = 0 sum_prediction_and_ancestors = 0 for ground_truth, prediction in zip(y_true, y_pred): + ground_truth_set = set(ground_truth) + ground_truth_set.discard("") + prediction_set = set(prediction) + prediction_set.discard("") sum_intersection = sum_intersection + len( - set(ground_truth).intersection(set(prediction)) + ground_truth_set.intersection(prediction_set) ) sum_prediction_and_ancestors = sum_prediction_and_ancestors + len( - set(prediction) + prediction_set ) precision = sum_intersection / sum_prediction_and_ancestors return precision @@ -61,17 +74,19 @@ def recall(y_true: np.ndarray, y_pred: np.ndarray): recall : float What proportion of actual positives was identified correctly? """ - assert len(y_true) == len(y_pred) - y_true = check_array(y_true, dtype=None) - y_pred = check_array(y_pred, dtype=None) + y_true, y_pred = _validate_input(y_true, y_pred) sum_intersection = 0 sum_prediction_and_ancestors = 0 for ground_truth, prediction in zip(y_true, y_pred): + ground_truth_set = set(ground_truth) + ground_truth_set.discard("") + prediction_set = set(prediction) + prediction_set.discard("") sum_intersection = sum_intersection + len( - set(ground_truth).intersection(set(prediction)) + ground_truth_set.intersection(prediction_set) ) sum_prediction_and_ancestors = sum_prediction_and_ancestors + len( - set(ground_truth) + ground_truth_set ) recall = sum_intersection / sum_prediction_and_ancestors return recall @@ -95,9 +110,7 @@ def f1(y_true: np.ndarray, y_pred: np.ndarray): f1 : float Weighted average of the precision and recall """ - assert len(y_true) == len(y_pred) - y_true = check_array(y_true, dtype=None) - y_pred = check_array(y_pred, dtype=None) + y_true, y_pred = _validate_input(y_true, y_pred) prec = precision(y_true, y_pred) rec = recall(y_true, y_pred) f1 = 2 * prec * rec / (prec + rec) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 077e5cc7..da42a3e0 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -28,3 +28,17 @@ def test_f1(): y_true = np.array([[1, 2, 3, 4], [1, 2, 5, 6]]) y_pred = np.array([[1, 2, 5, 6], [1, 2, 3, 4]]) assert metrics.f1(y_true, y_pred) == 0.5 + + +def test_empty_levels_1(): + y_true = np.array([["2", "3"], ["1"], ["4", "5", "6"]]) + y_pred = np.array([["1", '', ''], ["2", "3", ""], ["4", "5", "6"]]) + assert metrics.f1(y_true, y_pred) == 0.5 + assert metrics.f1(y_true, y_true) == 1 + + +def test_empty_levels_2(): + y_true = np.array([["1"], ["2", "3"], ["4", "5", "6"]]) + y_pred = np.array([["1", '', ''], ["2", "3", ""], ["4", "5", "6"]]) + assert metrics.f1(y_true, y_pred) == 1 + assert metrics.f1(y_true, y_true) == 1 From 7f0b877012c40022f9b4260d1e518b1bb038cd5b Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 19:20:39 +0200 Subject: [PATCH 52/56] Apply black style --- tests/test_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index da42a3e0..4952115a 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -32,13 +32,13 @@ def test_f1(): def test_empty_levels_1(): y_true = np.array([["2", "3"], ["1"], ["4", "5", "6"]]) - y_pred = np.array([["1", '', ''], ["2", "3", ""], ["4", "5", "6"]]) + y_pred = np.array([["1", "", ""], ["2", "3", ""], ["4", "5", "6"]]) assert metrics.f1(y_true, y_pred) == 0.5 assert metrics.f1(y_true, y_true) == 1 def test_empty_levels_2(): y_true = np.array([["1"], ["2", "3"], ["4", "5", "6"]]) - y_pred = np.array([["1", '', ''], ["2", "3", ""], ["4", "5", "6"]]) + y_pred = np.array([["1", "", ""], ["2", "3", ""], ["4", "5", "6"]]) assert metrics.f1(y_true, y_pred) == 1 assert metrics.f1(y_true, y_true) == 1 From 80d516d04cd64f543b3c01aa4c3589c74bd702d9 Mon Sep 17 00:00:00 2001 From: Fabio Date: Wed, 29 Jun 2022 19:36:18 +0200 Subject: [PATCH 53/56] Refactor method make_leveled to make it public --- hiclass/HierarchicalClassifier.py | 62 ++++++++++++++++------------ hiclass/metrics.py | 6 +-- tests/test_HierarchicalClassifier.py | 6 +-- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 6777d677..96e2d16a 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -9,6 +9,40 @@ from sklearn.linear_model import LogisticRegression +def make_leveled(y): + """ + Add empty cells if columns' length differs. + + Parameters + ---------- + y : array-like of shape (n_samples, n_levels) + The target values, i.e., hierarchical class labels for classification. + + Returns + ------- + leveled_y : array-like of shape (n_samples, n_levels) + The leveled target values, i.e., hierarchical class labels for classification. + + Notes + ----- + If rows are not iterable, returns the current y without modifications. + + Examples + -------- + >>> from hiclass.HierarchicalClassifier import make_leveled + >>> y = [['a'], ['b', 'c']] + >>> make_leveled(y) + array([['a', ''], + ['b', 'c']]) + """ + try: + depth = max([len(row) for row in y]) + except TypeError: + return y + leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] + return np.array(leveled_y) + + class HierarchicalClassifier(abc.ABC): """Abstract class for the local hierarchical classifiers. @@ -87,7 +121,7 @@ def _pre_fit(self, X, y): X, y, multi_output=True, accept_sparse="csr" ) - self.y_ = self._make_leveled(self.y_) + self.y_ = make_leveled(self.y_) # Create and configure logger self._create_logger() @@ -253,32 +287,6 @@ def _remove_separator(self, y): for j in range(1, y.shape[1]): y[i, j] = y[i, j].split(self.separator_)[-1] - @staticmethod - def _make_leveled(y): - """ - Add empty cells if columns' length differs. - - Parameters - ---------- - y : array-like of shape (n_samples, n_levels) - The target values, i.e., hierarchical class labels for classification. - - Returns - ------- - leveled_y : array-like of shape (n_samples, n_levels) - The leveled target values, i.e., hierarchical class labels for classification. - - Notes - ----- - If rows are not iterable, returns the current y without modifications. - """ - try: - depth = max([len(row) for row in y]) - except TypeError: - return y - leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] - return np.array(leveled_y) - def _fit_node_classifier(self, nodes, local_mode): if self.n_jobs > 1: ray.init( diff --git a/hiclass/metrics.py b/hiclass/metrics.py index e91ea148..85c719e3 100644 --- a/hiclass/metrics.py +++ b/hiclass/metrics.py @@ -2,12 +2,12 @@ import numpy as np from sklearn.utils import check_array -from hiclass.HierarchicalClassifier import HierarchicalClassifier +from hiclass.HierarchicalClassifier import make_leveled def _validate_input(y_true, y_pred): - y_pred = HierarchicalClassifier()._make_leveled(y_pred) - y_true = HierarchicalClassifier()._make_leveled(y_true) + y_pred = make_leveled(y_pred) + y_true = make_leveled(y_true) assert len(y_true) == len(y_pred) y_true = check_array(y_true, dtype=None) y_pred = check_array(y_pred, dtype=None) diff --git a/tests/test_HierarchicalClassifier.py b/tests/test_HierarchicalClassifier.py index 3f11c66b..80e5fbd3 100644 --- a/tests/test_HierarchicalClassifier.py +++ b/tests/test_HierarchicalClassifier.py @@ -7,7 +7,7 @@ from numpy.testing import assert_array_equal from sklearn.linear_model import LogisticRegression -from hiclass.HierarchicalClassifier import HierarchicalClassifier +from hiclass.HierarchicalClassifier import HierarchicalClassifier, make_leveled @pytest.fixture @@ -196,7 +196,7 @@ def test_make_leveled(empty_levels): ["d", "e", "f"], ] ) - result = HierarchicalClassifier._make_leveled(empty_levels) + result = make_leveled(empty_levels) assert_array_equal(ground_truth, result) @@ -207,7 +207,7 @@ def noniterable_y(): def test_make_leveled_non_iterable_y(noniterable_y): - assert noniterable_y == HierarchicalClassifier._make_leveled(noniterable_y) + assert noniterable_y == make_leveled(noniterable_y) def test_fit_classifier(): From 793df1e778847929aaeba62a12e401d76b08e8a9 Mon Sep 17 00:00:00 2001 From: mirand863 Date: Wed, 29 Jun 2022 23:14:34 +0200 Subject: [PATCH 54/56] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d47e6325..cec2b2bf 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ HiClass is an open-source Python library for hierarchical classification compati - [Who is using HiClass?](#who-is-using-hiclass) - [Install](#install) - [Quick start](#quick-start) -- [Step-by-step- walk-through](#step-by-step-walk-through) +- [Step-by-step walk-through](#step-by-step-walk-through) - [API documentation](#api-documentation) - [FAQ](#faq) - [Support](#support) @@ -123,7 +123,7 @@ Here's a quick example showcasing how you can train and predict using a local cl from hiclass import LocalClassifierPerNode from sklearn.ensemble import RandomForestClassifier -# define data +# Define data X_train = [[1], [2], [3], [4]] X_test = [[4], [3], [2], [1]] Y_train = [ @@ -152,7 +152,7 @@ from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline -# define data +# Define data X_train = [ 'Struggling to repay loan', 'Unable to get annual report', From 63aac24e01cd9efcafdec77fa97464c98a6a1a1b Mon Sep 17 00:00:00 2001 From: mirand863 Date: Thu, 30 Jun 2022 14:24:55 +0200 Subject: [PATCH 55/56] Update metrics.py --- hiclass/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hiclass/metrics.py b/hiclass/metrics.py index 85c719e3..5498cbd1 100644 --- a/hiclass/metrics.py +++ b/hiclass/metrics.py @@ -6,11 +6,11 @@ def _validate_input(y_true, y_pred): - y_pred = make_leveled(y_pred) - y_true = make_leveled(y_true) assert len(y_true) == len(y_pred) y_true = check_array(y_true, dtype=None) y_pred = check_array(y_pred, dtype=None) + y_pred = make_leveled(y_pred) + y_true = make_leveled(y_true) return y_true, y_pred From df86c215a5d0115f9196cd490e66315bd80ce776 Mon Sep 17 00:00:00 2001 From: Fabio Date: Thu, 30 Jun 2022 15:26:16 +0200 Subject: [PATCH 56/56] Attempt to fix make_levelled function --- hiclass/HierarchicalClassifier.py | 1 + hiclass/metrics.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hiclass/HierarchicalClassifier.py b/hiclass/HierarchicalClassifier.py index 96e2d16a..1075877c 100644 --- a/hiclass/HierarchicalClassifier.py +++ b/hiclass/HierarchicalClassifier.py @@ -39,6 +39,7 @@ def make_leveled(y): depth = max([len(row) for row in y]) except TypeError: return y + y = np.array(y) leveled_y = [[i for i in row] + [""] * (depth - len(row)) for row in y] return np.array(leveled_y) diff --git a/hiclass/metrics.py b/hiclass/metrics.py index 5498cbd1..d8926700 100644 --- a/hiclass/metrics.py +++ b/hiclass/metrics.py @@ -7,10 +7,10 @@ def _validate_input(y_true, y_pred): assert len(y_true) == len(y_pred) - y_true = check_array(y_true, dtype=None) - y_pred = check_array(y_pred, dtype=None) y_pred = make_leveled(y_pred) y_true = make_leveled(y_true) + y_true = check_array(y_true, dtype=None) + y_pred = check_array(y_pred, dtype=None) return y_true, y_pred