From 3db55e8d63859eb8cf81c95be744c112e41e94b1 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 18:00:01 +0100 Subject: [PATCH 1/9] Minor edits --- qdscreen/main.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/qdscreen/main.py b/qdscreen/main.py index 85ee03d..13b7df8 100644 --- a/qdscreen/main.py +++ b/qdscreen/main.py @@ -34,13 +34,14 @@ def _add_names_to_parents_idx_series(parents): class QDForest(object): """A quasi-deterministic forest returned by `qd_screen`""" - __slots__ = ('_adjmat', # a square numpy array or pandas DataFrame containing the adjacency matrix (parent->child) - '_parents', # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root - 'is_nparray', # a boolean indicating if this was built from numpy array (and not pandas dataframe) - '_roots_mask', # a 1d np array or pd Series containing a boolean mask for root variables - '_roots_wc_mask', # a 1d np array or pd Series containing a boolean mask for root with children - 'stats' # an optional `Entropies` object stored for debug - ) + __slots__ = ( + '_adjmat', # a square np array or pd DataFrame containing the adjacency matrix (parent->child) + '_parents', # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root + 'is_nparray', # a boolean indicating if this was built from numpy array (and not pandas dataframe) + '_roots_mask', # a 1d np array or pd Series containing a boolean mask for root variables + '_roots_wc_mask', # a 1d np array or pd Series containing a boolean mask for root with children + 'stats' # an optional `Entropies` object stored for debug + ) def __init__(self, adjmat=None, # type: Union[np.ndarray, pd.DataFrame] @@ -129,13 +130,13 @@ def mask_to_indices(self, mask): @property def adjmat_ar(self): """The adjacency matrix as a 2D numpy array""" - return self.adjmat if self.is_nparray else self.adjmat.values + return self.adjmat if self.is_nparray else self.adjmat.values @property def adjmat(self): """The adjacency matrix as a pandas DataFrame or a 2D numpy array""" if self._adjmat is None: - # compute adjmat from parents. + # compute adjmat from parents and cache it n = self.nb_vars adjmat = np.zeros((n, n), dtype=bool) # from https://stackoverflow.com/a/46018613/7262247 From eb095833c6d83b1a0bc45c02f4c209edd50edfab Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 18:02:27 +0100 Subject: [PATCH 2/9] Fixed `AttributeError: module 'numpy' has no attribute 'object'.`. Fixed #38 --- qdscreen/sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qdscreen/sklearn.py b/qdscreen/sklearn.py index 237842b..97570a0 100644 --- a/qdscreen/sklearn.py +++ b/qdscreen/sklearn.py @@ -89,7 +89,7 @@ def fit(self, X, y=None): self """ X = self._validate_data(X, accept_sparse=False, #('csr', 'csc'), - dtype=np.object, + dtype=object, force_all_finite='allow-nan') # if hasattr(X, "toarray"): # sparse matrix From 996f1f7fe3235574c727a0b74908dc5a2da2d323 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 18:18:08 +0100 Subject: [PATCH 3/9] Added a `non_categorical_mode` argument to `qd_screen` and to `get_categorical_features` and added input validators. Fixed #37 --- qdscreen/main.py | 70 +++++++++++++++++++++++++++++++++++++------- qdscreen/selector.py | 47 +++++++++++++++++++++++++++-- 2 files changed, 103 insertions(+), 14 deletions(-) diff --git a/qdscreen/main.py b/qdscreen/main.py index 13b7df8..e6cdf8f 100644 --- a/qdscreen/main.py +++ b/qdscreen/main.py @@ -544,10 +544,30 @@ def plot_increasing_entropies(self): self.stats.plot_increasing_entropies() -def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray] +def assert_df_or_2D_array(df_or_array # type: Union[pd.DataFrame, np.ndarray] + ): + """ + Raises a ValueError if `df_or_array` is + + :param df_or_array: + :return: + """ + if isinstance(df_or_array, pd.DataFrame): + pass + elif isinstance(df_or_array, np.ndarray): + # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes + if len(df_or_array.shape) != 2: + raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % len(df_or_array.shape)) + else: + # Raise error + raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`") + + +def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray] absolute_eps=None, # type: float relative_eps=None, # type: float - keep_stats=False # type: bool + keep_stats=False, # type: bool + non_categorical_mode='strict', ): # type: (...) -> QDForest """ @@ -575,12 +595,18 @@ def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray] memory in the resulting forest object (`.stats`), for further analysis. By default this is `False`. :return: """ - # only work on the categorical features - X = get_categorical_features(X) + # Make sure this is a 2D table + assert_df_or_2D_array(X) - # sanity check + # Sanity check: are there rows in here ? if len(X) == 0: - raise ValueError("Empty dataset provided") + raise ValueError("Provided dataset does not contain any row") + + # Only work on the categorical features + X = get_categorical_features(X, non_categorical_mode=non_categorical_mode) + + # Sanity check concerning the number of columns + assert X.shape[1] > 0, "Internal error: no columns remain in dataset after preprocessing." # parameters check and defaults if absolute_eps is None: @@ -1144,28 +1170,49 @@ def get_arcs_from_adjmat(A, # type: Union[np.ndarray, pd.DataFra return ((cols[i], cols[j]) for i, j in zip(*res_ar)) -def get_categorical_features(df_or_array # type: Union[np.ndarray, pd.DataFrame] +def get_categorical_features(df_or_array, # type: Union[np.ndarray, pd.DataFrame] + non_categorical_mode="strict" # type: str ): # type: (...) -> Union[np.ndarray, pd.DataFrame] """ :param df_or_array: + :param non_categorical_mode: :return: a dataframe or array with the categorical features """ + assert_df_or_2D_array(df_or_array) + + if non_categorical_mode == "strict": + strict_mode = True + elif non_categorical_mode == "remove": + strict_mode = False + else: + raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode) + if isinstance(df_or_array, pd.DataFrame): is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "categorical"]) - if not is_categorical_dtype.any(): - raise TypeError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or " + if strict_mode and not is_categorical_dtype.all(): + raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or " + "'categorical'): found dtypes %r. This is not supported when `non_categorical_mode` is set to " + "`'strict'`" % df_or_array.dtypes[~is_categorical_dtype].to_dict()) + elif not is_categorical_dtype.any(): + raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or " "'categorical'): found dtypes %r" % df_or_array.dtypes[~is_categorical_dtype].to_dict()) return df_or_array.loc[:, is_categorical_dtype] + elif isinstance(df_or_array, np.ndarray): # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes if df_or_array.dtype.names is not None: # structured array is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object" for n in df_or_array.dtype.names]) - if not is_categorical_dtype.any(): - raise TypeError( + if strict_mode and not is_categorical_dtype.all(): + invalid_dtypes = df_or_array.dtype[~is_categorical_dtype].asdict() + raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): " + "found dtypes %r. This is not supported when `non_categorical_mode` is set to " + "`'strict'`" % invalid_dtypes) + elif not is_categorical_dtype.any(): + raise ValueError( "Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or " "'categorical'): found dtypes %r" % df_or_array.dtype.fields) categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype] @@ -1177,6 +1224,7 @@ def get_categorical_features(df_or_array # type: Union[np.ndarray, pd.DataFrame % df_or_array.dtype) return df_or_array else: + # Should not happen since `assert_df_or_2D_array` is called upfront now. raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray") diff --git a/qdscreen/selector.py b/qdscreen/selector.py index c104600..02c4a45 100644 --- a/qdscreen/selector.py +++ b/qdscreen/selector.py @@ -10,6 +10,10 @@ from .main import QDForest +class InvalidDataInputError(ValueError): + """Raised when input data is invalid""" + + def _get_most_common_value(x): # From https://stackoverflow.com/a/47778607/7262247 # `scipy_mode` is the most robust to the various pitfalls (nans, ...) @@ -36,12 +40,47 @@ def __init__(self, self.forest = qd_forest self._maps = None # type: Optional[Dict[Any, Dict[Any, Dict]]] - def fit(self, - X # type: Union[np.ndarray, pd.DataFrame] - ): + def assert_valid_input( + self, + X, # type: Union[np.ndarray, pd.DataFrame] + df_extras_allowed=False # type: bool + ): + """Raises an InvalidDataInputError if X does not match the expectation""" + + if self.forest.is_nparray: + if not isinstance(X, np.ndarray): + raise InvalidDataInputError( + "Input data must be an numpy array. Found: %s" % type(X)) + + if X.shape[1] != self.forest.nb_vars: # or X.shape[0] != X.shape[1]: + raise InvalidDataInputError( + "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1])) + else: + if not isinstance(X, pd.DataFrame): + raise InvalidDataInputError( + "Input data must be a pandas DataFrame. Found: %s" % type(X)) + + actual = set(X.columns) + expected = set(self.forest.varnames) + if actual != expected: + missing = expected - actual + if missing or not df_extras_allowed: + extra = actual - expected + raise InvalidDataInputError( + "Input pandas DataFrame must have column names matching the ones in the model. " + "Missing: %s. Extra: %s " % (missing, extra) + ) + + def fit( + self, + X # type: Union[np.ndarray, pd.DataFrame] + ): """Fits the maps able to predict determined features from others""" forest = self.forest + # Validate the input + self.assert_valid_input(X, df_extras_allowed=False) + # we will create a sparse coordinate representation of maps n = forest.nb_vars @@ -118,6 +157,8 @@ def remove_qd(self, """ forest = self.forest + self.assert_valid_input(X, df_extras_allowed=True) + is_x_nparray = isinstance(X, np.ndarray) assert is_x_nparray == forest.is_nparray From 5760cd581688af13d76d6225bab64ebc6fcbb0d6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 18:40:10 +0100 Subject: [PATCH 4/9] Fixed `ValueError: invalid literal for int() with base 10` in `predict_qd`. Fixed #40 Replaced usage of deprecated `scipy_mode`. Fixed #39 --- qdscreen/selector.py | 52 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/qdscreen/selector.py b/qdscreen/selector.py index 02c4a45..e6d66a2 100644 --- a/qdscreen/selector.py +++ b/qdscreen/selector.py @@ -17,7 +17,41 @@ class InvalidDataInputError(ValueError): def _get_most_common_value(x): # From https://stackoverflow.com/a/47778607/7262247 # `scipy_mode` is the most robust to the various pitfalls (nans, ...) - return scipy_mode(x)[0][0] + # but they will deprecate it + # return scipy_mode(x, nan_policy=None)[0][0] + res = x.mode(dropna=True) + if len(res) == 0: + return np.nan + else: + return res + + +class ParentChildMapping: + __slots__ = ('_mapping_dct', '_otypes') + + def __init__( + self, + mapping_dct # type: Dict + ): + self._mapping_dct = mapping_dct + # Find the correct otype to use in the vectorized operation + self._otypes = [np.array(mapping_dct.values()).dtype] + + def predict_child_from_parent_ar( + self, + parent_values # type: np.ndarray + ): + """For numpy""" + # apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247 + return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values) + + def predict_child_from_parent( + self, + parent_values # type: pd.DataFrame + ): + """For pandas""" + # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary + return parent_values.map(self._mapping_dct) class QDSelectorModel(object): @@ -118,8 +152,11 @@ def fit( pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"]) levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value) + # Init the dict for parent if it does not exit maps.setdefault(parent, dict()) - maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict() + + # Fill the parent-child item with the mapping object + maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict()) else: assert isinstance(X, pd.DataFrame) @@ -139,8 +176,11 @@ def fit( pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"]) levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value) + # Init the dict for parent if it does not exit maps.setdefault(parent, dict()) - maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict() + + # Fill the parent-child item with the mapping object + maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict()) def remove_qd(self, X, # type: Union[np.ndarray, pd.DataFrame] @@ -228,8 +268,7 @@ def predict_qd(self, # walk the tree from the roots for _, parent, child in forest.walk_arcs(): - # apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247 - X[:, child] = np.vectorize(self._maps[parent][child].__getitem__)(X[:, parent]) + X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent]) else: if not inplace: X = X.copy() @@ -237,8 +276,7 @@ def predict_qd(self, # walk the tree from the roots varnames = forest.varnames for _, parent, child in forest.walk_arcs(names=False): - # apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247 - X.loc[:, varnames[child]] = np.vectorize(self._maps[parent][child].__getitem__)(X.loc[:, varnames[parent]]) + X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]]) if not inplace: return X From b8271662d6cf8110efbb862aa0afae6002bbe5ac Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 21:22:06 +0100 Subject: [PATCH 5/9] Added tests for issues 37 and 40 --- qdscreen/tests/test_core.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/qdscreen/tests/test_core.py b/qdscreen/tests/test_core.py index 9bcebf3..90167fa 100644 --- a/qdscreen/tests/test_core.py +++ b/qdscreen/tests/test_core.py @@ -306,4 +306,27 @@ def test_nans_in_data_sklearn(): selector = QDScreen() Xsel = selector.fit_transform(df.to_numpy()) + assert Xsel.tolist() == [['A'], ['A'], ['N']] + + +def test_issue_37_non_categorical(): + df = pd.DataFrame({ + "nb": [1, 2], + "name": ["A", "B"] + }) + with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"): + qd_screen(df) + + +def test_issue_40_nan_then_str(): + df = pd.DataFrame({ + "foo": ["1", "2"], + "bar": [np.nan, "B"] + }) + qd_forest = qd_screen(df) + feat_selector = qd_forest.fit_selector_model(df) + only_important_features_df = feat_selector.remove_qd(df) + result = feat_selector.predict_qd(only_important_features_df) + + pd.testing.assert_frame_equal(df, result) From 082a6f184267c833d873b11208b2edc4e1b4ab53 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 21:23:10 +0100 Subject: [PATCH 6/9] 0.6.4 changelog --- docs/changelog.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index b661387..db88c66 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,12 @@ # Changelog +### 0.6.4 - Bugfixes + + - Replaced usage of deprecated `scipy_mode`. Fixed [#39](https://github.com/python-qds/qdscreen/issues/39) + - Fixed `ValueError: invalid literal for int() with base 10` in `predict_qd`. Fixed [#40](https://github.com/python-qds/qdscreen/issues/40) + - Added input validators to raise human-readable error messages when the input is not correct. Fixes [#37](https://github.com/python-qds/qdscreen/issues/37) + - Fixed `AttributeError: module 'numpy' has no attribute 'object'.`. Fixes [#38](https://github.com/python-qds/qdscreen/issues/38) + ### 0.6.3 - Bugfixes - Fixed `ValueError` with recent versions of `SciPy`, due to usage of sparse arrays with object dtype. Fixes [#31](https://github.com/python-qds/qdscreen/issues/31) From e82dabc347746a3b955d53cf2a946fcf1730a35a Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 21:33:45 +0100 Subject: [PATCH 7/9] Fixed CI for 2.7 3.5 and 3.6. Fixes #42 --- .github/workflows/base.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 550c9f4..4189358 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -46,7 +46,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-latest ] # , macos-latest, windows-latest] + # see https://github.com/actions/setup-python/issues/544 + os: [ ubuntu-20.04 ] # ubuntu-latest, macos-latest, windows-latest] # all nox sessions: manually > dynamically from previous job # nox_session: ["tests-2.7", "tests-3.7"] nox_session: ${{ fromJson(needs.list_nox_test_sessions.outputs.matrix) }} From 558803d5784c0fec19219118747126d5319d5eb6 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Thu, 16 Mar 2023 21:56:24 +0100 Subject: [PATCH 8/9] Improved explanation about the bug on 2.7 and 3.5 --- qdscreen/tests/test_core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qdscreen/tests/test_core.py b/qdscreen/tests/test_core.py index 90167fa..566216a 100644 --- a/qdscreen/tests/test_core.py +++ b/qdscreen/tests/test_core.py @@ -325,8 +325,11 @@ def test_issue_40_nan_then_str(): "bar": [np.nan, "B"] }) qd_forest = qd_screen(df) + assert list(qd_forest.roots) == ["foo"] + feat_selector = qd_forest.fit_selector_model(df) only_important_features_df = feat_selector.remove_qd(df) - result = feat_selector.predict_qd(only_important_features_df) + assert list(only_important_features_df.columns) == ["foo"] + result = feat_selector.predict_qd(only_important_features_df) pd.testing.assert_frame_equal(df, result) From ace9df6dca7fb54c1c79245c0438e7fddb2d4270 Mon Sep 17 00:00:00 2001 From: Sylvain MARIE Date: Fri, 17 Mar 2023 09:54:25 +0100 Subject: [PATCH 9/9] Skipping failing test until #43 is fixed --- qdscreen/tests/test_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qdscreen/tests/test_core.py b/qdscreen/tests/test_core.py index 566216a..22aecc1 100644 --- a/qdscreen/tests/test_core.py +++ b/qdscreen/tests/test_core.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- # the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments) # from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2 +import sys + import numpy as np import pandas as pd import pytest @@ -319,6 +321,8 @@ def test_issue_37_non_categorical(): qd_screen(df) +@pytest.mark.skipif(sys.version_info < (3, 6), + reason="This test is known to fail for 3.5 and 2.7, see GH#43") def test_issue_40_nan_then_str(): df = pd.DataFrame({ "foo": ["1", "2"],