Skip to content

Commit

Permalink
Merge pull request #41 from python-qds/feature/36_37_invalid_inputs_a…
Browse files Browse the repository at this point in the history
…nd_categorical

Feature/36 37 invalid inputs and categorical
  • Loading branch information
smarie committed Mar 17, 2023
2 parents 2a43dac + ace9df6 commit 28908d6
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 32 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/base.yml
Expand Up @@ -46,7 +46,8 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest ] # , macos-latest, windows-latest]
# see https://github.com/actions/setup-python/issues/544
os: [ ubuntu-20.04 ] # ubuntu-latest, macos-latest, windows-latest]
# all nox sessions: manually > dynamically from previous job
# nox_session: ["tests-2.7", "tests-3.7"]
nox_session: ${{ fromJson(needs.list_nox_test_sessions.outputs.matrix) }}
Expand Down
7 changes: 7 additions & 0 deletions docs/changelog.md
@@ -1,5 +1,12 @@
# Changelog

### 0.6.4 - Bugfixes

- Replaced usage of the deprecated `scipy_mode`. Fixes [#39](https://github.com/python-qds/qdscreen/issues/39)
- Fixed `ValueError: invalid literal for int() with base 10` in `predict_qd`. Fixes [#40](https://github.com/python-qds/qdscreen/issues/40)
- Added input validators to raise human-readable error messages when the input is not correct. Fixes [#37](https://github.com/python-qds/qdscreen/issues/37)
- Fixed `AttributeError: module 'numpy' has no attribute 'object'`. Fixes [#38](https://github.com/python-qds/qdscreen/issues/38)

### 0.6.3 - Bugfixes

- Fixed `ValueError` with recent versions of `SciPy`, due to usage of sparse arrays with object dtype. Fixes [#31](https://github.com/python-qds/qdscreen/issues/31)
Expand Down
89 changes: 69 additions & 20 deletions qdscreen/main.py
Expand Up @@ -34,13 +34,14 @@ def _add_names_to_parents_idx_series(parents):

class QDForest(object):
"""A quasi-deterministic forest returned by `qd_screen`"""
__slots__ = ('_adjmat', # a square numpy array or pandas DataFrame containing the adjacency matrix (parent->child)
'_parents', # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
'is_nparray', # a boolean indicating if this was built from numpy array (and not pandas dataframe)
'_roots_mask', # a 1d np array or pd Series containing a boolean mask for root variables
'_roots_wc_mask', # a 1d np array or pd Series containing a boolean mask for root with children
'stats' # an optional `Entropies` object stored for debug
)
__slots__ = (
'_adjmat', # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
'_parents', # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
'is_nparray', # a boolean indicating if this was built from numpy array (and not pandas dataframe)
'_roots_mask', # a 1d np array or pd Series containing a boolean mask for root variables
'_roots_wc_mask', # a 1d np array or pd Series containing a boolean mask for root with children
'stats' # an optional `Entropies` object stored for debug
)

def __init__(self,
adjmat=None, # type: Union[np.ndarray, pd.DataFrame]
Expand Down Expand Up @@ -129,13 +130,13 @@ def mask_to_indices(self, mask):
@property
def adjmat_ar(self):
"""The adjacency matrix as a 2D numpy array"""
return self.adjmat if self.is_nparray else self.adjmat.values
return self.adjmat if self.is_nparray else self.adjmat.values

@property
def adjmat(self):
"""The adjacency matrix as a pandas DataFrame or a 2D numpy array"""
if self._adjmat is None:
# compute adjmat from parents.
# compute adjmat from parents and cache it
n = self.nb_vars
adjmat = np.zeros((n, n), dtype=bool)
# from https://stackoverflow.com/a/46018613/7262247
Expand Down Expand Up @@ -543,10 +544,30 @@ def plot_increasing_entropies(self):
self.stats.plot_increasing_entropies()


def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray]
def assert_df_or_2D_array(df_or_array  # type: Union[pd.DataFrame, np.ndarray]
                          ):
    # type: (...) -> None
    """
    Validate that the provided data is a table: either a pandas DataFrame or a 2D numpy array.

    A `pd.DataFrame` is accepted as is. A `np.ndarray` is accepted only if its shape has exactly two entries.

    :param df_or_array: the object to validate.
    :return: None
    :raises ValueError: if `df_or_array` is a numpy array whose number of dimensions is not 2.
    :raises TypeError: if `df_or_array` is neither a `pd.DataFrame` nor a `np.ndarray`.
    """
    if isinstance(df_or_array, pd.DataFrame):
        # Nothing more to check for dataframes
        return

    if not isinstance(df_or_array, np.ndarray):
        raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")

    nb_dims = len(df_or_array.shape)
    if nb_dims != 2:
        raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % nb_dims)


def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray]
absolute_eps=None, # type: float
relative_eps=None, # type: float
keep_stats=False # type: bool
keep_stats=False, # type: bool
non_categorical_mode='strict',
):
# type: (...) -> QDForest
"""
Expand Down Expand Up @@ -574,12 +595,18 @@ def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray]
memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
:return:
"""
# only work on the categorical features
X = get_categorical_features(X)
# Make sure this is a 2D table
assert_df_or_2D_array(X)

# sanity check
# Sanity check: are there rows in here ?
if len(X) == 0:
raise ValueError("Empty dataset provided")
raise ValueError("Provided dataset does not contain any row")

# Only work on the categorical features
X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)

# Sanity check concerning the number of columns
assert X.shape[1] > 0, "Internal error: no columns remain in dataset after preprocessing."

# parameters check and defaults
if absolute_eps is None:
Expand Down Expand Up @@ -1143,28 +1170,49 @@ def get_arcs_from_adjmat(A, # type: Union[np.ndarray, pd.DataFra
return ((cols[i], cols[j]) for i, j in zip(*res_ar))


def get_categorical_features(df_or_array # type: Union[np.ndarray, pd.DataFrame]
def get_categorical_features(df_or_array, # type: Union[np.ndarray, pd.DataFrame]
non_categorical_mode="strict" # type: str
):
# type: (...) -> Union[np.ndarray, pd.DataFrame]
"""
:param df_or_array:
:param non_categorical_mode:
:return: a dataframe or array with the categorical features
"""
assert_df_or_2D_array(df_or_array)

if non_categorical_mode == "strict":
strict_mode = True
elif non_categorical_mode == "remove":
strict_mode = False
else:
raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)

if isinstance(df_or_array, pd.DataFrame):
is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "categorical"])
if not is_categorical_dtype.any():
raise TypeError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
if strict_mode and not is_categorical_dtype.all():
raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
"'categorical'): found dtypes %r. This is not supported when `non_categorical_mode` is set to "
"`'strict'`" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
elif not is_categorical_dtype.any():
raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
"'categorical'): found dtypes %r" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
return df_or_array.loc[:, is_categorical_dtype]

elif isinstance(df_or_array, np.ndarray):
# see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
if df_or_array.dtype.names is not None:
# structured array
is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
for n in df_or_array.dtype.names])
if not is_categorical_dtype.any():
raise TypeError(
if strict_mode and not is_categorical_dtype.all():
invalid_dtypes = df_or_array.dtype[~is_categorical_dtype].asdict()
raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
"found dtypes %r. This is not supported when `non_categorical_mode` is set to "
"`'strict'`" % invalid_dtypes)
elif not is_categorical_dtype.any():
raise ValueError(
"Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
"'categorical'): found dtypes %r" % df_or_array.dtype.fields)
categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
Expand All @@ -1176,6 +1224,7 @@ def get_categorical_features(df_or_array # type: Union[np.ndarray, pd.DataFrame
% df_or_array.dtype)
return df_or_array
else:
# Should not happen since `assert_df_or_2D_array` is called upfront now.
raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")


Expand Down
99 changes: 89 additions & 10 deletions qdscreen/selector.py
Expand Up @@ -10,10 +10,48 @@
from .main import QDForest


class InvalidDataInputError(ValueError):
    """Error raised when the data provided as input is not valid.

    It is a `ValueError` subclass, so code catching `ValueError` also catches it.
    """


def _get_most_common_value(x):
    """
    Return the most frequent non-NaN value found in `x`, as a scalar.

    This is used as a `groupby(...).agg(...)` aggregation function, so it must return a single value per group.
    `scipy_mode` used to do this job but its usage was replaced since it is deprecated; see also
    https://stackoverflow.com/a/47778607/7262247

    :param x: a pandas Series.
    :return: the most frequent value in `x`, NaNs being ignored. When several values are tied, the first one
        returned by `x.mode(dropna=True)` is used. If `x` contains no non-NaN value, `np.nan` is returned.
    """
    modes = x.mode(dropna=True)
    if len(modes) == 0:
        # empty or NaN-only group: there is no most common value
        return np.nan

    # `mode` returns a Series with *all* tied modes: keep a single scalar so that the aggregation stays valid
    return modes.iloc[0]


class ParentChildMapping(object):
    """
    A mapping from the values of a parent variable to the values of one of its child variables.

    It wraps a plain dictionary `{parent_value: child_value}` and provides two ways to apply it on a whole column
    at once: one for numpy arrays and one for pandas.

    Note: this is a new-style class (it inherits from `object`) so that `__slots__` is also effective on python 2.
    """
    __slots__ = (
        '_mapping_dct',  # the dictionary {parent value: child value}
        '_otypes',       # a one-item list with the output dtype passed to `np.vectorize`
    )

    def __init__(
        self,
        mapping_dct  # type: Dict
    ):
        """
        :param mapping_dct: a dictionary where keys are parent values and values are the associated child values.
        """
        self._mapping_dct = mapping_dct
        # Find the otype to use in the vectorized operation.
        # NOTE(review): on python 3 `dict.values()` is a view and not a list, so numpy wraps it in a 0-d object
        # array: the dtype found here is then always `object`. Confirm that this is the intent.
        self._otypes = [np.array(mapping_dct.values()).dtype]

    def predict_child_from_parent_ar(
        self,
        parent_values  # type: np.ndarray
    ):
        """
        For numpy: return the array of child values associated with the given array of parent values.

        :param parent_values: a numpy array of parent values. Each of them is looked up with the dictionary's
            `__getitem__`, so a value that is not a key of the mapping raises a `KeyError`.
        :return: a numpy array of child values.
        """
        # apply the learned map efficiently https://stackoverflow.com/q/16992713/7262247
        return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)

    def predict_child_from_parent(
        self,
        parent_values  # type: pd.Series
    ):
        """
        For pandas: return the child values associated with the given parent values.

        :param parent_values: the pandas object containing the parent values. It must support `.map(<dict>)`.
        :return: the result of `parent_values.map(<mapping dict>)`.
        """
        # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
        return parent_values.map(self._mapping_dct)


class QDSelectorModel(object):
Expand All @@ -36,12 +74,47 @@ def __init__(self,
self.forest = qd_forest
self._maps = None # type: Optional[Dict[Any, Dict[Any, Dict]]]

def fit(self,
X # type: Union[np.ndarray, pd.DataFrame]
):
def assert_valid_input(
    self,
    X,  # type: Union[np.ndarray, pd.DataFrame]
    df_extras_allowed=False  # type: bool
):
    # type: (...) -> None
    """
    Raises an InvalidDataInputError if X does not match the expectation.

    The expectation depends on the forest stored in `self.forest`:

     - if `self.forest.is_nparray` is True, `X` must be a 2D numpy array with `self.forest.nb_vars` columns.
     - otherwise `X` must be a pandas DataFrame with column names matching `self.forest.varnames`.

    :param X: the input data to validate.
    :param df_extras_allowed: only used for DataFrames. If True, columns of `X` that are not in
        `self.forest.varnames` are tolerated, as long as no expected column is missing. Default is False.
    :return: None
    :raises InvalidDataInputError: if `X` does not have the right type, number of dimensions, number of
        columns or column names.
    """
    if self.forest.is_nparray:
        if not isinstance(X, np.ndarray):
            raise InvalidDataInputError(
                "Input data must be an numpy array. Found: %s" % type(X))

        # Check the number of dimensions first, otherwise `X.shape[1]` below fails with a bare IndexError
        if X.ndim != 2:
            raise InvalidDataInputError(
                "Input numpy array must be a 2D array. Found %s dimension(s)" % X.ndim)

        if X.shape[1] != self.forest.nb_vars:
            raise InvalidDataInputError(
                "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
    else:
        if not isinstance(X, pd.DataFrame):
            raise InvalidDataInputError(
                "Input data must be a pandas DataFrame. Found: %s" % type(X))

        actual = set(X.columns)
        expected = set(self.forest.varnames)
        if actual != expected:
            missing = expected - actual
            # Missing columns are always an error. Extra columns are an error only when not allowed.
            if missing or not df_extras_allowed:
                extra = actual - expected
                raise InvalidDataInputError(
                    "Input pandas DataFrame must have column names matching the ones in the model. "
                    "Missing: %s. Extra: %s " % (missing, extra)
                )

def fit(
self,
X # type: Union[np.ndarray, pd.DataFrame]
):
"""Fits the maps able to predict determined features from others"""
forest = self.forest

# Validate the input
self.assert_valid_input(X, df_extras_allowed=False)

# we will create a sparse coordinate representation of maps
n = forest.nb_vars

Expand Down Expand Up @@ -79,8 +152,11 @@ def fit(self,
pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)

# Init the dict for parent if it does not exist
maps.setdefault(parent, dict())
maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict()

# Fill the parent-child item with the mapping object
maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

else:
assert isinstance(X, pd.DataFrame)
Expand All @@ -100,8 +176,11 @@ def fit(self,
pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)

# Init the dict for parent if it does not exist
maps.setdefault(parent, dict())
maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict()

# Fill the parent-child item with the mapping object
maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())

def remove_qd(self,
X, # type: Union[np.ndarray, pd.DataFrame]
Expand All @@ -118,6 +197,8 @@ def remove_qd(self,
"""
forest = self.forest

self.assert_valid_input(X, df_extras_allowed=True)

is_x_nparray = isinstance(X, np.ndarray)
assert is_x_nparray == forest.is_nparray

Expand Down Expand Up @@ -187,17 +268,15 @@ def predict_qd(self,

# walk the tree from the roots
for _, parent, child in forest.walk_arcs():
# apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
X[:, child] = np.vectorize(self._maps[parent][child].__getitem__)(X[:, parent])
X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
else:
if not inplace:
X = X.copy()

# walk the tree from the roots
varnames = forest.varnames
for _, parent, child in forest.walk_arcs(names=False):
# apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
X.loc[:, varnames[child]] = np.vectorize(self._maps[parent][child].__getitem__)(X.loc[:, varnames[parent]])
X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]])

if not inplace:
return X
2 changes: 1 addition & 1 deletion qdscreen/sklearn.py
Expand Up @@ -89,7 +89,7 @@ def fit(self, X, y=None):
self
"""
X = self._validate_data(X, accept_sparse=False, #('csr', 'csc'),
dtype=np.object,
dtype=object,
force_all_finite='allow-nan')

# if hasattr(X, "toarray"): # sparse matrix
Expand Down
30 changes: 30 additions & 0 deletions qdscreen/tests/test_core.py
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)
# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2
import sys

import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -306,4 +308,32 @@ def test_nans_in_data_sklearn():

selector = QDScreen()
Xsel = selector.fit_transform(df.to_numpy())

assert Xsel.tolist() == [['A'], ['A'], ['N']]


def test_issue_37_non_categorical():
    """Non-regression test for GH#37: a dataframe with a non-categorical column raises a readable ValueError."""
    mixed_df = pd.DataFrame({"nb": [1, 2], "name": ["A", "B"]})

    with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):
        qd_screen(mixed_df)


@pytest.mark.skipif(sys.version_info < (3, 6),
                    reason="This test is known to fail for 3.5 and 2.7, see GH#43")
def test_issue_40_nan_then_str():
    """Non-regression test for GH#40: `predict_qd` works when a column contains a NaN followed by a string."""
    input_df = pd.DataFrame({
        "foo": ["1", "2"],
        "bar": [np.nan, "B"]
    })

    # Screening: "bar" is determined by "foo", so "foo" is the only root
    forest = qd_screen(input_df)
    assert list(forest.roots) == ["foo"]

    # Fit the selector model and remove the quasi-determined column
    selector_model = forest.fit_selector_model(input_df)
    reduced_df = selector_model.remove_qd(input_df)
    assert list(reduced_df.columns) == ["foo"]

    # Predicting the removed column back should give the initial dataframe
    predicted_df = selector_model.predict_qd(reduced_df)
    pd.testing.assert_frame_equal(input_df, predicted_df)

0 comments on commit 28908d6

Please sign in to comment.