Skip to content

Commit

Permalink
add support for newer pandas sparse dataframes in frequent_patterns (#…
Browse files Browse the repository at this point in the history
…621)

* add support for newer pandas sparse dataframes in frequent_patterns

SparseDataFrame has been deprecated; pandas now recommends creating
standard DataFrames and storing sparse Series as SparseArray.  This
allows combining both dense and sparse columns.

Improve valid_input_check with sparse values; it previously uncompressed
the whole dataframe to check for invalid values, which defeats its purpose.
It now only checks existing values, which prevents memory error, and is
also much faster.

Update apriori.ipynb notebook to use the new pandas sparse DataFrame.

* use pandas version number to detect new sparse dataframes

* limitations on columns of sparse DataFrame still apply

Contrary to what I thought in 0a8fa8f, this issue is still not fixed.

* fix bug introduced by commit 22915a8 with empty DataFrame
  • Loading branch information
dbarbier authored and rasbt committed Nov 6, 2019
1 parent 2f928cb commit fa643e2
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 22 deletions.
1 change: 1 addition & 0 deletions docs/sources/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ The CHANGELOG for the current development version is available at
##### Changes

- Improve the runtime performance for the `apriori` frequent itemset generating function when `low_memory=True`. Setting `low_memory=False` (default) is still faster for small itemsets, but `low_memory=True` can be much faster for large itemsets and requires less memory. Also, input validation for `apriori`, `fpgrowth` and `fpmax` takes a significant amount of time when the input pandas DataFrame is large; this is now dramatically reduced when the input contains boolean values (and not zeros/ones), which is the case when using `TransactionEncoder`. ([#619](https://github.com/rasbt/mlxtend/pull/619) via [Denis Barbier](https://github.com/dbarbier))
- Add support for newer sparse pandas DataFrames in the frequent itemset algorithms. Also, input validation for `apriori`, `fpgrowth` and `fpmax` runs much faster on a sparse DataFrame when the input pandas DataFrame contains integer values. ([#621](https://github.com/rasbt/mlxtend/pull/621) via [Denis Barbier](https://github.com/dbarbier))

##### Bug Fixes
- Fixes a bug in `mlxtend.plotting.plot_pca_correlation_graph` that caused the explained variances not summing up to 1. Also, improves the runtime performance of the correlation computation and adds a missing function argument for the explained variances (eigenvalues) if users provide their own principal components. ([#593](https://github.com/rasbt/mlxtend/issues/593) via [Gabriel Azevedo Ferreira](https://github.com/Gabriel-Azevedo-Ferreira))
Expand Down
2 changes: 1 addition & 1 deletion docs/sources/user_guide/frequent_patterns/apriori.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@
],
"source": [
"oht_ary = te.fit(dataset).transform(dataset, sparse=True)\n",
"sparse_df = pd.SparseDataFrame(oht_ary, columns=te.columns_, default_fill_value=False)\n",
"sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)\n",
"sparse_df"
]
},
Expand Down
23 changes: 20 additions & 3 deletions mlxtend/frequent_patterns/apriori.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,11 +230,28 @@ def _support(_x, _n_rows, _is_sparse):

fpc.valid_input_check(df)

is_sparse = hasattr(df, "to_coo")
if is_sparse:
X = df.to_coo().tocsc()
# sparse attribute exists for both deprecated SparseDataFrame and
# DataFrame with SparseArray (pandas >= 0.24); to_coo attribute
# exists only for the former, thus it is checked first to distinguish
# between SparseDataFrame and DataFrame with SparseArray.
if hasattr(df, "to_coo"):
# SparseDataFrame with pandas < 0.24
if df.size == 0:
X = df.values
else:
X = df.to_coo().tocsc()
is_sparse = True
elif hasattr(df, "sparse"):
# DataFrame with SparseArray (pandas >= 0.24)
if df.size == 0:
X = df.values
else:
X = df.sparse.to_coo().tocsc()
is_sparse = True
else:
# dense DataFrame
X = df.values
is_sparse = False
support = _support(X, X.shape[0], is_sparse)
ary_col_idx = np.arange(X.shape[1])
support_dict = {1: support[support >= min_support]}
Expand Down
46 changes: 34 additions & 12 deletions mlxtend/frequent_patterns/fpcommon.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pandas as pd
import collections
from distutils.version import LooseVersion as Version
from pandas import __version__ as pandas_version


def setup_fptree(df, min_support):
Expand Down Expand Up @@ -48,25 +50,45 @@ def generate_itemsets(generator, num_itemsets, colname_map):


def valid_input_check(df):
    """Check that ``df`` only holds the values True, False, 0 or 1.

    Handles dense DataFrames, the deprecated ``SparseDataFrame`` and
    DataFrames whose columns are sparse arrays.  For sparse input only the
    explicitly stored entries are inspected, so the frame is never
    densified.

    Parameters
    ----------
    df : pandas DataFrame
        One-hot encoded transactions to validate.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a sparse input has a first column label that is neither a string
        nor ``0``, or if any inspected value is different from 0 and 1.
    """
    # An empty frame has nothing to validate, and no first column to inspect.
    if df.size == 0:
        return

    # `sparse` exists both on the deprecated SparseDataFrame and on a
    # DataFrame of sparse columns, whereas `to_coo` exists only on the
    # former; `to_coo` is therefore probed first to tell them apart.
    is_legacy_sparse = hasattr(df, "to_coo")
    is_sparse = is_legacy_sparse or hasattr(df, "sparse")

    if is_sparse:
        first_label = df.columns[0]
        if not isinstance(first_label, str) and first_label != 0:
            # NOTE(review): the message repeats "names," and misplaces the
            # closing bracket; it is kept verbatim because callers or tests
            # may match on the exact text -- confirm before rewording.
            raise ValueError('Due to current limitations in Pandas, '
                             'if the SparseDataFrame has integer column names,'
                             'names, please make sure they either start '
                             'with `0` or cast them as string column names: '
                             '`df.columns = [str(i) for i in df.columns`].')

    # Fast path: if all columns are boolean, there is nothing to check.
    # Detect sparse dtype support by feature rather than by parsing the
    # pandas version string.
    all_bools = df.dtypes == bool
    sparse_dtype = getattr(pd, "SparseDtype", None)
    if sparse_dtype is not None:
        all_bools = all_bools | (df.dtypes == sparse_dtype(bool))
    if all_bools.all():
        return

    # Pandas is much slower than numpy, so run np.where on numpy arrays.
    if is_legacy_sparse:
        values = df.to_coo().data
    elif is_sparse:
        # NOTE(review): only explicitly stored entries are looked at here;
        # the fill value itself is not validated.
        values = df.sparse.to_coo().data
    else:
        values = df.values

    idxs = np.where((values != 1) & (values != 0))
    if len(idxs[0]) > 0:
        # idxs has 1 dimension with sparse data and 2 with dense data
        val = values[tuple(loc[0] for loc in idxs)]
        s = ('The allowed values for a DataFrame'
             ' are True, False, 0, 1. Found value %s' % (val))
        raise ValueError(s)


class FPTree(object):
def __init__(self, rank=None):
Expand Down
42 changes: 36 additions & 6 deletions mlxtend/frequent_patterns/tests/test_fpbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from mlxtend.utils import assert_raises
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
from pandas import __version__ as pandas_version
from distutils.version import LooseVersion as Version
import sys
from contextlib import contextmanager
from io import StringIO
Expand Down Expand Up @@ -69,13 +71,19 @@ def test_itemsets_type(self):
assert isinstance(i, frozenset) is True

def test_raise_error_if_input_is_not_binary(self):
    """The algorithm must reject a frame holding a value other than 0/1.

    The same invalid frame is submitted as a dense DataFrame and, when the
    installed pandas provides them, as a deprecated SparseDataFrame and as
    a DataFrame with sparse columns.
    """
    def test_with_dataframe(df):
        assert_raises(ValueError,
                      'The allowed values for a DataFrame are True, '
                      'False, 0, 1. Found value 2',
                      self.fpalgo, df)

    df2 = pd.DataFrame(self.one_ary, columns=self.cols).copy()
    df2.iloc[3, 3] = 2
    test_with_dataframe(df2)

    # `to_sparse` only exists on older pandas releases; probe for it so the
    # test does not fail with AttributeError where it has been removed.
    if hasattr(df2, "to_sparse"):
        test_with_dataframe(df2.to_sparse())

    # Sparse columns in a regular DataFrame need pandas.SparseDtype.
    if hasattr(pd, "SparseDtype"):
        sdf2 = df2.astype(pd.SparseDtype(int, fill_value=0))
        test_with_dataframe(sdf2)

def test_sparsedataframe_notzero_column(self):
dfs = pd.SparseDataFrame(self.df)
Expand Down Expand Up @@ -131,7 +139,7 @@ def test_frozenset_selection(self):
== frozenset(('Kidney Beans', 'Milk'))].values.shape \
== (1, 2)

def test_sparse(self):
def test_sparse_deprecated(self):
def test_with_fill_values(fill_value):
sdf = self.df.to_sparse(fill_value=fill_value)
res_df = self.fpalgo(sdf, use_colnames=True)
Expand All @@ -149,6 +157,28 @@ def test_with_fill_values(fill_value):
test_with_fill_values(0)
test_with_fill_values(False)

def test_sparse(self):
    """Run the algorithm on a DataFrame whose columns are sparse arrays.

    Does nothing on pandas releases older than 0.24.  For each fill value
    the result must have the same shape as the dense result and must
    contain the expected itemsets.
    """
    if Version(pandas_version) < Version("0.24"):
        return

    def test_with_fill_values(fill_value):
        sparse_dtype = pd.SparseDtype(type(fill_value),
                                      fill_value=fill_value)
        sparse_input = self.df.astype(sparse_dtype)
        result = self.fpalgo(sparse_input, use_colnames=True)
        assert result.values.shape == self.fpalgo(self.df).values.shape

        def shape_of_matches(itemset):
            # Shape of the result rows whose itemset equals `itemset`.
            return result[result['itemsets'] == itemset].values.shape

        assert shape_of_matches('nothing') == (0, 2)
        assert shape_of_matches({'Milk', 'Kidney Beans'}) == (1, 2)
        assert shape_of_matches(
            frozenset(('Milk', 'Kidney Beans'))) == (1, 2)
        assert shape_of_matches(
            frozenset(('Kidney Beans', 'Milk'))) == (1, 2)

    for fill_value in (0, False):
        test_with_fill_values(fill_value)


class FPTestEx1All(FPTestEx1):
def setUp(self, fpalgo, one_ary=None):
Expand Down

0 comments on commit fa643e2

Please sign in to comment.