add support for newer pandas sparse dataframes in frequent_patterns (#…

…621) * add support for newer pandas sparse dataframes in frequent_patterns. SparseDataFrame has been deprecated; pandas now recommends creating standard DataFrames and storing sparse Series as SparseArray. This allows combining both dense and sparse columns. Improve valid_input_check with sparse values; it previously uncompressed the whole dataframe to check for invalid values, which defeated the purpose of using a sparse structure. It now only checks existing values, which prevents memory errors and is also much faster. Update apriori.ipynb notebook to use the new pandas sparse DataFrame. * use pandas version number to detect new sparse dataframes * limitations on columns of sparse DataFrame still apply. Contrary to what I thought in 0a8fa8f, this issue is still not fixed. * fix bug introduced by commit 22915a8 with empty DataFrame
rasbt · Nov 6, 2019 · fa643e2 · fa643e2
1 parent 2f928cb
commit fa643e2
Show file tree

Hide file tree

Showing 5 changed files with 92 additions and 22 deletions.
diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md
@@ -22,6 +22,7 @@ The CHANGELOG for the current development version is available at
 ##### Changes
 
 - Improve the runtime performance for the `apriori` frequent itemset generating function when `low_memory=True`. Setting `low_memory=False` (default) is still faster for small itemsets, but `low_memory=True` can be much faster for large itemsets and requires less memory. Also, input validation for `apriori`, `fpgrowth` and `fpmax` takes a significant amount of time when the input pandas DataFrame is large; this is now dramatically reduced when the input contains boolean values (and not zeros/ones), which is the case when using `TransactionEncoder`. ([#619](https://github.com/rasbt/mlxtend/pull/619) via [Denis Barbier](https://github.com/dbarbier))
+- Add support for newer sparse pandas DataFrames for frequent itemset algorithms. Also, input validation for `apriori`, `fpgrowth` and `fpmax` runs much faster on a sparse DataFrame when the input pandas DataFrame contains integer values. ([#621](https://github.com/rasbt/mlxtend/pull/621) via [Denis Barbier](https://github.com/dbarbier))
 
 ##### Bug Fixes
 - Fixes a bug in `mlxtend.plotting.plot_pca_correlation_graph` that caused the explained variances to not sum up to 1. Also, improves the runtime performance of the correlation computation and adds a missing function argument for the explained variances (eigenvalues) if users provide their own principal components. ([#593](https://github.com/rasbt/mlxtend/issues/593) via [Gabriel Azevedo Ferreira](https://github.com/Gabriel-Azevedo-Ferreira))

diff --git a/docs/sources/user_guide/frequent_patterns/apriori.ipynb b/docs/sources/user_guide/frequent_patterns/apriori.ipynb
@@ -913,7 +913,7 @@
    ],
    "source": [
     "oht_ary = te.fit(dataset).transform(dataset, sparse=True)\n",
-    "sparse_df = pd.SparseDataFrame(oht_ary, columns=te.columns_, default_fill_value=False)\n",
+    "sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)\n",
     "sparse_df"
    ]
   },

diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py
@@ -230,11 +230,28 @@ def _support(_x, _n_rows, _is_sparse):
 
     fpc.valid_input_check(df)
 
-    is_sparse = hasattr(df, "to_coo")
-    if is_sparse:
-        X = df.to_coo().tocsc()
+    # sparse attribute exists for both deprecated SparseDataFrame and
+    # DataFrame with SparseArray (pandas >= 0.24); to_coo attribute
+    # exists only for the former, thus it is checked first to distinguish
+    # between SparseDataFrame and DataFrame with SparseArray.
+    if hasattr(df, "to_coo"):
+        # SparseDataFrame with pandas < 0.24
+        if df.size == 0:
+            X = df.values
+        else:
+            X = df.to_coo().tocsc()
+        is_sparse = True
+    elif hasattr(df, "sparse"):
+        # DataFrame with SparseArray (pandas >= 0.24)
+        if df.size == 0:
+            X = df.values
+        else:
+            X = df.sparse.to_coo().tocsc()
+        is_sparse = True
     else:
+        # dense DataFrame
         X = df.values
+        is_sparse = False
     support = _support(X, X.shape[0], is_sparse)
     ary_col_idx = np.arange(X.shape[1])
     support_dict = {1: support[support >= min_support]}

diff --git a/mlxtend/frequent_patterns/fpcommon.py b/mlxtend/frequent_patterns/fpcommon.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pandas as pd
 import collections
+from distutils.version import LooseVersion as Version
+from pandas import __version__ as pandas_version
 
 
 def setup_fptree(df, min_support):
@@ -48,25 +50,45 @@ def generate_itemsets(generator, num_itemsets, colname_map):
 
 
 def valid_input_check(df):
-    # Fast path: if all columns are boolean, there is nothing to check
-    if not (df.dtypes == bool).all():
-        # Pandas is much slower than numpy, so use df.values instead of df here
-        idxs = np.where((df.values != 1) & (df.values != 0))
-        if len(idxs[0]) > 0:
-            val = df.values[idxs[0][0], idxs[1][0]]
-            s = ('The allowed values for a DataFrame'
-                 ' are True, False, 0, 1. Found value %s' % (val))
-            raise ValueError(s)
-
-    is_sparse = hasattr(df, "to_coo")
-    if is_sparse:
+    if df.size == 0:
+        return
+    if hasattr(df, "to_coo") or hasattr(df, "sparse"):
         if not isinstance(df.columns[0], str) and df.columns[0] != 0:
             raise ValueError('Due to current limitations in Pandas, '
                              'if the SparseDataFrame has integer column names,'
                              'names, please make sure they either start '
                              'with `0` or cast them as string column names: '
                              '`df.columns = [str(i) for i in df.columns`].')
 
+    # Fast path: if all columns are boolean, there is nothing to check
+    if Version(pandas_version) >= Version("0.24"):
+        all_bools = ((df.dtypes == pd.SparseDtype(bool)) |
+                     (df.dtypes == bool)).all()
+    else:
+        all_bools = (df.dtypes == bool).all()
+    if not all_bools:
+        # Pandas is much slower than numpy, so use np.where on Numpy arrays
+        if hasattr(df, "to_coo"):
+            # see comment in apriori.py, to_coo attribute must be checked first
+            if df.size == 0:
+                values = df.values
+            else:
+                values = df.to_coo().tocoo().data
+        elif hasattr(df, "sparse"):
+            if df.size == 0:
+                values = df.values
+            else:
+                values = df.sparse.to_coo().tocoo().data
+        else:
+            values = df.values
+        idxs = np.where((values != 1) & (values != 0))
+        if len(idxs[0]) > 0:
+            # idxs has 1 dimension with sparse data and 2 with dense data
+            val = values[tuple(loc[0] for loc in idxs)]
+            s = ('The allowed values for a DataFrame'
+                 ' are True, False, 0, 1. Found value %s' % (val))
+            raise ValueError(s)
+
 
 class FPTree(object):
     def __init__(self, rank=None):

diff --git a/mlxtend/frequent_patterns/tests/test_fpbase.py b/mlxtend/frequent_patterns/tests/test_fpbase.py
@@ -9,6 +9,8 @@
 from mlxtend.utils import assert_raises
 from mlxtend.preprocessing import TransactionEncoder
 import pandas as pd
+from pandas import __version__ as pandas_version
+from distutils.version import LooseVersion as Version
 import sys
 from contextlib import contextmanager
 from io import StringIO
@@ -69,13 +71,19 @@ def test_itemsets_type(self):
             assert isinstance(i, frozenset) is True
 
     def test_raise_error_if_input_is_not_binary(self):
+        def test_with_dataframe(df):
+            assert_raises(ValueError,
+                          'The allowed values for a DataFrame are True, '
+                          'False, 0, 1. Found value 2',
+                          self.fpalgo, df)
         df2 = pd.DataFrame(self.one_ary, columns=self.cols).copy()
         df2.iloc[3, 3] = 2
-
-        assert_raises(ValueError,
-                      'The allowed values for a DataFrame are True, '
-                      'False, 0, 1. Found value 2',
-                      self.fpalgo, df2)
+        test_with_dataframe(df2)
+        sdf = df2.to_sparse()
+        test_with_dataframe(sdf)
+        if Version(pandas_version) >= Version("0.24"):
+            sdf2 = df2.astype(pd.SparseDtype(int, fill_value=0))
+            test_with_dataframe(sdf2)
 
     def test_sparsedataframe_notzero_column(self):
         dfs = pd.SparseDataFrame(self.df)
@@ -131,7 +139,7 @@ def test_frozenset_selection(self):
                       == frozenset(('Kidney Beans', 'Milk'))].values.shape \
             == (1, 2)
 
-    def test_sparse(self):
+    def test_sparse_deprecated(self):
         def test_with_fill_values(fill_value):
             sdf = self.df.to_sparse(fill_value=fill_value)
             res_df = self.fpalgo(sdf, use_colnames=True)
@@ -149,6 +157,28 @@ def test_with_fill_values(fill_value):
         test_with_fill_values(0)
         test_with_fill_values(False)
 
+    def test_sparse(self):
+        if Version(pandas_version) < Version("0.24"):
+            return
+
+        def test_with_fill_values(fill_value):
+            sdt = pd.SparseDtype(type(fill_value), fill_value=fill_value)
+            sdf = self.df.astype(sdt)
+            res_df = self.fpalgo(sdf, use_colnames=True)
+            assert res_df.values.shape == self.fpalgo(self.df).values.shape
+            assert res_df[res_df['itemsets']
+                          == 'nothing'].values.shape == (0, 2)
+            assert res_df[res_df['itemsets']
+                          == {'Milk', 'Kidney Beans'}].values.shape == (1, 2)
+            assert res_df[res_df['itemsets'] ==
+                          frozenset(('Milk', 'Kidney Beans'))].values.shape \
+                == (1, 2)
+            assert res_df[res_df['itemsets'] ==
+                          frozenset(('Kidney Beans', 'Milk'))].values.shape \
+                == (1, 2)
+        test_with_fill_values(0)
+        test_with_fill_values(False)
+
 
 class FPTestEx1All(FPTestEx1):
     def setUp(self, fpalgo, one_ary=None):