[WIP] Implement apriori-gen as in original paper #646

Open · wants to merge 6 commits into base: master
275 changes: 85 additions & 190 deletions mlxtend/frequent_patterns/apriori.py
@@ -14,24 +14,24 @@ def generate_new_combinations(old_combinations):
Generator of all combinations based on the last state of Apriori algorithm
Parameters
-----------
old_combinations: np.array
old_combinations: list of tuples
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
Combinations are represented by a list of tuples.
All tuples have the same length, which is equal to the combination size
of the previous step.
Each row represents one combination
Each tuple represents one combination
and contains item type ids in the ascending order
```
0 1
0 15 20
1 15 22
2 17 19
15 20
15 22
17 19
```

Returns
-----------
Generator of all combinations from the last step x items
from the previous step.
Generator of combinations based on the last state of Apriori algorithm.
In order to reduce the number of candidates, this function implements the
apriori-gen function described in Section 2.1.1 of the Apriori paper.

Examples
-----------
@@ -40,95 +40,28 @@ def generate_new_combinations(old_combinations):

"""

items_types_in_previous_step = np.unique(old_combinations.flatten())
for old_combination in old_combinations:
max_combination = old_combination[-1]
mask = items_types_in_previous_step > max_combination
valid_items = items_types_in_previous_step[mask]
old_tuple = tuple(old_combination)
for item in valid_items:
yield from old_tuple
yield item


def generate_new_combinations_low_memory(old_combinations, X, min_support,
is_sparse):
"""
Generator of all combinations based on the last state of Apriori algorithm
Parameters
-----------
old_combinations: np.array
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
of the previous step.
Each row represents one combination
and contains item type ids in the ascending order
```
0 1
0 15 20
1 15 22
2 17 19
```

X: np.array or scipy sparse matrix
The allowed values are either 0/1 or True/False.
For example,

```
0 True False True True False True
1 True False True False False True
2 True False True False False False
3 True True False False False False
4 False False True True True True
5 False False True False True True
6 False False True False True False
7 True True False False False False
```

min_support : float (default: 0.5)
A float between 0 and 1 for minumum support of the itemsets returned.
The support is computed as the fraction
`transactions_where_item(s)_occur / total_transactions`.

is_sparse : bool True if X is sparse

Returns
-----------
Generator of all combinations from the last step x items
from the previous step. Every combination contains the
number of transactions where this item occurs, followed
by item type ids in the ascending order.
No combination other than generated
do not have a chance to get enough support

Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/

"""

items_types_in_previous_step = np.unique(old_combinations.flatten())
rows_count = X.shape[0]
threshold = min_support * rows_count
for old_combination in old_combinations:
max_combination = old_combination[-1]
mask = items_types_in_previous_step > max_combination
valid_items = items_types_in_previous_step[mask]
old_tuple = tuple(old_combination)
if is_sparse:
mask_rows = X[:, old_tuple].toarray().all(axis=1)
X_cols = X[:, valid_items].toarray()
supports = X_cols[mask_rows].sum(axis=0)
else:
mask_rows = X[:, old_tuple].all(axis=1)
supports = X[mask_rows][:, valid_items].sum(axis=0)
valid_indices = (supports >= threshold).nonzero()[0]
for index in valid_indices:
yield supports[index]
yield from old_tuple
yield valid_items[index]
    length = len(old_combinations)
    set_old_combinations = set(old_combinations)
    for i, old_combination in enumerate(old_combinations):
        head_i = list(old_combination[:-1])
        j = i + 1
        while j < length:
            *head_j, tail_j = old_combinations[j]
            if head_i != head_j:
                break
            # Prune old_combination + (tail_j,) if any of its subsets is not frequent
            candidate = old_combination + (tail_j,)
            # No need to check the last two (k-1)-subsets: they are
            # old_combinations[i] and old_combinations[j] themselves
            for idx in range(len(candidate) - 2):
                test_candidate = list(candidate)
                del test_candidate[idx]
                if tuple(test_candidate) not in set_old_combinations:
                    # Early exit from the for-loop skips the else clause just below
                    break
            else:
                yield candidate
            j = j + 1
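To make the join and prune steps concrete, here is a small illustrative run of the generator above (not part of the diff; the example itemsets are made up, and the import path assumes the patched module). `(1, 2)` and `(1, 3)` share the head `(1,)`, so they join into the candidate `(1, 2, 3)`, which survives pruning because its remaining 2-subset `(2, 3)` is also frequent; the candidates `(1, 2, 4)` and `(1, 3, 4)` are pruned because `(2, 4)` and `(3, 4)` are not in the frequent set.

```python
from mlxtend.frequent_patterns.apriori import generate_new_combinations

# Frequent 2-itemsets from the previous Apriori pass, as sorted tuples of item ids.
old_itemsets = [(1, 2), (1, 3), (1, 4), (2, 3)]

print(list(generate_new_combinations(old_itemsets)))
# [(1, 2, 3)]
```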


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -168,16 +101,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
possible itemsets lengths (under the apriori condition) are evaluated.

verbose : int (default: 0)
Shows the number of iterations if >= 1 and `low_memory` is `True`. If
>=1 and `low_memory` is `False`, shows the number of combinations.

low_memory : bool (default: False)
If `True`, uses an iterator to search for combinations above
`min_support`.
Note that while `low_memory=True` should only be used for large dataset
if memory resources are limited, because this implementation is approx.
3-6x slower than the default.

Shows the number of combinations if >= 1.

Returns
-----------
@@ -197,32 +121,6 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,

"""

def _support(_x, _n_rows, _is_sparse):
"""DRY private method to calculate support as the
row-wise sum of values / number of rows

Parameters
-----------

_x : matrix of bools or binary

_n_rows : numeric, number of rows in _x

_is_sparse : bool True if _x is sparse

Returns
-----------
np.array, shape = (n_rows, )

Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

"""
out = (np.sum(_x, axis=0) / _n_rows)
return np.array(out).reshape(-1)

if min_support <= 0.:
raise ValueError('`min_support` must be a positive '
'number within the interval `(0, 1]`. '
@@ -240,80 +138,77 @@ def _support(_x, _n_rows, _is_sparse):
X = df.values
else:
X = df.to_coo().tocsc()
# See comment below
X.eliminate_zeros()
is_sparse = True
elif hasattr(df, "sparse"):
# DataFrame with SparseArray (pandas >= 0.24)
if df.size == 0:
X = df.values
else:
X = df.sparse.to_coo().tocsc()
# See comment below
X.eliminate_zeros()
is_sparse = True
else:
# dense DataFrame
X = df.values
is_sparse = False
support = _support(X, X.shape[0], is_sparse)
ary_col_idx = np.arange(X.shape[1])
if is_sparse:
# Count nonnull entries via direct access to X indices;
# this requires X to be stored in CSC format, and to call
# X.eliminate_zeros() to remove null entries from X.
support = np.array([X.indptr[idx+1] - X.indptr[idx]
for idx in range(X.shape[1])], dtype=int)
else:
# Faster than np.count_nonzero(X, axis=0) or np.sum(X, axis=0), why?
support = np.array([np.count_nonzero(X[:, idx])
for idx in range(X.shape[1])], dtype=int)
support = support / X.shape[0]
support_dict = {1: support[support >= min_support]}
itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]}
max_itemset = 1
rows_count = float(X.shape[0])

all_ones = np.ones((int(rows_count), 1))
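The 1-itemset supports above are read straight off the CSC index pointer array, which is why `X.eliminate_zeros()` is called right after conversion. A minimal standalone sketch of that idea on a toy matrix (illustrative only; the data, the column names, and the `np.diff` shorthand are not part of the diff):

```python
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

# Toy one-hot transaction matrix: 4 transactions x 3 items.
df = pd.DataFrame([[1, 0, 1],
                   [1, 1, 0],
                   [0, 0, 1],
                   [1, 0, 1]], columns=["apple", "beer", "milk"])

X = csc_matrix(df.values)
X.eliminate_zeros()  # ensure every stored entry is nonzero

# In CSC format, column j's stored entries live in indices[indptr[j]:indptr[j+1]],
# so the per-column support count is just the difference of consecutive index pointers.
counts = np.diff(X.indptr)        # array([3, 1, 3])
support = counts / X.shape[0]     # array([0.75, 0.25, 0.75])
print(dict(zip(df.columns, support)))
```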

while max_itemset and max_itemset < (max_len or float('inf')):
next_max_itemset = max_itemset + 1

# With exceptionally large datasets, the matrix operations can use a
# substantial amount of memory. For low memory applications or large
# datasets, set `low_memory=True` to use a slower but more memory-
# efficient implementation.
if low_memory:
combin = generate_new_combinations_low_memory(
itemset_dict[max_itemset], X, min_support, is_sparse)
# slightly faster than creating an array from a list of tuples
combin = np.fromiter(combin, dtype=int)
combin = combin.reshape(-1, next_max_itemset + 1)

if combin.size == 0:
break
if verbose:
print(
'\rProcessing %d combinations | Sampling itemset size %d' %
(combin.size, next_max_itemset), end="")

itemset_dict[next_max_itemset] = combin[:, 1:]
support_dict[next_max_itemset] = combin[:, 0].astype(float) \
/ rows_count
max_itemset = next_max_itemset
combin = generate_new_combinations(itemset_dict[max_itemset])
# count supports
frequent_itemsets = []
frequent_supports = []
processed = 0
if is_sparse:
count = np.empty(X.shape[0], dtype=int)
for itemset in combin:
processed += 1
count[:] = 0
for item in itemset:
count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
support = np.count_nonzero(count == len(itemset)) / X.shape[0]
if support >= min_support:
frequent_itemsets.append(itemset)
frequent_supports.append(support)
else:
combin = generate_new_combinations(itemset_dict[max_itemset])
combin = np.fromiter(combin, dtype=int)
combin = combin.reshape(-1, next_max_itemset)

if combin.size == 0:
break
if verbose:
print(
'\rProcessing %d combinations | Sampling itemset size %d' %
(combin.size, next_max_itemset), end="")

if is_sparse:
_bools = X[:, combin[:, 0]] == all_ones
for n in range(1, combin.shape[1]):
_bools = _bools & (X[:, combin[:, n]] == all_ones)
else:
_bools = np.all(X[:, combin], axis=2)

support = _support(np.array(_bools), rows_count, is_sparse)
_mask = (support >= min_support).reshape(-1)
if any(_mask):
itemset_dict[next_max_itemset] = np.array(combin[_mask])
support_dict[next_max_itemset] = np.array(support[_mask])
max_itemset = next_max_itemset
else:
# Exit condition
break
_bools = np.empty(X.shape[0], dtype=bool)
for itemset in combin:
processed += 1
_bools.fill(True)
for item in itemset:
np.logical_and(_bools, X[:, item], out=_bools)
support = np.count_nonzero(_bools) / X.shape[0]
if support >= min_support:
frequent_itemsets.append(itemset)
frequent_supports.append(support)
if not frequent_itemsets:
# Exit condition
break
if verbose:
print(
'\rProcessed %d combinations | Sampling itemset size %d' %
(processed, next_max_itemset), end="")
itemset_dict[next_max_itemset] = frequent_itemsets
support_dict[next_max_itemset] = frequent_supports
max_itemset = next_max_itemset
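For reference, a minimal standalone sketch of the per-candidate counting used above (illustrative only; `X_dense`, `X_csc`, and the candidate itemset are made up). The sparse branch counts, per row, how many of the candidate's columns store an entry and keeps rows where that count equals the itemset length; the dense branch accumulates a boolean AND over the candidate's columns.

```python
import numpy as np
from scipy.sparse import csc_matrix

X_dense = np.array([[1, 0, 1, 1],
                    [1, 1, 1, 0],
                    [0, 1, 1, 1],
                    [1, 0, 1, 1]], dtype=bool)
X_csc = csc_matrix(X_dense)
itemset = (0, 2, 3)  # hypothetical candidate from generate_new_combinations

# Sparse branch: count, per row, how many of the candidate's columns are set.
count = np.zeros(X_csc.shape[0], dtype=int)
for item in itemset:
    count[X_csc.indices[X_csc.indptr[item]:X_csc.indptr[item + 1]]] += 1
support_sparse = np.count_nonzero(count == len(itemset)) / X_csc.shape[0]

# Dense branch: accumulate a boolean AND over the candidate's columns.
bools = np.ones(X_dense.shape[0], dtype=bool)
for item in itemset:
    np.logical_and(bools, X_dense[:, item], out=bools)
support_dense = np.count_nonzero(bools) / X_dense.shape[0]

assert support_sparse == support_dense == 0.5  # rows 0 and 3 contain items 0, 2, and 3
```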

all_res = []
for k in sorted(itemset_dict):