[WIP] Implement apriori-gen as in original paper #646

Open · wants to merge 6 commits into base: master
275 changes: 85 additions & 190 deletions mlxtend/frequent_patterns/apriori.py
@@ -14,24 +14,24 @@ def generate_new_combinations(old_combinations):
Generator of all combinations based on the last state of Apriori algorithm
Parameters
-----------
old_combinations: np.array
old_combinations: list of tuples
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
Combinations are represented by a list of tuples.
All tuples have the same length, which is equal to the combination size
of the previous step.
Each row represents one combination
Each tuple represents one combination
and contains item type ids in the ascending order
```
0 1
0 15 20
1 15 22
2 17 19
15 20
15 22
17 19
```

Returns
-----------
Generator of all combinations from the last step x items
from the previous step.
Generator of combinations based on the last state of Apriori algorithm.
In order to reduce the number of candidates, this function implements the
apriori-gen function described in Section 2.1.1 of the Apriori paper.

Examples
-----------
@@ -40,95 +40,28 @@ def generate_new_combinations(old_combinations):

"""

items_types_in_previous_step = np.unique(old_combinations.flatten())
for old_combination in old_combinations:
max_combination = old_combination[-1]
mask = items_types_in_previous_step > max_combination
valid_items = items_types_in_previous_step[mask]
old_tuple = tuple(old_combination)
for item in valid_items:
yield from old_tuple
yield item


def generate_new_combinations_low_memory(old_combinations, X, min_support,
is_sparse):
"""
Generator of all combinations based on the last state of Apriori algorithm
Parameters
-----------
old_combinations: np.array
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
of the previous step.
Each row represents one combination
and contains item type ids in the ascending order
```
0 1
0 15 20
1 15 22
2 17 19
```

X: np.array or scipy sparse matrix
The allowed values are either 0/1 or True/False.
For example,

```
0 True False True True False True
1 True False True False False True
2 True False True False False False
3 True True False False False False
4 False False True True True True
5 False False True False True True
6 False False True False True False
7 True True False False False False
```

min_support : float (default: 0.5)
A float between 0 and 1 for minumum support of the itemsets returned.
The support is computed as the fraction
`transactions_where_item(s)_occur / total_transactions`.

is_sparse : bool True if X is sparse

Returns
-----------
Generator of all combinations from the last step x items
from the previous step. Every combination contains the
number of transactions where this item occurs, followed
by item type ids in the ascending order.
No combination other than generated
do not have a chance to get enough support

Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/

"""

items_types_in_previous_step = np.unique(old_combinations.flatten())
rows_count = X.shape[0]
threshold = min_support * rows_count
for old_combination in old_combinations:
max_combination = old_combination[-1]
mask = items_types_in_previous_step > max_combination
valid_items = items_types_in_previous_step[mask]
old_tuple = tuple(old_combination)
if is_sparse:
mask_rows = X[:, old_tuple].toarray().all(axis=1)
X_cols = X[:, valid_items].toarray()
supports = X_cols[mask_rows].sum(axis=0)
else:
mask_rows = X[:, old_tuple].all(axis=1)
supports = X[mask_rows][:, valid_items].sum(axis=0)
valid_indices = (supports >= threshold).nonzero()[0]
for index in valid_indices:
yield supports[index]
yield from old_tuple
yield valid_items[index]
    length = len(old_combinations)
    set_old_combinations = set(old_combinations)
    for i, old_combination in enumerate(old_combinations):
        head_i = list(old_combination[:-1])
        j = i + 1
        while j < length:
            *head_j, tail_j = old_combinations[j]
            if head_i != head_j:
                break
            # Prune old_combination + (tail_j,) if any of its subsets is not frequent
            candidate = old_combination + (tail_j,)
            # No need to check the last two (k-1)-subsets: they are
            # old_combinations[i] and old_combinations[j] themselves
            for idx in range(len(candidate) - 2):
                test_candidate = list(candidate)
                del test_candidate[idx]
                if tuple(test_candidate) not in set_old_combinations:
                    # Early exit from the for-loop skips the else clause just below
                    break
            else:
                yield candidate
            j = j + 1
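To make the join and prune steps concrete, here is a small illustrative run of the generator above (not part of the diff; the example itemsets are made up, and the import path assumes the patched module). `(1, 2)` and `(1, 3)` share the head `(1,)`, so they join into the candidate `(1, 2, 3)`, which survives pruning because its remaining 2-subset `(2, 3)` is also frequent; the candidates `(1, 2, 4)` and `(1, 3, 4)` are pruned because `(2, 4)` and `(3, 4)` are not in the frequent set.

```python
from mlxtend.frequent_patterns.apriori import generate_new_combinations

# Frequent 2-itemsets from the previous Apriori pass, as sorted tuples of item ids.
old_itemsets = [(1, 2), (1, 3), (1, 4), (2, 3)]

print(list(generate_new_combinations(old_itemsets)))
# [(1, 2, 3)]
```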


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
@@ -168,16 +101,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
possible itemsets lengths (under the apriori condition) are evaluated.

verbose : int (default: 0)
Shows the number of iterations if >= 1 and `low_memory` is `True`. If
>=1 and `low_memory` is `False`, shows the number of combinations.

low_memory : bool (default: False)
If `True`, uses an iterator to search for combinations above
`min_support`.
Note that while `low_memory=True` should only be used for large dataset
if memory resources are limited, because this implementation is approx.
3-6x slower than the default.

Shows the number of combinations if >= 1.

Returns
-----------
@@ -197,32 +121,6 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,

"""

def _support(_x, _n_rows, _is_sparse):
"""DRY private method to calculate support as the
row-wise sum of values / number of rows

Parameters
-----------

_x : matrix of bools or binary

_n_rows : numeric, number of rows in _x

_is_sparse : bool True if _x is sparse

Returns
-----------
np.array, shape = (n_rows, )

Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

"""
out = (np.sum(_x, axis=0) / _n_rows)
return np.array(out).reshape(-1)

if min_support <= 0.:
raise ValueError('`min_support` must be a positive '
'number within the interval `(0, 1]`. '
@@ -240,80 +138,77 @@ def _support(_x, _n_rows, _is_sparse):
X = df.values
else:
X = df.to_coo().tocsc()
# See comment below
X.eliminate_zeros()
is_sparse = True
elif hasattr(df, "sparse"):
# DataFrame with SparseArray (pandas >= 0.24)
if df.size == 0:
X = df.values
else:
X = df.sparse.to_coo().tocsc()
# See comment below
X.eliminate_zeros()
is_sparse = True
else:
# dense DataFrame
X = df.values
is_sparse = False
support = _support(X, X.shape[0], is_sparse)
ary_col_idx = np.arange(X.shape[1])
if is_sparse:
# Count nonnull entries via direct access to X indices;
# this requires X to be stored in CSC format, and to call
# X.eliminate_zeros() to remove null entries from X.
support = np.array([X.indptr[idx+1] - X.indptr[idx]
for idx in range(X.shape[1])], dtype=int)
else:
# Faster than np.count_nonzero(X, axis=0) or np.sum(X, axis=0), why?
support = np.array([np.count_nonzero(X[:, idx])
for idx in range(X.shape[1])], dtype=int)
support = support / X.shape[0]
support_dict = {1: support[support >= min_support]}
itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
itemset_dict = {1: [(idx,) for idx in np.where(support >= min_support)[0]]}
max_itemset = 1
rows_count = float(X.shape[0])

all_ones = np.ones((int(rows_count), 1))
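The 1-itemset supports above are read straight off the CSC index pointer array, which is why `X.eliminate_zeros()` is called right after conversion. A minimal standalone sketch of that idea on a toy matrix (illustrative only; the data, the column names, and the `np.diff` shorthand are not part of the diff):

```python
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

# Toy one-hot transaction matrix: 4 transactions x 3 items.
df = pd.DataFrame([[1, 0, 1],
                   [1, 1, 0],
                   [0, 0, 1],
                   [1, 0, 1]], columns=["apple", "beer", "milk"])

X = csc_matrix(df.values)
X.eliminate_zeros()  # ensure every stored entry is nonzero

# In CSC format, column j's stored entries live in indices[indptr[j]:indptr[j+1]],
# so the per-column support count is just the difference of consecutive index pointers.
counts = np.diff(X.indptr)        # array([3, 1, 3])
support = counts / X.shape[0]     # array([0.75, 0.25, 0.75])
print(dict(zip(df.columns, support)))
```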

while max_itemset and max_itemset < (max_len or float('inf')):
next_max_itemset = max_itemset + 1

# With exceptionally large datasets, the matrix operations can use a
# substantial amount of memory. For low memory applications or large
# datasets, set `low_memory=True` to use a slower but more memory-
# efficient implementation.
if low_memory:
combin = generate_new_combinations_low_memory(
itemset_dict[max_itemset], X, min_support, is_sparse)
# slightly faster than creating an array from a list of tuples
combin = np.fromiter(combin, dtype=int)
combin = combin.reshape(-1, next_max_itemset + 1)

if combin.size == 0:
break
if verbose:
print(
'\rProcessing %d combinations | Sampling itemset size %d' %
(combin.size, next_max_itemset), end="")

itemset_dict[next_max_itemset] = combin[:, 1:]
support_dict[next_max_itemset] = combin[:, 0].astype(float) \
/ rows_count
max_itemset = next_max_itemset
combin = generate_new_combinations(itemset_dict[max_itemset])
# count supports
frequent_itemsets = []
frequent_supports = []
processed = 0
if is_sparse:
count = np.empty(X.shape[0], dtype=int)
for itemset in combin:
processed += 1
count[:] = 0
for item in itemset:
count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
support = np.count_nonzero(count == len(itemset)) / X.shape[0]
if support >= min_support:
frequent_itemsets.append(itemset)
frequent_supports.append(support)
else:
combin = generate_new_combinations(itemset_dict[max_itemset])
combin = np.fromiter(combin, dtype=int)
combin = combin.reshape(-1, next_max_itemset)

if combin.size == 0:
break
if verbose:
print(
'\rProcessing %d combinations | Sampling itemset size %d' %
(combin.size, next_max_itemset), end="")

if is_sparse:
_bools = X[:, combin[:, 0]] == all_ones
for n in range(1, combin.shape[1]):
_bools = _bools & (X[:, combin[:, n]] == all_ones)
else:
_bools = np.all(X[:, combin], axis=2)

support = _support(np.array(_bools), rows_count, is_sparse)
_mask = (support >= min_support).reshape(-1)
if any(_mask):
itemset_dict[next_max_itemset] = np.array(combin[_mask])
support_dict[next_max_itemset] = np.array(support[_mask])
max_itemset = next_max_itemset
else:
# Exit condition
break
_bools = np.empty(X.shape[0], dtype=bool)
for itemset in combin:
processed += 1
_bools.fill(True)
for item in itemset:
np.logical_and(_bools, X[:, item], out=_bools)
support = np.count_nonzero(_bools) / X.shape[0]
if support >= min_support:
frequent_itemsets.append(itemset)
frequent_supports.append(support)
if not frequent_itemsets:
# Exit condition
break
if verbose:
print(
'\rProcessed %d combinations | Sampling itemset size %d' %
(processed, next_max_itemset), end="")
itemset_dict[next_max_itemset] = frequent_itemsets
support_dict[next_max_itemset] = frequent_supports
max_itemset = next_max_itemset
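For reference, a minimal standalone sketch of the per-candidate counting used above (illustrative only; `X_dense`, `X_csc`, and the candidate itemset are made up). The sparse branch counts, per row, how many of the candidate's columns store an entry and keeps rows where that count equals the itemset length; the dense branch accumulates a boolean AND over the candidate's columns.

```python
import numpy as np
from scipy.sparse import csc_matrix

X_dense = np.array([[1, 0, 1, 1],
                    [1, 1, 1, 0],
                    [0, 1, 1, 1],
                    [1, 0, 1, 1]], dtype=bool)
X_csc = csc_matrix(X_dense)
itemset = (0, 2, 3)  # hypothetical candidate from generate_new_combinations

# Sparse branch: count, per row, how many of the candidate's columns are set.
count = np.zeros(X_csc.shape[0], dtype=int)
for item in itemset:
    count[X_csc.indices[X_csc.indptr[item]:X_csc.indptr[item + 1]]] += 1
support_sparse = np.count_nonzero(count == len(itemset)) / X_csc.shape[0]

# Dense branch: accumulate a boolean AND over the candidate's columns.
bools = np.ones(X_dense.shape[0], dtype=bool)
for item in itemset:
    np.logical_and(bools, X_dense[:, item], out=bools)
support_dense = np.count_nonzero(bools) / X_dense.shape[0]

assert support_sparse == support_dense == 0.5  # rows 0 and 3 contain items 0, 2, and 3
```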

all_res = []
for k in sorted(itemset_dict):