From 0ebffa25f182cb13626d927b369dcd444d3fc3f0 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 16 Jun 2018 14:52:12 -0400 Subject: [PATCH 1/2] use frozensets in apriori --- mlxtend/frequent_patterns/apriori.py | 11 ++++-- .../frequent_patterns/association_rules.py | 16 ++++++-- .../frequent_patterns/tests/test_apriori.py | 19 ++++++++- .../tests/test_association_rules.py | 39 +++++++++++++++++++ 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py index 18e3c1fdd..ce840dcaa 100644 --- a/mlxtend/frequent_patterns/apriori.py +++ b/mlxtend/frequent_patterns/apriori.py @@ -88,6 +88,11 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None): pandas DataFrame with columns ['support', 'itemsets'] of all itemsets that are >= `min_support` and < than `max_len` (if `max_len` is not None). + Each itemset in the 'itemsets' column is of type `frozenset`, + which is a Python built-in type that behaves similarly to + sets except that it is immutable + (For more info, see + https://docs.python.org/3.6/library/stdtypes.html#frozenset). 
Examples ----------- @@ -130,7 +135,7 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None): all_res = [] for k in sorted(itemset_dict): support = pd.Series(support_dict[k]) - itemsets = pd.Series([set(i) for i in itemset_dict[k]]) + itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]]) res = pd.concat((support, itemsets), axis=1) all_res.append(res) @@ -139,8 +144,8 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None): res_df.columns = ['support', 'itemsets'] if use_colnames: mapping = {idx: item for idx, item in enumerate(df.columns)} - res_df['itemsets'] = res_df['itemsets'].apply(lambda x: set([mapping[i] - for i in x])) + res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([ + mapping[i] for i in x])) res_df = res_df.reset_index(drop=True) return res_df diff --git a/mlxtend/frequent_patterns/association_rules.py b/mlxtend/frequent_patterns/association_rules.py index d6cd17c14..c393151be 100644 --- a/mlxtend/frequent_patterns/association_rules.py +++ b/mlxtend/frequent_patterns/association_rules.py @@ -32,8 +32,10 @@ def association_rules(df, metric="confidence", min_threshold=0.8): - support(A->C) = support(A+C) [aka 'support'], range: [0, 1]\n - confidence(A->C) = support(A+C) / support(A), range: [0, 1]\n - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]\n - - leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]\n - - conviction = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]\n + - leverage(A->C) = support(A->C) - support(A)*support(C), + range: [-1, 1]\n + - conviction = [1 - support(C)] / [1 - confidence(A->C)], + range: [0, inf]\n min_threshold : float (default: 0.8) Minimal threshold for the evaluation metric @@ -41,12 +43,18 @@ def association_rules(df, metric="confidence", min_threshold=0.8): Returns ---------- - pandas DataFrame with columns "antecedent support", - "consequent support", + pandas DataFrame with columns "antecedants" and "consequents" + 
that store itemsets, plus the scoring metric columns: + "antecedent support", "consequent support", "support", "confidence", "lift", "leverage", "conviction" of all rules for which metric(rule) >= min_threshold. + Each entry in the "antecedants" and "consequents" columns is + of type `frozenset`, which is a Python built-in type that + behaves similarly to sets except that it is immutable + (For more info, see + https://docs.python.org/3.6/library/stdtypes.html#frozenset). Examples ----------- diff --git a/mlxtend/frequent_patterns/tests/test_apriori.py b/mlxtend/frequent_patterns/tests/test_apriori.py index 967345398..110fb1abb 100644 --- a/mlxtend/frequent_patterns/tests/test_apriori.py +++ b/mlxtend/frequent_patterns/tests/test_apriori.py @@ -57,8 +57,23 @@ def test_max_len(): def test_itemsets_type(): res_colindice = apriori(df, use_colnames=False) # This is default behavior for i in res_colindice['itemsets']: - assert isinstance(i, set) is True + assert isinstance(i, frozenset) is True res_colnames = apriori(df, use_colnames=True) for i in res_colnames['itemsets']: - assert isinstance(i, set) is True + assert isinstance(i, frozenset) is True + + +def test_frozenset_selection(): + res_df = apriori(df, use_colnames=True) + assert res_df.values.shape == (11, 2) + assert res_df[res_df['itemsets'] + == 'nothing'].values.shape == (0, 2) + assert res_df[res_df['itemsets'] + == {'Eggs', 'Kidney Beans'}].values.shape == (1, 2) + assert res_df[res_df['itemsets'] + == frozenset(('Eggs', 'Kidney Beans'))].values.shape\ + == (1, 2) + assert res_df[res_df['itemsets'] + == frozenset(('Kidney Beans', 'Eggs'))].values.shape\ + == (1, 2) diff --git a/mlxtend/frequent_patterns/tests/test_association_rules.py b/mlxtend/frequent_patterns/tests/test_association_rules.py index c41239a92..17aa75500 100644 --- a/mlxtend/frequent_patterns/tests/test_association_rules.py +++ b/mlxtend/frequent_patterns/tests/test_association_rules.py @@ -56,6 +56,29 @@ def test_default(): assert
res_df.equals(expect), res_df +def test_datatypes(): + res_df = association_rules(df_freq_items) + for i in res_df['antecedants']: + assert isinstance(i, frozenset) is True + + for i in res_df['consequents']: + assert isinstance(i, frozenset) is True + + # cast itemset-containing dataframe to set and + # check if association_rules converts it internally + # back to frozensets + df_freq_items_copy = df_freq_items.copy() + df_freq_items_copy['itemsets'] = df_freq_items_copy['itemsets']\ + .apply(lambda x: set(x)) + + res_df = association_rules(df_freq_items_copy) + for i in res_df['antecedants']: + assert isinstance(i, frozenset) is True + + for i in res_df['consequents']: + assert isinstance(i, frozenset) is True + + def test_no_support_col(): df_no_support_col = df_freq_items.loc[:, ['itemsets']] assert_raises(ValueError, association_rules, df_no_support_col) @@ -130,3 +153,19 @@ def test_confidence(): min_threshold=0.8, metric='confidence') assert res_df.values.shape[0] == 9 + + +def test_frozenset_selection(): + res_df = association_rules(df_freq_items) + + sel = res_df[res_df['consequents'] == frozenset((3, 5))] + assert sel.values.shape[0] == 1 + + sel = res_df[res_df['consequents'] == frozenset((5, 3))] + assert sel.values.shape[0] == 1 + + sel = res_df[res_df['consequents'] == {3, 5}] + assert sel.values.shape[0] == 1 + + sel = res_df[res_df['antecedants'] == frozenset((8, 3))] + assert sel.values.shape[0] == 1 From 203951f8281a4c9d23105ec35df9e4ef5f81a755 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 16 Jun 2018 17:46:48 -0400 Subject: [PATCH 2/2] update docs --- docs/sources/CHANGELOG.md | 2 +- .../frequent_patterns/apriori.ipynb | 283 ++++++++++++------ .../frequent_patterns/association_rules.ipynb | 243 ++++++++++----- 3 files changed, 361 insertions(+), 167 deletions(-) diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index c52e175f3..43f87beb2 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -26,7 +26,7 @@ The
CHANGELOG for the current development version is available at ##### Changes -- Itemsets generated with `apriori` are now sets ([#344](https://github.com/rasbt/mlxtend/issues/344) by [William Laney](https://github.com/WLaney)) +- Itemsets generated with `apriori` are now `frozenset`s ([#393](https://github.com/rasbt/mlxtend/issues/393) by [William Laney](https://github.com/WLaney) and [#394](https://github.com/rasbt/mlxtend/issues/394)) ##### Bug Fixes diff --git a/docs/sources/user_guide/frequent_patterns/apriori.ipynb b/docs/sources/user_guide/frequent_patterns/apriori.ipynb index 7ceb8525f..78d260701 100644 --- a/docs/sources/user_guide/frequent_patterns/apriori.ipynb +++ b/docs/sources/user_guide/frequent_patterns/apriori.ipynb @@ -48,7 +48,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 1" + "## Example 1 -- Generating Frequent Itemsets" ] }, { @@ -267,57 +267,57 @@ " \n", " 0\n", " 0.8\n", - " [3]\n", + " (3)\n", " \n", " \n", " 1\n", " 1.0\n", - " [5]\n", + " (5)\n", " \n", " \n", " 2\n", " 0.6\n", - " [6]\n", + " (6)\n", " \n", " \n", " 3\n", " 0.6\n", - " [8]\n", + " (8)\n", " \n", " \n", " 4\n", " 0.6\n", - " [10]\n", + " (10)\n", " \n", " \n", " 5\n", " 0.8\n", - " [3, 5]\n", + " (3, 5)\n", " \n", " \n", " 6\n", " 0.6\n", - " [3, 8]\n", + " (8, 3)\n", " \n", " \n", " 7\n", " 0.6\n", - " [5, 6]\n", + " (5, 6)\n", " \n", " \n", " 8\n", " 0.6\n", - " [5, 8]\n", + " (8, 5)\n", " \n", " \n", " 9\n", " 0.6\n", - " [5, 10]\n", + " (10, 5)\n", " \n", " \n", " 10\n", " 0.6\n", - " [3, 5, 8]\n", + " (8, 3, 5)\n", " \n", " \n", "\n", @@ -325,17 +325,17 @@ ], "text/plain": [ " support itemsets\n", - "0 0.8 [3]\n", - "1 1.0 [5]\n", - "2 0.6 [6]\n", - "3 0.6 [8]\n", - "4 0.6 [10]\n", - "5 0.8 [3, 5]\n", - "6 0.6 [3, 8]\n", - "7 0.6 [5, 6]\n", - "8 0.6 [5, 8]\n", - "9 0.6 [5, 10]\n", - "10 0.6 [3, 5, 8]" + "0 0.8 (3)\n", + "1 1.0 (5)\n", + "2 0.6 (6)\n", + "3 0.6 (8)\n", + "4 0.6 (10)\n", + "5 0.8 (3, 5)\n", + "6 0.6 (8, 3)\n", + "7 0.6 (5, 
6)\n", + "8 0.6 (8, 5)\n", + "9 0.6 (10, 5)\n", + "10 0.6 (8, 3, 5)" ] }, "execution_count": 3, @@ -390,57 +390,57 @@ " \n", " 0\n", " 0.8\n", - " [Eggs]\n", + " (Eggs)\n", " \n", " \n", " 1\n", " 1.0\n", - " [Kidney Beans]\n", + " (Kidney Beans)\n", " \n", " \n", " 2\n", " 0.6\n", - " [Milk]\n", + " (Milk)\n", " \n", " \n", " 3\n", " 0.6\n", - " [Onion]\n", + " (Onion)\n", " \n", " \n", " 4\n", " 0.6\n", - " [Yogurt]\n", + " (Yogurt)\n", " \n", " \n", " 5\n", " 0.8\n", - " [Eggs, Kidney Beans]\n", + " (Eggs, Kidney Beans)\n", " \n", " \n", " 6\n", " 0.6\n", - " [Eggs, Onion]\n", + " (Onion, Eggs)\n", " \n", " \n", " 7\n", " 0.6\n", - " [Kidney Beans, Milk]\n", + " (Milk, Kidney Beans)\n", " \n", " \n", " 8\n", " 0.6\n", - " [Kidney Beans, Onion]\n", + " (Onion, Kidney Beans)\n", " \n", " \n", " 9\n", " 0.6\n", - " [Kidney Beans, Yogurt]\n", + " (Kidney Beans, Yogurt)\n", " \n", " \n", " 10\n", " 0.6\n", - " [Eggs, Kidney Beans, Onion]\n", + " (Onion, Eggs, Kidney Beans)\n", " \n", " \n", "\n", @@ -448,17 +448,17 @@ ], "text/plain": [ " support itemsets\n", - "0 0.8 [Eggs]\n", - "1 1.0 [Kidney Beans]\n", - "2 0.6 [Milk]\n", - "3 0.6 [Onion]\n", - "4 0.6 [Yogurt]\n", - "5 0.8 [Eggs, Kidney Beans]\n", - "6 0.6 [Eggs, Onion]\n", - "7 0.6 [Kidney Beans, Milk]\n", - "8 0.6 [Kidney Beans, Onion]\n", - "9 0.6 [Kidney Beans, Yogurt]\n", - "10 0.6 [Eggs, Kidney Beans, Onion]" + "0 0.8 (Eggs)\n", + "1 1.0 (Kidney Beans)\n", + "2 0.6 (Milk)\n", + "3 0.6 (Onion)\n", + "4 0.6 (Yogurt)\n", + "5 0.8 (Eggs, Kidney Beans)\n", + "6 0.6 (Onion, Eggs)\n", + "7 0.6 (Milk, Kidney Beans)\n", + "8 0.6 (Onion, Kidney Beans)\n", + "9 0.6 (Kidney Beans, Yogurt)\n", + "10 0.6 (Onion, Eggs, Kidney Beans)" ] }, "execution_count": 4, @@ -474,7 +474,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 2" + "## Example 2 -- Selecting and Filtering Results" ] }, { @@ -519,67 +519,67 @@ " \n", " 0\n", " 0.8\n", - " [Eggs]\n", + " (Eggs)\n", " 1\n", " \n", " \n", " 1\n", " 1.0\n", 
- " [Kidney Beans]\n", + " (Kidney Beans)\n", " 1\n", " \n", " \n", " 2\n", " 0.6\n", - " [Milk]\n", + " (Milk)\n", " 1\n", " \n", " \n", " 3\n", " 0.6\n", - " [Onion]\n", + " (Onion)\n", " 1\n", " \n", " \n", " 4\n", " 0.6\n", - " [Yogurt]\n", + " (Yogurt)\n", " 1\n", " \n", " \n", " 5\n", " 0.8\n", - " [Eggs, Kidney Beans]\n", + " (Eggs, Kidney Beans)\n", " 2\n", " \n", " \n", " 6\n", " 0.6\n", - " [Eggs, Onion]\n", + " (Onion, Eggs)\n", " 2\n", " \n", " \n", " 7\n", " 0.6\n", - " [Kidney Beans, Milk]\n", + " (Milk, Kidney Beans)\n", " 2\n", " \n", " \n", " 8\n", " 0.6\n", - " [Kidney Beans, Onion]\n", + " (Onion, Kidney Beans)\n", " 2\n", " \n", " \n", " 9\n", " 0.6\n", - " [Kidney Beans, Yogurt]\n", + " (Kidney Beans, Yogurt)\n", " 2\n", " \n", " \n", " 10\n", " 0.6\n", - " [Eggs, Kidney Beans, Onion]\n", + " (Onion, Eggs, Kidney Beans)\n", " 3\n", " \n", " \n", @@ -588,17 +588,17 @@ ], "text/plain": [ " support itemsets length\n", - "0 0.8 [Eggs] 1\n", - "1 1.0 [Kidney Beans] 1\n", - "2 0.6 [Milk] 1\n", - "3 0.6 [Onion] 1\n", - "4 0.6 [Yogurt] 1\n", - "5 0.8 [Eggs, Kidney Beans] 2\n", - "6 0.6 [Eggs, Onion] 2\n", - "7 0.6 [Kidney Beans, Milk] 2\n", - "8 0.6 [Kidney Beans, Onion] 2\n", - "9 0.6 [Kidney Beans, Yogurt] 2\n", - "10 0.6 [Eggs, Kidney Beans, Onion] 3" + "0 0.8 (Eggs) 1\n", + "1 1.0 (Kidney Beans) 1\n", + "2 0.6 (Milk) 1\n", + "3 0.6 (Onion) 1\n", + "4 0.6 (Yogurt) 1\n", + "5 0.8 (Eggs, Kidney Beans) 2\n", + "6 0.6 (Onion, Eggs) 2\n", + "7 0.6 (Milk, Kidney Beans) 2\n", + "8 0.6 (Onion, Kidney Beans) 2\n", + "9 0.6 (Kidney Beans, Yogurt) 2\n", + "10 0.6 (Onion, Eggs, Kidney Beans) 3" ] }, "execution_count": 5, @@ -654,7 +654,7 @@ " \n", " 5\n", " 0.8\n", - " [Eggs, Kidney Beans]\n", + " (Eggs, Kidney Beans)\n", " 2\n", " \n", " \n", @@ -663,7 +663,7 @@ ], "text/plain": [ " support itemsets length\n", - "5 0.8 [Eggs, Kidney Beans] 2" + "5 0.8 (Eggs, Kidney Beans) 2" ] }, "execution_count": 6, @@ -680,7 +680,89 @@ "cell_type": "markdown", "metadata": 
{}, "source": [ - "## Example 3 - Sparse Representation" + "Similarly, using the Pandas API, we can select entries based on the \"itemsets\" column:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
supportitemsetslength
60.6(Onion, Eggs)2
\n", + "
" + ], + "text/plain": [ + " support itemsets length\n", + "6 0.6 (Onion, Eggs) 2" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frequent_itemsets[ frequent_itemsets['itemsets'] == {'Onion', 'Eggs'} ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Frozensets**\n", + "\n", + "Note that the entries in the \"itemsets\" column are of type `frozenset`, which is built-in Python type that is similar to a Python `set` but immutable, which makes it more efficient for certain query or comparison operations (https://docs.python.org/3.6/library/stdtypes.html#frozenset). Since `frozenset`s are sets, the item order does not matter. I.e., the query\n", + "\n", + "`frequent_itemsets[ frequent_itemsets['itemsets'] == {'Onion', 'Eggs'} ]`\n", + " \n", + "is equivalent to any of the following three\n", + "\n", + "- `frequent_itemsets[ frequent_itemsets['itemsets'] == {'Eggs', 'Onion'} ]`\n", + "- `frequent_itemsets[ frequent_itemsets['itemsets'] == frozenset(('Eggs', 'Onion')) ]`\n", + "- `frequent_itemsets[ frequent_itemsets['itemsets'] == frozenset(('Onion', 'Eggs')) ]`\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 3 -- Working with Sparse Representations" ] }, { @@ -693,7 +775,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -821,7 +903,7 @@ "4 False False " ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -834,7 +916,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -866,57 +948,57 @@ " \n", " 0\n", " 0.8\n", - " [Eggs]\n", + " (Eggs)\n", " \n", " \n", " 1\n", " 1.0\n", - " [Kidney Beans]\n", + " (Kidney Beans)\n", " \n", " \n", " 2\n", " 0.6\n", - " [Milk]\n", + " (Milk)\n", " \n", " \n", " 3\n", " 0.6\n", - " [Onion]\n", + " (Onion)\n", " \n", " 
\n", " 4\n", " 0.6\n", - " [Yogurt]\n", + " (Yogurt)\n", " \n", " \n", " 5\n", " 0.8\n", - " [Eggs, Kidney Beans]\n", + " (Eggs, Kidney Beans)\n", " \n", " \n", " 6\n", " 0.6\n", - " [Eggs, Onion]\n", + " (Onion, Eggs)\n", " \n", " \n", " 7\n", " 0.6\n", - " [Kidney Beans, Milk]\n", + " (Milk, Kidney Beans)\n", " \n", " \n", " 8\n", " 0.6\n", - " [Kidney Beans, Onion]\n", + " (Onion, Kidney Beans)\n", " \n", " \n", " 9\n", " 0.6\n", - " [Kidney Beans, Yogurt]\n", + " (Kidney Beans, Yogurt)\n", " \n", " \n", " 10\n", " 0.6\n", - " [Eggs, Kidney Beans, Onion]\n", + " (Onion, Eggs, Kidney Beans)\n", " \n", " \n", "\n", @@ -924,20 +1006,20 @@ ], "text/plain": [ " support itemsets\n", - "0 0.8 [Eggs]\n", - "1 1.0 [Kidney Beans]\n", - "2 0.6 [Milk]\n", - "3 0.6 [Onion]\n", - "4 0.6 [Yogurt]\n", - "5 0.8 [Eggs, Kidney Beans]\n", - "6 0.6 [Eggs, Onion]\n", - "7 0.6 [Kidney Beans, Milk]\n", - "8 0.6 [Kidney Beans, Onion]\n", - "9 0.6 [Kidney Beans, Yogurt]\n", - "10 0.6 [Eggs, Kidney Beans, Onion]" + "0 0.8 (Eggs)\n", + "1 1.0 (Kidney Beans)\n", + "2 0.6 (Milk)\n", + "3 0.6 (Onion)\n", + "4 0.6 (Yogurt)\n", + "5 0.8 (Eggs, Kidney Beans)\n", + "6 0.6 (Onion, Eggs)\n", + "7 0.6 (Milk, Kidney Beans)\n", + "8 0.6 (Onion, Kidney Beans)\n", + "9 0.6 (Kidney Beans, Yogurt)\n", + "10 0.6 (Onion, Eggs, Kidney Beans)" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -955,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1009,6 +1091,11 @@ "pandas DataFrame with columns ['support', 'itemsets'] of all itemsets\n", " that are >= `min_support` and < than `max_len`\n", " (if `max_len` is not None).\n", + " Each itemset in the 'itemsets' column is of type `frozenset`,\n", + " which is a Python built-in type that behaves similarly to\n", + " sets except that it is immutable\n", + " (For more info, see\n", + " 
https://docs.python.org/3.6/library/stdtypes.html#frozenset).\n", "\n", "**Examples**\n", "\n", diff --git a/docs/sources/user_guide/frequent_patterns/association_rules.ipynb b/docs/sources/user_guide/frequent_patterns/association_rules.ipynb index 5f174c454..80250dfd3 100644 --- a/docs/sources/user_guide/frequent_patterns/association_rules.ipynb +++ b/docs/sources/user_guide/frequent_patterns/association_rules.ipynb @@ -119,7 +119,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 1" + "## Example 1 -- Generating Association Rules from Frequent Itemsets" ] }, { @@ -165,57 +165,57 @@ " \n", " 0\n", " 0.8\n", - " [Eggs]\n", + " (Eggs)\n", " \n", " \n", " 1\n", " 1.0\n", - " [Kidney Beans]\n", + " (Kidney Beans)\n", " \n", " \n", " 2\n", " 0.6\n", - " [Milk]\n", + " (Milk)\n", " \n", " \n", " 3\n", " 0.6\n", - " [Onion]\n", + " (Onion)\n", " \n", " \n", " 4\n", " 0.6\n", - " [Yogurt]\n", + " (Yogurt)\n", " \n", " \n", " 5\n", " 0.8\n", - " [Eggs, Kidney Beans]\n", + " (Kidney Beans, Eggs)\n", " \n", " \n", " 6\n", " 0.6\n", - " [Eggs, Onion]\n", + " (Eggs, Onion)\n", " \n", " \n", " 7\n", " 0.6\n", - " [Kidney Beans, Milk]\n", + " (Milk, Kidney Beans)\n", " \n", " \n", " 8\n", " 0.6\n", - " [Kidney Beans, Onion]\n", + " (Kidney Beans, Onion)\n", " \n", " \n", " 9\n", " 0.6\n", - " [Kidney Beans, Yogurt]\n", + " (Yogurt, Kidney Beans)\n", " \n", " \n", " 10\n", " 0.6\n", - " [Eggs, Kidney Beans, Onion]\n", + " (Kidney Beans, Eggs, Onion)\n", " \n", " \n", "\n", @@ -223,17 +223,17 @@ ], "text/plain": [ " support itemsets\n", - "0 0.8 [Eggs]\n", - "1 1.0 [Kidney Beans]\n", - "2 0.6 [Milk]\n", - "3 0.6 [Onion]\n", - "4 0.6 [Yogurt]\n", - "5 0.8 [Eggs, Kidney Beans]\n", - "6 0.6 [Eggs, Onion]\n", - "7 0.6 [Kidney Beans, Milk]\n", - "8 0.6 [Kidney Beans, Onion]\n", - "9 0.6 [Kidney Beans, Yogurt]\n", - "10 0.6 [Eggs, Kidney Beans, Onion]" + "0 0.8 (Eggs)\n", + "1 1.0 (Kidney Beans)\n", + "2 0.6 (Milk)\n", + "3 0.6 (Onion)\n", + "4 0.6 (Yogurt)\n", + "5 
0.8 (Kidney Beans, Eggs)\n", + "6 0.6 (Eggs, Onion)\n", + "7 0.6 (Milk, Kidney Beans)\n", + "8 0.6 (Kidney Beans, Onion)\n", + "9 0.6 (Yogurt, Kidney Beans)\n", + "10 0.6 (Kidney Beans, Eggs, Onion)" ] }, "execution_count": 1, @@ -310,27 +310,27 @@ " \n", " \n", " 0\n", - " (Eggs)\n", " (Kidney Beans)\n", - " 0.8\n", + " (Eggs)\n", " 1.0\n", " 0.8\n", - " 1.00\n", + " 0.8\n", + " 0.80\n", " 1.00\n", " 0.00\n", - " inf\n", + " 1.000000\n", " \n", " \n", " 1\n", - " (Kidney Beans)\n", " (Eggs)\n", - " 1.0\n", + " (Kidney Beans)\n", " 0.8\n", + " 1.0\n", " 0.8\n", - " 0.80\n", + " 1.00\n", " 1.00\n", " 0.00\n", - " 1.000000\n", + " inf\n", " \n", " \n", " 2\n", @@ -394,7 +394,7 @@ " \n", " \n", " 7\n", - " (Eggs, Kidney Beans)\n", + " (Kidney Beans, Eggs)\n", " (Onion)\n", " 0.8\n", " 0.6\n", @@ -406,26 +406,26 @@ " \n", " \n", " 8\n", - " (Eggs, Onion)\n", - " (Kidney Beans)\n", + " (Kidney Beans, Onion)\n", + " (Eggs)\n", " 0.6\n", - " 1.0\n", + " 0.8\n", " 0.6\n", " 1.00\n", - " 1.00\n", - " 0.00\n", + " 1.25\n", + " 0.12\n", " inf\n", " \n", " \n", " 9\n", - " (Kidney Beans, Onion)\n", - " (Eggs)\n", + " (Eggs, Onion)\n", + " (Kidney Beans)\n", " 0.6\n", - " 0.8\n", + " 1.0\n", " 0.6\n", " 1.00\n", - " 1.25\n", - " 0.12\n", + " 1.00\n", + " 0.00\n", " inf\n", " \n", " \n", @@ -443,7 +443,7 @@ " \n", " 11\n", " (Onion)\n", - " (Eggs, Kidney Beans)\n", + " (Kidney Beans, Eggs)\n", " 0.6\n", " 0.8\n", " 0.6\n", @@ -458,30 +458,30 @@ ], "text/plain": [ " antecedants consequents antecedent support \\\n", - "0 (Eggs) (Kidney Beans) 0.8 \n", - "1 (Kidney Beans) (Eggs) 1.0 \n", + "0 (Kidney Beans) (Eggs) 1.0 \n", + "1 (Eggs) (Kidney Beans) 0.8 \n", "2 (Eggs) (Onion) 0.8 \n", "3 (Onion) (Eggs) 0.6 \n", "4 (Milk) (Kidney Beans) 0.6 \n", "5 (Onion) (Kidney Beans) 0.6 \n", "6 (Yogurt) (Kidney Beans) 0.6 \n", - "7 (Eggs, Kidney Beans) (Onion) 0.8 \n", - "8 (Eggs, Onion) (Kidney Beans) 0.6 \n", - "9 (Kidney Beans, Onion) (Eggs) 0.6 \n", + "7 (Kidney Beans, Eggs) (Onion) 0.8 
\n", + "8 (Kidney Beans, Onion) (Eggs) 0.6 \n", + "9 (Eggs, Onion) (Kidney Beans) 0.6 \n", "10 (Eggs) (Kidney Beans, Onion) 0.8 \n", - "11 (Onion) (Eggs, Kidney Beans) 0.6 \n", + "11 (Onion) (Kidney Beans, Eggs) 0.6 \n", "\n", " consequent support support confidence lift leverage conviction \n", - "0 1.0 0.8 1.00 1.00 0.00 inf \n", - "1 0.8 0.8 0.80 1.00 0.00 1.000000 \n", + "0 0.8 0.8 0.80 1.00 0.00 1.000000 \n", + "1 1.0 0.8 1.00 1.00 0.00 inf \n", "2 0.6 0.6 0.75 1.25 0.12 1.600000 \n", "3 0.8 0.6 1.00 1.25 0.12 inf \n", "4 1.0 0.6 1.00 1.00 0.00 inf \n", "5 1.0 0.6 1.00 1.00 0.00 inf \n", "6 1.0 0.6 1.00 1.00 0.00 inf \n", "7 0.6 0.6 0.75 1.25 0.12 1.600000 \n", - "8 1.0 0.6 1.00 1.00 0.00 inf \n", - "9 0.8 0.6 1.00 1.25 0.12 inf \n", + "8 0.8 0.6 1.00 1.25 0.12 inf \n", + "9 1.0 0.6 1.00 1.00 0.00 inf \n", "10 0.6 0.6 0.75 1.25 0.12 1.600000 \n", "11 0.8 0.6 1.00 1.25 0.12 inf " ] @@ -501,14 +501,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 2" + "## Example 2 -- Rule Generation and Selection Criteria" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If you are interested in rules fulfilling a different interest metric, you can simply adjust the parameters. E.g. if you are interested only in rules that have a lift score of >= 1.2, you would do the following:" + "If you are interested in rules according to a different metric of interest, you can simply adjust the `metric` and `min_threshold` arguments . E.g. 
if you are only interested in rules that have a lift score of >= 1.2, you would do the following:" ] }, { @@ -575,7 +575,7 @@ " \n", " \n", " 2\n", - " (Eggs, Kidney Beans)\n", + " (Kidney Beans, Eggs)\n", " (Onion)\n", " 0.8\n", " 0.6\n", @@ -612,7 +612,7 @@ " \n", " 5\n", " (Onion)\n", - " (Eggs, Kidney Beans)\n", + " (Kidney Beans, Eggs)\n", " 0.6\n", " 0.8\n", " 0.6\n", @@ -629,10 +629,10 @@ " antecedants consequents antecedent support \\\n", "0 (Eggs) (Onion) 0.8 \n", "1 (Onion) (Eggs) 0.6 \n", - "2 (Eggs, Kidney Beans) (Onion) 0.8 \n", + "2 (Kidney Beans, Eggs) (Onion) 0.8 \n", "3 (Kidney Beans, Onion) (Eggs) 0.6 \n", "4 (Eggs) (Kidney Beans, Onion) 0.8 \n", - "5 (Onion) (Eggs, Kidney Beans) 0.6 \n", + "5 (Onion) (Kidney Beans, Eggs) 0.6 \n", "\n", " consequent support support confidence lift leverage conviction \n", "0 0.6 0.6 0.75 1.25 0.12 1.600000 \n", @@ -733,7 +733,7 @@ " \n", " \n", " 2\n", - " (Eggs, Kidney Beans)\n", + " (Kidney Beans, Eggs)\n", " (Onion)\n", " 0.8\n", " 0.6\n", @@ -773,7 +773,7 @@ " \n", " 5\n", " (Onion)\n", - " (Eggs, Kidney Beans)\n", + " (Kidney Beans, Eggs)\n", " 0.6\n", " 0.8\n", " 0.6\n", @@ -791,10 +791,10 @@ " antecedants consequents antecedent support \\\n", "0 (Eggs) (Onion) 0.8 \n", "1 (Onion) (Eggs) 0.6 \n", - "2 (Eggs, Kidney Beans) (Onion) 0.8 \n", + "2 (Kidney Beans, Eggs) (Onion) 0.8 \n", "3 (Kidney Beans, Onion) (Eggs) 0.6 \n", "4 (Eggs) (Kidney Beans, Onion) 0.8 \n", - "5 (Onion) (Eggs, Kidney Beans) 0.6 \n", + "5 (Onion) (Kidney Beans, Eggs) 0.6 \n", "\n", " consequent support support confidence lift leverage conviction \\\n", "0 0.6 0.6 0.75 1.25 0.12 1.600000 \n", @@ -905,6 +905,105 @@ " (rules['lift'] > 1.2) ]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similarly, using the Pandas API, we can select entries based on the \"antecedants\" or \"consequents\" columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
antecedantsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconvictionantecedant_len
2(Kidney Beans, Eggs)(Onion)0.80.60.60.751.250.121.62
\n", + "
" + ], + "text/plain": [ + " antecedants consequents antecedent support consequent support \\\n", + "2 (Kidney Beans, Eggs) (Onion) 0.8 0.6 \n", + "\n", + " support confidence lift leverage conviction antecedant_len \n", + "2 0.6 0.75 1.25 0.12 1.6 2 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rules[rules['antecedants'] == {'Eggs', 'Kidney Beans'}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Frozensets**\n", + "\n", + "Note that the entries in the \"antecedants\" and \"consequents\" columns are of type `frozenset`, which is a built-in Python type that is similar to a Python `set` but immutable, which makes it more efficient for certain query or comparison operations (https://docs.python.org/3.6/library/stdtypes.html#frozenset). Since `frozenset`s are sets, the item order does not matter. I.e., the query\n", + "\n", + "`rules[rules['antecedants'] == {'Eggs', 'Kidney Beans'}]`\n", + " \n", + "is equivalent to any of the following three\n", + "\n", + "- `rules[rules['antecedants'] == {'Kidney Beans', 'Eggs'}]`\n", + "- `rules[rules['antecedants'] == frozenset(('Eggs', 'Kidney Beans'))]`\n", + "- `rules[rules['antecedants'] == frozenset(('Kidney Beans', 'Eggs'))]`\n", + "\n", + "\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -914,7 +1013,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -949,9 +1048,11 @@ "\n", " - lift(A->C) = confidence(A->C) / support(C), range: [0, inf]\n", "\n", - " - leverage(A->C) = support(A->C) - support(A)*support(C), range: [-1, 1]\n", + " - leverage(A->C) = support(A->C) - support(A)*support(C),\n", + " range: [-1, 1]\n", "\n", - " - conviction = [1 - support(C)] / [1 - confidence(A->C)], range: [0, inf]\n", + " - conviction = [1 - support(C)] / [1 - confidence(A->C)],\n", + " range: [0, inf]\n", "\n", "\n", "\n", @@ -962,12 +1063,18 @@ "\n", "**Returns**\n", "\n", - "pandas DataFrame with
columns \"antecedent support\",\n", - " \"consequent support\",\n", + "pandas DataFrame with columns \"antecedants\" and \"consequents\"\n", + " that store itemsets, plus the scoring metric columns:\n", + " \"antecedent support\", \"consequent support\",\n", " \"support\", \"confidence\", \"lift\",\n", " \"leverage\", \"conviction\"\n", " of all rules for which\n", " metric(rule) >= min_threshold.\n", + " Each entry in the \"antecedants\" and \"consequents\" columns is\n", + " of type `frozenset`, which is a Python built-in type that\n", + " behaves similarly to sets except that it is immutable\n", + " (For more info, see\n", + " https://docs.python.org/3.6/library/stdtypes.html#frozenset).\n", "\n", "**Examples**\n", "\n",