In [None]:
# install the library for association rule mining

In [2]:
!pip install apyori

Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25ldone
[?25h  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5955 sha256=d93dfa12ec8d9760748fcea2fe827326dcbc60e5375ee29e1e7bd046fea4959b
  Stored in directory: /Users/taekholee/Library/Caches/pip/wheels/32/2a/54/10c595515f385f3726642b10c60bf788029e8f3a1323e3913a
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [None]:
# import libraries

In [11]:
from apyori import apriori
from pprint import pprint  # this will help pretty printing

import numpy as np
import pandas as pd

In [9]:
# my example transaction dataset

In [12]:
# create example data for association rule mining
trans = [['b', 'c', 'g'],
        ['a', 'b', 'd', 'e', 'f'],
        ['a', 'b', 'c', 'g'],
        ['b', 'c', 'e', 'f'],
        ['b', 'c', 'e', 'f', 'g']]

pprint(trans)

[['b', 'c', 'g'],
 ['a', 'b', 'd', 'e', 'f'],
 ['a', 'b', 'c', 'g'],
 ['b', 'c', 'e', 'f'],
 ['b', 'c', 'e', 'f', 'g']]


In [14]:
# compare the print result.
print(trans)

[['b', 'c', 'g'], ['a', 'b', 'd', 'e', 'f'], ['a', 'b', 'c', 'g'], ['b', 'c', 'e', 'f'], ['b', 'c', 'e', 'f', 'g']]


In [16]:
# here is the documentation of apriori

#     Executes Apriori algorithm and returns a RelationRecord generator.

#     Arguments:
#         transactions -- A transaction iterable object
#                         (eg. [['A', 'B'], ['B', 'C']]).

#     Keyword arguments:
#         min_support -- The minimum support of relations (float).
#         min_confidence -- The minimum confidence of relations (float).
#         min_lift -- The minimum lift of relations (float).
#         max_length -- The maximum length of the relation (integer).

In [17]:
# Default values for four parameters are 0.1, 0.0, 0.0, and None in order.

In [18]:
min_supp = 0.5
min_conf = 0.6
min_lift = 1.01

rules = apriori(trans, min_support=min_supp, min_confidence=min_conf, min_lift=min_lift)
results = list(rules)  # to get the results, we need to make it as list.
print(results[0])

RelationRecord(items=frozenset({'c', 'g'}), support=0.6, ordered_statistics=[OrderedStatistic(items_base=frozenset({'c'}), items_add=frozenset({'g'}), confidence=0.7499999999999999, lift=1.2499999999999998), OrderedStatistic(items_base=frozenset({'g'}), items_add=frozenset({'c'}), confidence=1.0, lift=1.25)])


In [24]:
type(results[0].ordered_statistics)

list

In [26]:
# Each element of results is about each frequent itemset.

# If we look the results[0] in detail, it has three elements: items, support, and ordered_statistics.

# 'items' is one of the discovered frequent itemsets.
# 'support' is the relative frequency of the itemset.

# For each frequent itemset, we need to split the itemset into conditions and results in order to 
# extract rules (condition -> result). We can extract several candidate rules from each itemset, 
# 'ordered_statistics' is list of extracted rules that satisfy our pre-specified parameters
# (such as minimum confidence, minimum lift, and so on).

# each element of 'ordered_statistics' consists of four rule information: condition of the rule (items_base), 
# result of the rule (items_add), confidence of the rule (confidence), and lift of the rule (lift).

# let's see for the first element, results[0].s

In [28]:
line = results[0]
itemset = line.items  # frequent itemset corresponding to 'results[0]'.
print(itemset)

n_sub = len(line.ordered_statistics)
print("# rules:", n_sub)  # there are 'n_sub' rules extracted from the frequent itemset 'results[0]'.

for sub_rule_i in range(n_sub):
    # get and print details for each rule from 'results[0]'.
    
    a = line.ordered_statistics[sub_rule_i].items_base  # condition of the rule
    b = line.ordered_statistics[sub_rule_i].items_add  # result of the rule
    supp = line.support  # support of the frequent itemset (not specific to 'sub_rule_i')
    conf = line.ordered_statistics[sub_rule_i].confidence  # confidence of the rule
    lift = line.ordered_statistics[sub_rule_i].lift  # lift of the rule

    print(f"{a} -> {b} | support: {supp:.3f} | confidence: {conf:.3f} | lift: {lift:.3f}")

frozenset({'c', 'g'})
# rules: 2
frozenset({'c'}) -> frozenset({'g'}) | support: 0.600 | confidence: 0.750 | lift: 1.250
frozenset({'g'}) -> frozenset({'c'}) | support: 0.600 | confidence: 1.000 | lift: 1.250


In [29]:
# make the above code that prints rules as function.

In [36]:
def print_rule(line):
    print("="*30)
    freq_itemset = line.items
    print("Frequent itemset:", set(freq_itemset))

    n_rule = len(line.ordered_statistics)
    print("The number of rules:", n_rule)

    for i in range(n_rule):
        a = line.ordered_statistics[i].items_base
        b = line.ordered_statistics[i].items_add
        supp = line.support
        conf = line.ordered_statistics[i].confidence
        lift = line.ordered_statistics[i].lift
    
        print(f"{a} -> {b} | support: {supp:.3f} | confidence: {conf:.3f} | lift: {lift:.3f}")
    
    return

In [37]:
for line in results:
    print_rule(line)

Frequent itemset: {'c', 'g'}
The number of rules: 2
frozenset({'c'}) -> frozenset({'g'}) | support: 0.600 | confidence: 0.750 | lift: 1.250
frozenset({'g'}) -> frozenset({'c'}) | support: 0.600 | confidence: 1.000 | lift: 1.250
Frequent itemset: {'f', 'e'}
The number of rules: 2
frozenset({'e'}) -> frozenset({'f'}) | support: 0.600 | confidence: 1.000 | lift: 1.667
frozenset({'f'}) -> frozenset({'e'}) | support: 0.600 | confidence: 1.000 | lift: 1.667
Frequent itemset: {'g', 'c', 'b'}
The number of rules: 4
frozenset({'c'}) -> frozenset({'b', 'g'}) | support: 0.600 | confidence: 0.750 | lift: 1.250
frozenset({'g'}) -> frozenset({'c', 'b'}) | support: 0.600 | confidence: 1.000 | lift: 1.250
frozenset({'c', 'b'}) -> frozenset({'g'}) | support: 0.600 | confidence: 0.750 | lift: 1.250
frozenset({'g', 'b'}) -> frozenset({'c'}) | support: 0.600 | confidence: 1.000 | lift: 1.250
Frequent itemset: {'f', 'e', 'b'}
The number of rules: 4
frozenset({'e'}) -> frozenset({'f', 'b'}) | support: 0.600

In [38]:
# apply this to real-world datasets.

In [42]:
# load 100,000 rows for fast practice.
df = pd.read_csv("./datasets/order_products__train.csv", nrows=100_000)

In [43]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [44]:
# this dataset includes order id and product id in an order of their addition to the cart. 
# It also includes information on whether it is reordered or not.

# for association rule mining, we need to convert this dataset into forms of transaction data.
# Transaction data is a list of transactions where each transaction is a list of products.

# Therefore, we need to group product_id for the same order_id.

In [45]:
# method 1. use groupby provided in pandas

In [None]:
# to use groupby, we need a function that takes dataframe consisting of the same order_id as an input
# and outputs list of product_id.

In [46]:
# let's take an example with order_id is 1.
sub_df = df[df["order_id"] == 1]
sub_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
5,1,13176,6,0
6,1,47209,7,0
7,1,22035,8,1


In [47]:
# make list of product_id
sub_df["product_id"].tolist()

[49302, 11109, 10246, 49683, 43633, 13176, 47209, 22035]

In [48]:
def make_product_list(df_order):
    """
    Args:
        df_order: DataFrame
            sub-group of the whole dataset with the same order_id.
    
    Notes:
        Here, the input 'df_order' is corresponds to the 'sub_df' in the above example.
    """
    return df_order["product_id"].tolist()

In [53]:
transactions = df.groupby(by="order_id").apply(make_product_list, include_groups=False)

In [54]:
transactions

order_id
1         [49302, 11109, 10246, 49683, 43633, 13176, 472...
36        [39612, 19660, 49235, 43086, 46620, 34497, 486...
38        [11913, 18159, 4461, 21616, 23622, 32433, 2884...
96        [20574, 30391, 40706, 25610, 27966, 24489, 39275]
98        [8859, 19731, 43654, 13176, 4357, 37664, 34065...
                                ...                        
246741     [24852, 27307, 49628, 42736, 8239, 28934, 13409]
246768    [34049, 24830, 43961, 21137, 9484, 42093, 5456...
246788    [44623, 34739, 35669, 3358, 45548, 19133, 4174...
246796    [11422, 21903, 29600, 15934, 8532, 7877, 4853,...
246834            [33000, 13176, 27104, 6046, 35951, 24810]
Length: 9477, dtype: object

In [56]:
# finally, we can make list of transactions as below.
trans_list = transactions.tolist()

In [58]:
# method 2. sort-based

In [59]:
# this sort-based transformation is manually designed for reducing computations.
# since groupby is split-apply-combine approach is somewhat complex.

In [None]:
# the concept is simple. It takes sort and split approach.

In [60]:
# at first, sort the dataframe by order_id to gather the information of the same order.
sorted_df = df.sort_values(by="order_id")
sorted_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
99996,246834,27104,3,1
99998,246834,35951,5,1
99994,246834,33000,1,1
99995,246834,13176,2,1


In [61]:
# to split different orders we need to capture the rows where order_id changes.

In [62]:
order_ids = sorted_df["order_id"].to_numpy()

In [66]:
# find rows where their next order_id is not same.
# their location indicate the last position of each order_id.
last_indexes = np.where(order_ids[:-1] != order_ids[1:])[0]
# next location of last indexes are the start locations of next order.
start_indexes = last_indexes + 1

# now we need to add missing indexes.
# the first row in the overall sorted dataframe is also the start index.
start_indexes = np.append(0, start_indexes)

# also, the last row in the overall sorted dataframe is also the last index.
last_indexes = np.append(last_indexes, sorted_df.shape[0] - 1)

In [70]:
# extract products between each start_index and last index.
product_ids = df["product_id"].tolist()

# list comprehension. (if this is not familiar, make an empty list and append each transaction by iterating for loop.)
# here, 'zip' groups and returns start_indexes and last_indexes.
trans_list_v2 = [
    product_ids[start_index:last_index + 1] for start_index, last_index in zip(start_indexes, last_indexes)
]

In [72]:
trans_list_v2[:3]

[[49302, 11109, 10246, 49683, 43633, 13176, 47209, 22035],
 [39612, 19660, 49235, 43086, 46620, 34497, 48679, 46979],
 [11913, 18159, 4461, 21616, 23622, 32433, 28842, 42625, 39693]]

In [73]:
# after this data format transformation, we can apply apriori directly.

In [76]:
list(apriori(trans_list, min_support=0.5, min_conf=0.5))

[]

In [78]:
# the reason for this trivial result is inappropriate parameters.
# we need to set proper values depending on the dataset used.

In [82]:
# to find proper values, we need to investigate the transaction dataset.
# we can use 'Counter' to count the number of products in transaction data.

In [80]:
from collections import Counter

product_counter = Counter()
for transaction in trans_list:
    product_counter.update(transaction)

In [84]:
pprint(product_counter.most_common(30))
# the method 'most_common' of Counter object returns tuples of (product_id, its count).

[(24852, 1338),
 (13176, 1130),
 (21137, 776),
 (21903, 705),
 (47626, 594),
 (47766, 555),
 (47209, 508),
 (16797, 470),
 (26209, 416),
 (27966, 388),
 (27845, 368),
 (30391, 352),
 (39275, 336),
 (45007, 325),
 (40706, 305),
 (4920, 303),
 (24964, 300),
 (22935, 300),
 (8518, 289),
 (45066, 278),
 (42265, 276),
 (44632, 274),
 (46979, 270),
 (5876, 270),
 (31717, 268),
 (4605, 245),
 (43352, 239),
 (19057, 233),
 (28204, 230),
 (5450, 224)]


In [None]:
# see the largest support in this transaction dataset.

In [85]:
1338 / len(trans_list)

0.14118391896169674

In [86]:
# as you can see here, the range of effective support to search frequent itemsets varies across datasets.

In [87]:
for product_id, product_count in product_counter.most_common(30):
    print(product_count / len(trans_list))

0.14118391896169674
0.11923604516197109
0.08188245225282262
0.0743906299461855
0.06267806267806268
0.05856283634061412
0.053603461010868415
0.049593753297457
0.0438957475994513
0.04094122612641131
0.03883085364566846
0.03714255566107418
0.035454257676479896
0.03429355281207133
0.03218318033132848
0.0319721430832542
0.03165558721114277
0.03165558721114277
0.030494882346734198
0.02933417748232563
0.029123140234251346
0.02891210298617706
0.02849002849002849
0.02849002849002849
0.028278991241954207
0.025852062889099928
0.025218951144877072
0.024585839400654216
0.02426928352854279
0.023636171784319933


In [88]:
# to encompass many key items, I set the smaller support.
min_supp = 0.001
min_conf = 0.6
min_lift = 1.01

In [89]:
rules = apriori(trans_list, min_support=min_supp, min_confidence=min_conf, min_lift=min_lift)
results = list(rules)

NameError: name 'reulsts' is not defined

In [90]:
for row in results:
    print_rule(row)

Frequent itemset: {9203, 8710}
The number of rules: 1
frozenset({8710}) -> frozenset({9203}) | support: 0.001 | confidence: 0.600 | lift: 270.771
Frequent itemset: {15842, 9203}
The number of rules: 1
frozenset({15842}) -> frozenset({9203}) | support: 0.001 | confidence: 0.667 | lift: 300.857
Frequent itemset: {15984, 48220}
The number of rules: 2
frozenset({15984}) -> frozenset({48220}) | support: 0.001 | confidence: 0.667 | lift: 394.875
frozenset({48220}) -> frozenset({15984}) | support: 0.001 | confidence: 0.625 | lift: 394.875
Frequent itemset: {24852, 4605, 16797}
The number of rules: 1
frozenset({4605, 16797}) -> frozenset({24852}) | support: 0.001 | confidence: 0.722 | lift: 5.115
Frequent itemset: {13176, 40706, 5876}
The number of rules: 1
frozenset({40706, 5876}) -> frozenset({13176}) | support: 0.001 | confidence: 0.632 | lift: 5.297
Frequent itemset: {13176, 21137, 7948}
The number of rules: 1
frozenset({21137, 7948}) -> frozenset({13176}) | support: 0.001 | confidence: 0.

In [91]:
# another dataset.

In [92]:
df = pd.read_csv("./datasets/random_shopping_cart.csv", names=["date", "id", "item"])
df.head()

Unnamed: 0,date,id,item
0,2000-01-01,1,yogurt
1,2000-01-01,1,pork
2,2000-01-01,1,sandwich bags
3,2000-01-01,1,lunch meat
4,2000-01-01,1,all- purpose


In [93]:
# you can practice the same transformation process and application of apriori algorithm for this dataset.