# <center> 4. Association Rules

- Association rules functions
- Evaluation example

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
##### Jupyter notebook theme setup:
# Appears to be cosmetic only: none of the names defined here are used by the analysis cells.
# The `jt` command and the `jtplot` module both come from the `jupyterthemes` package:
# !pip install jupyterthemes
!jt -t gruvboxd -fs 95 -tfs 11 -nfs 115 -cellw 80% -T
##### Reset theme:
# !jt -r
##### Plot style:
# NOTE(review): presumably restyles matplotlib figures to match the notebook theme — confirm in the jupyterthemes docs.
from jupyterthemes import jtplot
jtplot.style()

# Reload page after cell evaluation

# Data Import

In [3]:
# header=None: the first line of the CSV is read as data, not as column names.
# NOTE(review): presumably each row is one transaction listing the purchased items — confirm against the data file.
df = pd.read_csv('data/store_data.csv', header=None)

In [4]:
# Reshape the raw rows into a (row x item) indicator table:
#   1. every row becomes the list of its distinct non-missing values,
#   2. the lists are exploded into one (row, item) pair per line and marked with 1,
#   3. the pairs are pivoted so that each item becomes its own column.
# Cells for items that do not occur in a row are left as NaN.
df = (
    df.apply(lambda row: list(set(row.dropna().tolist())), axis=1)
    .to_frame('items')
    .explode('items')
    .set_index('items', append=True)
    .assign(value=1)
    .unstack()
    .droplevel(0, axis=1)  # drop the helper 'value' level from the column index
)

In [5]:
# Indicator table of 0s and 1s: missing cells are replaced with 0.
one_zero_df = df.fillna(0)
# Column means of the table, ascending; the last ten rows are the ten largest.
one_zero_df.mean(axis=0).sort_values().iloc[-10:]

items
pancakes             0.095054
frozen vegetables    0.095321
ground beef          0.098254
milk                 0.129583
green tea            0.132116
chocolate            0.163845
french fries         0.170911
spaghetti            0.174110
eggs                 0.179709
mineral water        0.238368
dtype: float64

# Apriori algorithm

### Functions

In [6]:
# Support
def support(df: pd.DataFrame, A_list: list) -> float:
    """Return the share of rows of ``df`` in which every column in ``A_list`` is truthy.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one column per item; a truthy cell marks the item as
        present in that row.
    A_list : list
        Column labels forming the itemset. Any iterable of labels is accepted
        (list, tuple, set, frozenset).

    Returns
    -------
    float
        Mean of the row-wise "all listed columns are truthy" indicator.
    """
    # Normalise to a list before indexing: pandas treats a tuple as ONE column
    # key, and recent pandas versions reject a plain set as an indexer.
    columns = list(A_list)
    return df[columns].all(axis=1).mean()

# Confidence
def confidence(df: pd.DataFrame, A_list: list, B_list: list) -> float:
    """Confidence of the rule ``A_list => B_list``.

    Computed as support(A_list + B_list) / support(A_list), where ``+`` is
    list concatenation of the two itemsets.
    """
    joint_support = support(df, A_list + B_list)
    antecedent_support = support(df, A_list)
    return joint_support / antecedent_support

# Lift
def lift(df: pd.DataFrame, A_list: list, B_list: list) -> float:
    """Lift of the rule ``A_list => B_list``.

    Computed as confidence(A_list => B_list) / support(B_list).
    """
    rule_confidence = confidence(df, A_list, B_list)
    consequent_support = support(df, B_list)
    return rule_confidence / consequent_support

In [7]:
# K initialization: one-product itemsets
def initialize_sets(df: pd.DataFrame, products, t: float) -> frozenset:
    """Build the starting collection of single-product itemsets.

    Each product whose support is strictly greater than ``t`` is wrapped in
    its own one-element frozenset; the result is a frozenset of those.
    """
    singletons = set()
    for product in products:
        if support(df, [product]) > t:
            singletons.add(frozenset([product]))
    return frozenset(singletons)

In [8]:
# One generation step: grow every itemset by one product
def step_sets(df: pd.DataFrame, sets: frozenset, t: float) -> frozenset:
    """Extend each itemset in ``sets`` by one additional product.

    The candidate products are those that occur in at least one of the given
    itemsets. Every itemset is combined with every candidate it does not
    already contain, and only the resulting itemsets whose support is strictly
    greater than ``t`` are returned.
    """
    surviving = frozenset().union(*sets)
    candidates = {
        itemset.union([extra])
        for itemset in sets
        for extra in surviving.difference(itemset)
    }
    return frozenset(candidate for candidate in candidates if support(df, candidate) > t)

In [9]:
# Lift checker
def lift_satisf(df: pd.DataFrame, sets: frozenset, a: float, products: frozenset) -> set:
    """Collect the rules ``itemset => product`` whose lift is strictly greater than ``a``.

    Every itemset in ``sets`` is paired with every product of ``products``
    that it does not already contain. Each accepted rule is stored as
    ``((itemset, product), text)``, where ``text`` is a printable form of the rule.
    """
    return {
        ((itemset, candidate), '{0} => {1}'.format(list(itemset), candidate))
        for itemset in sets
        for candidate in products.difference(itemset)
        if lift(df, list(itemset), [candidate]) > a
    }

In [10]:
# Apriori algorithm
def apriori_algorithm(df: pd.DataFrame, t: float, a: float, products: frozenset) -> tuple:
    """Run the level-wise itemset search and collect the rules found at each level.

    Parameters
    ----------
    df : pd.DataFrame
        Indicator table passed through to the support/lift helpers.
    t : float
        Support threshold; itemsets are kept only while their support exceeds it.
    a : float
        Lift threshold; a rule is kept only if its lift exceeds it.
    products : frozenset
        All products considered, both for the initial one-product itemsets and
        as possible right-hand sides of rules.

    Returns
    -------
    tuple
        The distinct rules produced by ``lift_satisf`` (in arbitrary order),
        each of the form ``((itemset, product), text)``.
    """
    K = initialize_sets(df, products, t=t)
    rules = set()
    # Starting from len(K) one-product itemsets, an itemset can grow to at most
    # len(K) products, which takes len(K) - 1 growth steps. The previous bound,
    # range(2, len(K)), ran one step too few, so with exactly two frequent
    # products their pair was never examined.
    max_growth_steps = len(K) - 1
    for _ in range(max_growth_steps):
        K = step_sets(df, K, t=t)
        # Test emptiness by truth value; `K is frozenset()` compared object
        # identity, which is not a reliable emptiness test.
        if not K:
            break
        rules.update(lift_satisf(df, K, a=a, products=products))
    return tuple(rules)

### Constants


In [11]:
t = 0.025 # Support threshold: itemsets are kept only if their support is strictly greater than t
a = 1.5    # Lift threshold: a rule is kept only if its lift is strictly greater than a
products = frozenset(one_zero_df.columns) # frozenset of all products (every column of one_zero_df)

### Test

In [12]:
# Sanity check of the three metrics on one hand-picked rule:
# ['avocado', 'almonds'] => ['milk']
antecedent = ['avocado', 'almonds']
consequent = ['milk']
metrics = (
    support(one_zero_df, antecedent + consequent),
    confidence(one_zero_df, antecedent, consequent),
    lift(one_zero_df, antecedent, consequent),
)
print(*metrics, sep='\n')

0.0005332622317024396
0.3076923076923077
2.374485596707819


In [13]:
# Run the search with the thresholds defined above and list every rule found.
result = apriori_algorithm(one_zero_df, t=t, a=a, products=products)
for rule_no, (_, description) in enumerate(result):
    print('rule {0}: {1}'.format(rule_no, description))

rule 0: ['frozen vegetables', 'mineral water'] => asparagus
rule 1: ['pancakes', 'spaghetti'] => rice
rule 2: ['mineral water', 'milk'] => meatballs
rule 3: ['pancakes', 'spaghetti'] => milk
rule 4: ['spaghetti', 'chocolate'] => salmon
rule 5: ['spaghetti', 'eggs'] => bacon
rule 6: ['chocolate', 'milk'] => chicken
rule 7: ['mineral water', 'chocolate'] => chicken
rule 8: ['spaghetti', 'mineral water'] => tea
rule 9: ['spaghetti', 'mineral water'] => herb & pepper
rule 10: ['spaghetti', 'ground beef'] => hot dogs
rule 11: ['spaghetti', 'mineral water'] => tomato sauce
rule 12: ['spaghetti', 'milk'] => asparagus
rule 13: ['mineral water', 'eggs'] => mayonnaise
rule 14: ['olive oil', 'mineral water'] => ham
rule 15: ['french fries', 'mineral water'] => tomato sauce
rule 16: ['spaghetti', 'eggs'] => fresh tuna
rule 17: ['spaghetti', 'mineral water'] => protein bar
rule 18: ['pancakes', 'mineral water'] => grated cheese
rule 19: ['french fries', 'chocolate'] => oatmeal
rule 20: ['olive oil'

It is difficult to assess the whole result, but basic sets such as mineral water and spaghetti, and the products that "go well with them", can be distinguished.

It is also important to understand that the number of sets depends greatly on $t$ and $a$.

In [14]:
# Count how many rules each left-hand-side itemset produced, most frequent first.
Counter(itemset for (itemset, _), _ in result).most_common()

[(frozenset({'pancakes', 'spaghetti'}), 76),
 (frozenset({'french fries', 'spaghetti'}), 74),
 (frozenset({'frozen vegetables', 'spaghetti'}), 71),
 (frozenset({'mineral water', 'spaghetti'}), 70),
 (frozenset({'french fries', 'mineral water'}), 70),
 (frozenset({'milk', 'spaghetti'}), 69),
 (frozenset({'green tea', 'spaghetti'}), 69),
 (frozenset({'chocolate', 'spaghetti'}), 68),
 (frozenset({'frozen vegetables', 'mineral water'}), 67),
 (frozenset({'mineral water', 'pancakes'}), 67),
 (frozenset({'milk', 'mineral water'}), 65),
 (frozenset({'mineral water', 'olive oil'}), 64),
 (frozenset({'green tea', 'mineral water'}), 63),
 (frozenset({'ground beef', 'spaghetti'}), 62),
 (frozenset({'ground beef', 'mineral water'}), 60),
 (frozenset({'eggs', 'mineral water'}), 59),
 (frozenset({'eggs', 'spaghetti'}), 58),
 (frozenset({'chocolate', 'milk'}), 58),
 (frozenset({'french fries', 'green tea'}), 54),
 (frozenset({'eggs', 'milk'}), 54),
 (frozenset({'cake', 'mineral water'}), 53),
 (froze