In [1]:
import collections
import itertools

# 1

In [2]:
class Apriori:
    """Apriori miner for frequent itemsets and association rules.

    Transactions are stored as sets. Every support value is relative: the
    fraction of stored transactions that contain the given itemset.

    Usage: construct with the transactions, call ``find_frequent_sets`` and
    then iterate over ``gen_association_rules``.
    """

    # Frequent itemsets (each a list of items) found by the last call to
    # find_frequent_sets(); stays None until that method has been run.
    frequent_sets = None

    def __init__(self, data):
        """Store the transactions.

        data - collection of transactions; each transaction is an iterable
        of hashable items and is converted to a set, so repeated items
        inside one transaction collapse.
        """
        self.data = [set(tx) for tx in data]
        self.n = len(self.data)

    def find_frequent_sets(self, min_supp):
        """Fill ``self.frequent_sets`` with all itemsets of support >= min_supp.

        Works level by level: frequent single items first, then each level
        is joined into candidates one item larger until a level is empty.
        Returns None; the result is stored on the instance.
        """
        curr_sets = self._get_single_item_sets(min_supp)
        # curr_sets is rebound (never mutated) inside the loop, so sharing
        # the first-level list with frequent_sets is safe.
        self.frequent_sets = curr_sets
        while curr_sets:
            curr_sets = self._join_sets(curr_sets, min_supp)
            self.frequent_sets.extend(curr_sets)

    def _get_single_item_sets(self, min_supp):
        """Return the frequent 1-itemsets as one-element lists.

        Items come out in first-seen order; that order is the item ordering
        the join step relies on when it compares prefixes.
        """
        # Counter keeps insertion order, matching a manual counting loop.
        item_counts = collections.Counter(
            prod for tx in self.data for prod in tx
        )
        return [
            [prod] for prod, count in item_counts.items()
            if count / self.n >= min_supp
        ]

    def _join_sets(self, sets, min_supp):
        """Build the frequent (k+1)-itemsets from the frequent k-itemsets.

        Two itemsets are joined when they agree on everything but their last
        item; the candidate is kept if its support reaches min_supp.
        """
        new_sets = []
        for i, lhs in enumerate(sets):
            for rhs in sets[i + 1:]:
                if lhs[:-1] == rhs[:-1]:
                    cand = lhs + rhs[-1:]
                    if self.supp(set(cand)) >= min_supp:
                        new_sets.append(cand)
        return new_sets

    def supp(self, set_):
        """Return the fraction of transactions containing every item of set_.

        set_ must be a set (it is compared with ``<=`` against each
        transaction). With no stored transactions the support is 0.0
        instead of a division by zero.
        """
        if not self.n:
            return 0.0
        return sum(set_ <= tx for tx in self.data) / self.n

    def gen_association_rules(self, min_conf):
        """Return a lazy iterator of (lhs, rhs) rules with confidence >= min_conf.

        Rules are derived from ``self.frequent_sets``.

        Raises RuntimeError if find_frequent_sets() has not been called yet.
        """
        if self.frequent_sets is None:
            raise RuntimeError(
                'call find_frequent_sets() before gen_association_rules()'
            )
        return itertools.chain.from_iterable(
            self._find_rules_of(set_, min_conf)
            for set_ in self.frequent_sets
        )

    def _find_rules_of(self, set_, min_conf):
        """Lazily yield the (lhs, rhs) splits of set_ that reach min_conf."""
        return (
            (lhs, rhs) for lhs, rhs in self._partitions(set_)
            if self.confidence(lhs, rhs) >= min_conf
        )

    def _partitions(self, set_):
        """Lazily yield every split of set_ into two non-empty sets (lhs, rhs).

        Both directions of a split are produced, since lhs ranges over all
        proper non-empty subsets. A one-item set yields nothing.
        """
        set_ = set(set_)
        # All subsets except the empty set and the full set.
        kinda_powerset = itertools.chain.from_iterable(
            itertools.combinations(set_, n) for n in range(1, len(set_))
        )
        return ((set(lhs), set_ - set(lhs)) for lhs in kinda_powerset)

    def confidence(self, lhs, rhs):
        """Return supp(lhs | rhs) / supp(lhs); 0.0 when lhs never occurs."""
        lhs_supp = self.supp(lhs)
        if lhs_supp == 0:
            return 0.0
        return self.supp(lhs | rhs) / lhs_supp

    def lift(self, lhs, rhs):
        """Return confidence(lhs, rhs) / supp(rhs); 0.0 when rhs never occurs."""
        rhs_supp = self.supp(rhs)
        if rhs_supp == 0:
            return 0.0
        return self.confidence(lhs, rhs) / rhs_supp

    def leverage(self, lhs, rhs):
        """Return supp(lhs | rhs) - supp(lhs) * supp(rhs)."""
        return self.supp(lhs | rhs) - self.supp(lhs) * self.supp(rhs)

In [3]:
def test(data, min_supp, min_conf):
    """Mine `data` with Apriori and print every rule with its metrics.

    A header line is printed first, then one line per rule showing the rule,
    its confidence, lift and leverage (two decimals each).

    Returns a tuple (miner, rules): the Apriori instance and the list of
    (lhs, rhs) rules in the order they were generated.
    """
    miner = Apriori(data)
    miner.find_frequent_sets(min_supp)

    print('RULE CONFIDENCE LIFT LEVERAGE')
    found = []
    for antecedent, consequent in miner.gen_association_rules(min_conf):
        found.append((antecedent, consequent))
        conf = miner.confidence(antecedent, consequent)
        lift = miner.lift(antecedent, consequent)
        lev = miner.leverage(antecedent, consequent)
        print(f'{antecedent} -> {consequent} {conf:.2f} {lift:.2f} {lev:.2f}')
    return miner, found

In [4]:
# Toy transaction database taken from the Wikipedia article on Apriori.
wiki_data = [[1, 2, 3, 4], [1, 2, 4], [1, 2], [2, 3, 4], [2, 3], [3, 4], [2, 4]]

a1, _ = test(wiki_data, min_supp=3 / 7, min_conf=0.5)
# Sanity check: the frequent itemsets at minimum support 3/7.
assert a1.frequent_sets == [
    [1], [2], [3], [4],
    [1, 2], [2, 3], [2, 4], [3, 4],
]

RULE CONFIDENCE LIFT LEVERAGE
{1} -> {2} 1.00 1.17 0.06
{2} -> {1} 0.50 1.17 0.06
{2} -> {3} 0.50 0.88 -0.06
{3} -> {2} 0.75 0.88 -0.06
{2} -> {4} 0.67 0.93 -0.04
{4} -> {2} 0.80 0.93 -0.04
{3} -> {4} 0.75 1.05 0.02
{4} -> {3} 0.60 1.05 0.02


In [5]:
def test_on_dataset(filename, min_supp, min_conf):
    """Run `test` on the transactions stored in assets/<filename>.

    The file must consist of lines of whitespace-separated integers; each
    line is one transaction. The rows are parsed lazily, so `test` is called
    while the file is still open.
    """
    with open(f'assets/{filename}') as dataset:
        transactions = (map(int, row.split()) for row in dataset)
        test(transactions, min_supp, min_conf)

# 2

In [6]:
# Retail dataset: minimum support 2 %, minimum confidence 50 %.
test_on_dataset('retail.dat', min_supp=0.02, min_conf=0.5)

RULE CONFIDENCE LIFT LEVERAGE
{32} -> {39} 0.56 0.97 -0.00
{32} -> {48} 0.53 1.11 0.01
{36} -> {38} 0.95 5.37 0.03
{36} -> {39} 0.69 1.21 0.00
{38} -> {39} 0.66 1.15 0.02
{38} -> {48} 0.51 1.07 0.01
{110} -> {38} 0.98 5.51 0.03
{170} -> {38} 0.98 5.53 0.03
{41} -> {39} 0.76 1.33 0.03
{48} -> {39} 0.69 1.20 0.06
{39} -> {48} 0.58 1.20 0.06
{65} -> {39} 0.62 1.08 0.00
{89} -> {39} 0.72 1.25 0.01
{170} -> {39} 0.66 1.16 0.00
{225} -> {39} 0.72 1.26 0.01
{237} -> {39} 0.64 1.11 0.00
{310} -> {39} 0.71 1.24 0.00
{41} -> {48} 0.60 1.26 0.02
{65} -> {48} 0.57 1.18 0.00
{89} -> {48} 0.73 1.53 0.01
{32, 38} -> {39} 0.65 1.13 0.00
{32, 41} -> {39} 0.74 1.28 0.01
{32, 48} -> {39} 0.67 1.17 0.01
{32, 39} -> {48} 0.64 1.34 0.02
{32, 41} -> {48} 0.65 1.35 0.01
{36} -> {38, 39} 0.66 5.65 0.02
{36, 38} -> {39} 0.70 1.21 0.00
{36, 39} -> {38} 0.95 5.40 0.02
{41, 38} -> {39} 0.78 1.36 0.01
{48, 38} -> {39} 0.77 1.34 0.02
{38, 39} -> {48} 0.59 1.23 0.01
{170} -> {38, 39} 0.65 5.55 0.02
{170, 38} -> {39} 

# 3

In [7]:
# Kosarak dataset: minimum support 5 %, minimum confidence 50 %.
test_on_dataset('kosarak.dat', min_supp=0.05, min_conf=0.5)

RULE CONFIDENCE LIFT LEVERAGE
{1} -> {6} 0.67 1.10 0.01
{3} -> {6} 0.59 0.97 -0.01
{7} -> {6} 0.85 1.39 0.02
{11} -> {6} 0.89 1.47 0.10
{6} -> {11} 0.54 1.47 0.10
{27} -> {6} 0.82 1.36 0.02
{148} -> {6} 0.93 1.52 0.02
{218} -> {6} 0.88 1.44 0.02
{7} -> {11} 0.66 1.79 0.03
{148} -> {11} 0.80 2.17 0.03
{218} -> {11} 0.70 1.89 0.03
{218} -> {148} 0.66 9.40 0.05
{148} -> {218} 0.84 9.40 0.05
{1, 3} -> {6} 0.68 1.12 0.01
{1, 11} -> {6} 0.94 1.54 0.03
{1, 6} -> {11} 0.65 1.77 0.04
{3, 11} -> {6} 0.89 1.47 0.05
{3, 6} -> {11} 0.54 1.47 0.05
{7} -> {11, 6} 0.64 1.96 0.03
{11, 7} -> {6} 0.98 1.61 0.02
{6, 7} -> {11} 0.76 2.06 0.03
{148} -> {11, 6} 0.79 2.41 0.03
{11, 148} -> {6} 0.99 1.63 0.02
{148, 6} -> {11} 0.85 2.32 0.03
{218} -> {11, 6} 0.68 2.09 0.03
{218, 11} -> {6} 0.98 1.62 0.02
{218, 6} -> {11} 0.78 2.12 0.03
{218} -> {148, 6} 0.64 9.81 0.05
{148} -> {218, 6} 0.81 10.36 0.05
{218, 148} -> {6} 0.97 1.59 0.02
{218, 6} -> {148} 0.73 10.36 0.05
{148, 6} -> {218} 0.88 9.81 0.05
{218} -> {1

# 4