In [1]:
import numpy as np
import pandas as pd
import re

### apriori analysis

In [2]:
def data_gen(dataset):
    """Yield every transaction of ``dataset`` as a list of item strings.

    The transactions are taken from the first positional column of
    ``dataset`` (``dataset.iloc[:, 0]``). All spaces are stripped from each
    cell before it is split on commas.
    """
    first_column = dataset.iloc[:, 0]
    for cell in first_column:
        compact = cell.replace(" ", "")
        yield compact.split(",")


def row_gen(dataset):
    """Yield every entry of the iterable ``dataset`` as a list of items.

    Each entry is a comma-separated string; spaces are removed before the
    split. Passing a dict iterates over its keys.
    """
    yield from (entry.replace(" ", "").split(",") for entry in dataset)


def threshold(data, min_sup):
    """Return the entries of ``data`` whose count is at least ``min_sup``.

    A new dict is built; the surviving entries keep the order they had in
    ``data``.
    """
    frequent = {}
    for key, occurrences in data.items():
        if occurrences >= min_sup:
            frequent[key] = occurrences
    return frequent


def union_set(datadict):
    """Join every ordered pair of distinct keys of ``datadict`` with a comma.

    For each two keys ``left`` and ``right`` with ``left < right`` the
    string ``"left,right"`` is produced; the results are returned as a set.
    """
    joined = set()
    for left in datadict:
        for right in datadict:
            if left < right:
                joined.add(','.join((left, right)))
    return joined


def _support_counts(candidates, baskets):
    """Count, for each comma-joined candidate key, the baskets holding all its items."""
    counts = dict.fromkeys(candidates, 0)
    for key in counts:
        items = set(key.split(','))
        counts[key] = sum(1 for basket in baskets if items <= basket)
    return counts


def apriori(dataset, min_sup):
    """Find the frequent 1-, 2- and 3-itemsets of ``dataset``.

    Transactions are read through ``data_gen``. One counting pass is made
    per itemset size, and after each pass only the itemsets whose count is
    at least ``min_sup`` are kept (via ``threshold``).

    Parameters
    ----------
    dataset : object accepted by ``data_gen``
        Source of the transactions.
    min_sup : int or float
        Minimum support expressed as an absolute number of transactions
        (not as a fraction).

    Returns
    -------
    tuple of dict
        ``(scan1, scan2, scan3)``. Each dict maps a comma-joined itemset to
        the number of transactions containing it and is sorted by key.
        Items inside a key are in sorted order, so keys are identical from
        run to run.

    Notes
    -----
    The three result dicts are also printed.
    """
    # One set per transaction: membership tests are cheap and an item
    # repeated inside a transaction contributes to its support only once,
    # matching how the 2- and 3-itemset passes count.
    baskets = [set(items) for items in data_gen(dataset)]

    # Pass 1: support of every single item.
    scan1 = {}
    for basket in baskets:
        for item in basket:
            scan1[item] = scan1.get(item, 0) + 1
    scan1 = dict(sorted(threshold(scan1, min_sup).items()))

    # Pass 2: candidate pairs are all combinations of frequent single items.
    scan2 = _support_counts(union_set(scan1), baskets)
    scan2 = dict(sorted(threshold(scan2, min_sup).items()))

    # Candidate triples: merge two frequent pairs that share one item and
    # keep the result only when all three of its pairs are frequent. The
    # pairs in scan2 already satisfy min_sup, so no further count check is
    # needed here.
    frequent_pairs = {frozenset(key.split(',')) for key in scan2}
    candidates3 = set()
    for joined in union_set(scan2):
        items = sorted(set(joined.split(',')))
        if len(items) != 3:
            continue
        first, second, third = items
        sub_pairs = {
            frozenset((first, second)),
            frozenset((first, third)),
            frozenset((second, third)),
        }
        if sub_pairs <= frequent_pairs:
            # Joining the sorted items gives one canonical key per itemset.
            candidates3.add(','.join(items))

    # Pass 3: support of the surviving candidate triples.
    scan3 = _support_counts(candidates3, baskets)
    scan3 = dict(sorted(threshold(scan3, min_sup).items()))

    print('frequent-1 itemset:', scan1)
    print('frequent-2 itemset:', scan2)
    print('frequent-3 itemset:', scan3)

    return scan1, scan2, scan3


def _itemset_count(counts, items):
    """Return the count stored for exactly ``items`` in ``counts``, or None if absent."""
    wanted = set(items)
    for key, count in counts.items():
        # Keys are comma-joined itemsets; compare as sets so that the order
        # of the items inside a key does not matter.
        if set(key.split(',')) == wanted:
            return count
    return None


def apriori_assoc(df, datadict1, datadict2, label):
    """Compute support and confidence of a single association rule.

    ``label`` is a comma-separated itemset. Its last item is the consequent
    and all preceding items form the antecedent, e.g. ``'a,b,c'`` stands
    for the rule ``['a', 'b'] -> c``.

    Parameters
    ----------
    df : object with an ``index`` attribute
        ``len(df.index)`` is used as the total number of transactions.
    datadict1 : dict
        Counts keyed by comma-joined itemset; must contain the antecedent.
    datadict2 : dict
        Counts keyed by comma-joined itemset; must contain the full itemset
        named by ``label``.
    label : str
        Comma-separated items of the rule, consequent last.

    Returns
    -------
    tuple
        ``(rule_text, support, confidence)`` where ``support`` is the count
        of the full itemset divided by the number of transactions and
        ``confidence`` is that count divided by the antecedent's count.

    Raises
    ------
    ValueError
        If ``label`` has fewer than two items, or if the full itemset or
        the antecedent is missing from the corresponding dict.
    """
    splitlabel = label.split(',')
    antecedent, consequent = splitlabel[:-1], splitlabel[-1]
    if not antecedent:
        raise ValueError(
            f"label {label!r} must contain at least two comma-separated items"
        )

    # Exact itemset matches are required: the count of a strict superset
    # would describe a different (smaller or equal) support.
    itemset_count = _itemset_count(datadict2, splitlabel)
    if itemset_count is None:
        raise ValueError(f"itemset {label!r} not found in datadict2")

    antecedent_count = _itemset_count(datadict1, antecedent)
    if antecedent_count is None:
        raise ValueError(f"antecedent {antecedent!r} not found in datadict1")

    # support
    supp = itemset_count / len(df.index)

    # confidence
    conf = itemset_count / antecedent_count

    string = f'{antecedent} -> {consequent}'
    return string, supp, conf

#### apriori analysis examples

In [3]:
# test 1: first example dataset, minimum support of 50% of the transactions
dataset = pd.read_csv('datasets/apriori_ex.csv', header=0, index_col=0)
min_sup = 0.50 * len(dataset)  # apriori compares against absolute counts
scan1, scan2, scan3 = apriori(dataset=dataset, min_sup=min_sup)
# expected (the order of the items inside a multi-item key is not significant):
# itemset: {'E', 'D', 'C', 'A', 'B'}
# frequent-1 itemset: {'A': 2, 'B': 3, 'C': 3, 'E': 3}
# frequent-2 itemset: {'A,C': 2, 'B,C': 2, 'B,E': 3, 'C,E': 2}
# frequent-3 itemset: {'C,B,E': 2}

frequent-1 itemset: {'A': 2, 'B': 3, 'C': 3, 'E': 3}
frequent-2 itemset: {'A,C': 2, 'B,C': 2, 'B,E': 3, 'C,E': 2}
frequent-3 itemset: {'E,B,C': 2}


In [4]:

# test 2: second example dataset, minimum support of 60% of the transactions
dataset = pd.read_csv('datasets/apriori_ex2.csv', header=0, index_col=0)
min_sup = 0.60 * len(dataset)  # apriori compares against absolute counts
scan1, scan2, scan3 = apriori(dataset=dataset, min_sup=min_sup)
# expected (the order of the items inside a multi-item key is not significant):
# itemset: {'Z', 'N', 'B', 'E', 'D', 'I', 'T', 'O', 'G', 'S', 'F'}
# frequent-1 itemset: {'B': 4, 'E': 3, 'G': 3, 'I': 3, 'N': 4, 'Z': 3}
# frequent-2 itemset: {'B,I': 3, 'B,N': 4, 'G,Z': 3, 'I,N': 3}
# frequent-3 itemset: {'I,B,N': 3}

frequent-1 itemset: {'B': 4, 'E': 3, 'G': 3, 'I': 3, 'N': 4, 'Z': 3}
frequent-2 itemset: {'B,I': 3, 'B,N': 4, 'G,Z': 3, 'I,N': 3}
frequent-3 itemset: {'N,B,I': 3}


In [5]:
# test 3: third example dataset, minimum support of 60% of the transactions
dataset = pd.read_csv('datasets/apriori_ex3.csv', header=0, index_col=0)
min_sup = 0.60 * len(dataset)  # apriori compares against absolute counts
scan1, scan2, scan3 = apriori(dataset=dataset, min_sup=min_sup)
# expected (the order of the items inside a multi-item key is not significant):
# itemset: {'pie', 'bread', 'cereal', 'cheese', 'milk', 'cherry'}
# frequent-1 itemset: {'bread': 4, 'cheese': 3, 'milk': 4, 'pie': 3}
# frequent-2 itemset: {'bread,cheese': 3, 'bread,milk': 4, 'bread,pie': 3, 'cheese,milk': 3, 'milk,pie': 3}
# frequent-3 itemset: {'bread,cheese,milk': 3, 'pie,bread,milk': 3}

frequent-1 itemset: {'bread': 4, 'cheese': 3, 'milk': 4, 'pie': 3}
frequent-2 itemset: {'bread,cheese': 3, 'bread,milk': 4, 'bread,pie': 3, 'cheese,milk': 3, 'milk,pie': 3}
frequent-3 itemset: {'bread,cheese,milk': 3, 'pie,bread,milk': 3}


#### association rules (support/confidence)

In [6]:
# test 4
# association rules: support and confidence of rules built from the
# 3-itemsets; relies on dataset, scan2 and scan3 left by the test 3 cell
tofind = [
    'bread,milk,pie', 'milk,pie,bread', 'bread,pie,milk',
    'bread,milk,cheese', 'cheese,milk,bread', 'bread,cheese,milk',
]
for sets in tofind:
    assoc_str, supp, conf = apriori_assoc(dataset, scan2, scan3, sets)
    print(assoc_str, 'support:', supp, 'confidence:', conf)
# expected output:
# ['bread', 'milk'] -> pie support: 0.75 confidence: 0.75
# ['milk', 'pie'] -> bread support: 0.75 confidence: 1.0
# ['bread', 'pie'] -> milk support: 0.75 confidence: 1.0
# ['bread', 'milk'] -> cheese support: 0.75 confidence: 0.75
# ['cheese', 'milk'] -> bread support: 0.75 confidence: 1.0
# ['bread', 'cheese'] -> milk support: 0.75 confidence: 1.0

['bread', 'milk'] -> pie support: 0.75 confidence: 0.75
['milk', 'pie'] -> bread support: 0.75 confidence: 1.0
['bread', 'pie'] -> milk support: 0.75 confidence: 1.0
['bread', 'milk'] -> cheese support: 0.75 confidence: 0.75
['cheese', 'milk'] -> bread support: 0.75 confidence: 1.0
['bread', 'cheese'] -> milk support: 0.75 confidence: 1.0
