In [1]:
import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import association_rules, apriori

In [2]:
df = pd.read_csv('data/bread basket.csv')
df.head()

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend


In [3]:
df = df[['Transaction', 'Item']]

In [4]:
df.groupby(['Transaction', 'Item']).head()

Unnamed: 0,Transaction,Item
0,1,Bread
1,2,Scandinavian
2,2,Scandinavian
3,3,Hot chocolate
4,3,Jam
...,...,...
20502,9682,Coffee
20503,9682,Tea
20504,9683,Coffee
20505,9683,Pastry


In [5]:
item_count = df.groupby(['Transaction', 'Item'])['Item'].count().reset_index(name ='Count')
item_count

Unnamed: 0,Transaction,Item,Count
0,1,Bread,1
1,2,Scandinavian,2
2,3,Cookies,1
3,3,Hot chocolate,1
4,3,Jam,1
...,...,...,...
18882,9682,Tacos/Fajita,1
18883,9682,Tea,1
18884,9683,Coffee,1
18885,9683,Pastry,1


In [6]:
my_basket = item_count.pivot_table(index='Transaction', columns='Item', values='Count', aggfunc='sum').fillna(0)

my_basket.head()

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
my_basket[0:5]['Scandinavian']

Transaction
1    0.0
2    2.0
3    0.0
4    0.0
5    0.0
Name: Scandinavian, dtype: float64

In [8]:
def converter(X):
    if X <= 0:
        return 0
    else:
        return 1
    
my_basket = my_basket.applymap(converter)
my_basket[0:5]['Scandinavian']

Transaction
1    0
2    1
3    0
4    0
5    0
Name: Scandinavian, dtype: int64

In [20]:
len(my_basket)

3465

In [18]:
my_basket.drop(my_basket.tail(6000).index, inplace=True)

In [19]:
my_basket.to_csv('my_basket.csv', index=False)

In [11]:
my_basket

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
frequent_items = apriori(my_basket, min_support = 0.02,use_colnames = True)
print(len(frequent_items))
frequent_items[0:10]

33


Unnamed: 0,support,itemsets
0,0.036344,(Alfajores)
1,0.327205,(Bread)
2,0.040042,(Brownie)
3,0.103856,(Cake)
4,0.478394,(Coffee)
5,0.054411,(Cookies)
6,0.039197,(Farm House)
7,0.05832,(Hot chocolate)
8,0.038563,(Juice)
9,0.061807,(Medialuna)


In [14]:
rules = association_rules(frequent_items, metric = "lift", min_threshold = 1)
rules.sort_values('confidence', ascending = False, inplace = True)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
19,(Toast),(Coffee),0.033597,0.478394,0.023666,0.704403,1.472431,0.007593,1.764582
13,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,0.005614,1.210871
15,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,0.006351,1.164682
11,(Juice),(Coffee),0.038563,0.478394,0.020602,0.534247,1.11675,0.002154,1.119919
17,(Sandwich),(Coffee),0.071844,0.478394,0.038246,0.532353,1.112792,0.003877,1.115384
3,(Cake),(Coffee),0.103856,0.478394,0.054728,0.526958,1.101515,0.005044,1.102664
7,(Cookies),(Coffee),0.054411,0.478394,0.028209,0.518447,1.083723,0.002179,1.083174
9,(Hot chocolate),(Coffee),0.05832,0.478394,0.029583,0.507246,1.060311,0.001683,1.058553
1,(Pastry),(Bread),0.086107,0.327205,0.02916,0.33865,1.034977,0.000985,1.017305
5,(Cake),(Tea),0.103856,0.142631,0.023772,0.228891,1.604781,0.008959,1.111865


In [15]:
rules[rules['confidence']>=0.5]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
19,(Toast),(Coffee),0.033597,0.478394,0.023666,0.704403,1.472431,0.007593,1.764582
13,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,0.005614,1.210871
15,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,0.006351,1.164682
11,(Juice),(Coffee),0.038563,0.478394,0.020602,0.534247,1.11675,0.002154,1.119919
17,(Sandwich),(Coffee),0.071844,0.478394,0.038246,0.532353,1.112792,0.003877,1.115384
3,(Cake),(Coffee),0.103856,0.478394,0.054728,0.526958,1.101515,0.005044,1.102664
7,(Cookies),(Coffee),0.054411,0.478394,0.028209,0.518447,1.083723,0.002179,1.083174
9,(Hot chocolate),(Coffee),0.05832,0.478394,0.029583,0.507246,1.060311,0.001683,1.058553


In [10]:
import csv
import itertools

DataFile = open('data/grocery.csv', 'r')
minsup = 0.02
f2 = "Rules.txt"
f1 = "FItems.txt"
minconf = 0.3


def L1():
    '''
    Find frequent 1-itemsets
    '''
    #Get all 1-itemsets in the list items and their counts in the dictionary counts
    DataCaptured = csv.reader(DataFile, delimiter=',')
    data = list(DataCaptured)
    for e in data:
        e = sorted(e)
    count = {}
    for items in data:
        for item in items:
            if item not in count:
                count[(item)] = 1
            else:
                count[(item)] = count[(item)] + 1
    #print("C1 Items", count)
    print("C1 Length : ", len(count))
    print()

    #Thresholding
    count2 = {k: v for k, v in count.items() if v >= minsup*9835}
    #print("L1 Items : ", count2)
    print("L1 Length : ", len(count2))
    print()

    return count2, data


def generateCk(Lk_1, flag, data):
    '''
    Generate Ck by joining 2 Lk-1
    '''
    Ck = []

    if flag == 1:
        flag = 0
        for item1 in Lk_1:
            for item2 in Lk_1:
                if item2 > item1:
                    Ck.append((item1, item2))
        print("C2: ", Ck[1:3])
        print("length : ", len(Ck))
        print()

    else:
        for item in Lk_1:
            k = len(item)
        for item1 in Lk_1:
            for item2 in Lk_1:
                if (item1[:-1] == item2[:-1]) and (item1[-1] != item2[-1]):
                    if item1[-1] > item2[-1]:
                        Ck.append(item2 + (item1[-1],))
                    else:
                        Ck.append(item1 + (item2[-1],))
        print("C" + str(k+1) + ": ", Ck[1:3])
        print("Length : ", len(Ck))
        print()
    L = generateLk(set(Ck), data)
    return L, flag


def generateLk(Ck, data):
    '''
    If item in Ck belongs to a transaction,
    it makes it into list Ct
    Then Ct is thresholded to form L
    '''
    count = {}
    for itemset in Ck:
        #print(itemset)
        for transaction in data:
            if all(e in transaction for e in itemset):
                if itemset not in count:
                    count[itemset] = 1
                else:
                    count[itemset] = count[itemset] + 1

    print("Ct Length : ", len(count))
    print()

    count2 = {k: v for k, v in count.items() if v >= minsup*9835}
    print("L Length : ", len(count2))
    print()
    return count2


def rulegenerator(fitems):
    '''
    Generates association rules from the frequent itemsets
    '''
    counter = 0
    for itemset in fitems.keys():
        if isinstance(itemset, str):
            continue
        length = len(itemset)

        union_support = fitems[tuple(itemset)]
        for i in range(1, length):

            lefts = map(list, itertools.combinations(itemset, i))
            for left in lefts:
                if len(left) == 1:
                    if ''.join(left) in fitems:
                        leftcount = fitems[''.join(left)]
                        conf = union_support / leftcount
                else:
                    if tuple(left) in fitems:
                        leftcount = fitems[tuple(left)]
                        conf = union_support / leftcount
                if conf >= minconf:
                    fo = open(f2, "a+")
                    right = list(itemset[:])
                    for e in left:
                        right.remove(e)
                    fo.write(str(left) + ' (' + str(leftcount) + ')' + ' -> ' + str(right) + ' (' + str(fitems[''.join(right)]) + ')' + ' [' + str(conf) + ']' + '\n')
                    print(str(left) + ' -> ' + str(right) + ' (' + str(conf) + ')')
                    counter = counter + 1
                    #Greater than 1???
                    fo.close()
    print(counter, "rules generated")


def apriori():
    '''
    The runner function
    '''
    L, data = L1()
    flag = 1
    FreqItems = dict(L)
    while(len(L) != 0):
        fo = open(f1, "a+")
        for k, v in L.items():
            fo.write(str(k) + ' >>> ' + str(v) + '\n\n')
        fo.close()

        L, flag = generateCk(L, flag, data)
        FreqItems.update(L)
    rulegenerator(FreqItems)

In [4]:
apriori()

C1 Length :  169

L1 Length :  59

C2:  [('citrus fruit', 'coffee'), ('citrus fruit', 'tropical fruit')]
length :  1711

Ct Length :  1711

L Length :  61

C3:  [('citrus fruit', 'other vegetables', 'yogurt'), ('citrus fruit', 'whole milk', 'yogurt')]
Length :  222

Ct Length :  111

L Length :  2

C4:  []
Length :  0

Ct Length :  0

L Length :  0

['pork'] -> ['whole milk'] (0.3844797178130511)
['yogurt'] -> ['other vegetables'] (0.3112244897959184)
['fruit/vegetable juice'] -> ['whole milk'] (0.36849507735583686)
['sausage'] -> ['rolls/buns'] (0.32575757575757575)
['newspapers'] -> ['whole milk'] (0.34267515923566877)
['sausage'] -> ['whole milk'] (0.3181818181818182)
['yogurt'] -> ['whole milk'] (0.40160349854227406)
['domestic eggs'] -> ['other vegetables'] (0.35096153846153844)
['butter'] -> ['other vegetables'] (0.3614678899082569)
['root vegetables'] -> ['whole milk'] (0.44869402985074625)
['other vegetables'] -> ['whole milk'] (0.38675775091960063)
['domestic eggs'] -> ['whole

In [14]:
with open("grocery.csv", "rt") as fin:
    with open("out.csv", "wt") as fout:
        for line in fin:
            fout.write(line.replace(',,', ''))
            
with open('out.csv', 'rt') as fin:
    with open("out1.csv", 'wt') as fout:
        for line in fin:
            fout.write(line.replace(',\n', '\n'))