In [3]:
# import the libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from csv import reader
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [6]:
## reading the dataset
# Each line of groceries.csv becomes one transaction: a list holding the
# comma-separated fields found on that line.
# newline='' hands line-ending handling to the csv module, as the csv docs
# require for file objects given to csv.reader (otherwise quoted fields with
# embedded newlines can be split incorrectly).
with open('groceries.csv', 'r', newline='') as read_obj:
    groceries = list(reader(read_obj))

In [7]:
# learn the set of distinct items from the transaction list, then encode
# every transaction as a row of True/False flags (one flag per item)
encoder = TransactionEncoder()
encoder.fit(groceries)
transactions = encoder.transform(groceries)

In [8]:
# turn the True/False flags into 1/0 integers
transactions = transactions.astype(int)
# wrap the encoded array in a dataframe, labelling each column with the
# item name recorded by the encoder
df = pd.DataFrame(data=transactions, columns=encoder.columns_)
# preview the first five rows
df.head(5)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# finding the dimensions of the dataframe as (rows, columns):
# one row per entry of `groceries`, one column per name in encoder.columns_
df.shape

(9835, 169)

In [10]:
# applying the apriori algorithm
# mlxtend's apriori is designed for a boolean one-hot frame; passing the 0/1
# integer frame triggers its DeprecationWarning about non-bool input (worse
# computational performance, support may be dropped). Cast to bool only for
# this call so `df` keeps its 0/1 values; the resulting supports are the same.
frequent_itemsets = apriori(df.astype(bool), min_support=0.02, use_colnames=True)
# record how many items each frequent itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets



Unnamed: 0,support,itemsets,length
0,0.033452,(UHT-milk),1
1,0.052466,(beef),1
2,0.033249,(berries),1
3,0.026029,(beverages),1
4,0.080529,(bottled beer),1
...,...,...,...
117,0.032232,"(whole milk, whipped/sour cream)",2
118,0.020742,"(yogurt, whipped/sour cream)",2
119,0.056024,"(whole milk, yogurt)",2
120,0.023183,"(root vegetables, whole milk, other vegetables)",3


In [12]:
# order the itemsets from highest to lowest support
frequent_itemsets = frequent_itemsets.sort_values('support', ascending=False)
# five most frequent single items (length 1) with support of at least 2%
frequent_itemsets.loc[
    lambda fi: fi['length'].eq(1) & fi['support'].ge(0.02)
].head(5)

Unnamed: 0,support,itemsets,length
57,0.255516,(whole milk),1
39,0.193493,(other vegetables),1
43,0.183935,(rolls/buns),1
49,0.174377,(soda),1
58,0.139502,(yogurt),1


In [13]:
# itemsets containing more than one item whose support is at least 5%
frequent_itemsets.query("length > 1 and support >= 0.05")

Unnamed: 0,support,itemsets,length
91,0.074835,"(whole milk, other vegetables)",2
103,0.056634,"(whole milk, rolls/buns)",2
119,0.056024,"(whole milk, yogurt)",2


In [14]:
# item pairs (itemsets of exactly two items) with support of at least 2%
is_pair = frequent_itemsets['length'].eq(2)
has_min_support = frequent_itemsets['support'].ge(0.02)
frequent_itemsets[is_pair & has_min_support]

Unnamed: 0,support,itemsets,length
91,0.074835,"(whole milk, other vegetables)",2
103,0.056634,"(whole milk, rolls/buns)",2
119,0.056024,"(whole milk, yogurt)",2
106,0.048907,"(root vegetables, whole milk)",2
85,0.047382,"(root vegetables, other vegetables)",2
...,...,...,...
75,0.020539,"(whole milk, frankfurter)",2
60,0.020437,"(whole milk, bottled beer)",2
76,0.020437,"(whole milk, frozen vegetables)",2
96,0.020437,"(tropical fruit, pip fruit)",2
