In [14]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [15]:
# Load the raw purchase records from the CSV file (path is relative to the
# current working directory).
df = pd.read_csv(filepath_or_buffer="Groceries_dataset.csv")
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [16]:
# Build one basket per (member, date) pair: every item description recorded
# for the same member on the same date is collected into a single Python list.
df_grouped = (
    df.groupby(by=['Member_number', 'Date'])['itemDescription']
    .apply(list)
    .reset_index()  # turn the group keys back into ordinary columns
)
df_grouped

Unnamed: 0,Member_number,Date,itemDescription
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"
...,...,...,...
14958,4999,24-01-2015,"[tropical fruit, berries, other vegetables, yo..."
14959,4999,26-12-2015,"[bottled water, herbs]"
14960,5000,09-03-2014,"[fruit/vegetable juice, onions]"
14961,5000,10-02-2015,"[soda, root vegetables, semi-finished bread]"


In [17]:
# Plain Python list of baskets (each basket is itself a list of item names),
# which is the input format handed to the transaction encoder below.
transactions = list(df_grouped['itemDescription'])

In [18]:
# One-hot encode the baskets: one row per transaction and one boolean column
# per distinct item, True where the basket contains that item.
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
# te.columns_ holds the item labels in the same order as te_array's columns.
# The frame is built exactly once; wrapping it in pd.DataFrame(...) a second
# time with identical columns would only create a redundant copy.
df_encoded = pd.DataFrame(te_array, columns=te.columns_)
df_encoded

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


`Support` tells us how frequently an itemset appears in the dataset, i.e. the fraction of all transactions that contain it.

In [19]:
# Mine the frequent itemsets from the one-hot encoded baskets.
frequent_itemsets = apriori(
    df_encoded,
    use_colnames=True,   # label itemsets with item names rather than column indices
    min_support=0.002,   # keep itemsets present in at least 0.2% of transactions
)
print(frequent_itemsets)

      support                          itemsets
0    0.004010           (Instant food products)
1    0.021386                        (UHT-milk)
2    0.008087                   (baking powder)
3    0.033950                            (beef)
4    0.021787                         (berries)
..        ...                               ...
325  0.002606             (waffles, whole milk)
326  0.004611  (whipped/sour cream, whole milk)
327  0.002941      (whipped/sour cream, yogurt)
328  0.003141         (white bread, whole milk)
329  0.011161              (yogurt, whole milk)

[330 rows x 2 columns]


`Confidence` tells us how likely it is that a customer will buy item B, given that they have already bought item A: $\text{confidence}(A \rightarrow B) = \dfrac{\text{support}(A \cup B)}{\text{support}(A)}$.

In [20]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose confidence is at least 0.1.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.1,
    metric="confidence",
)

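`Lift` tells us how much more likely item B is to be purchased when A is purchased, compared with how often B is purchased overall: $\text{lift}(A \rightarrow B) = \dfrac{\text{confidence}(A \rightarrow B)}{\text{support}(B)}$. A lift above 1 indicates a positive association, a lift close to 1 means A and B are roughly independent, and a lift below 1 indicates a negative association.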

In [21]:
# Keep only the columns needed for reporting and rank the rules from the
# highest lift to the lowest.
rules_sorted = (
    rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]
    .sort_values(by='lift', ascending=False)
)

In [22]:
# Show the ten rules with the highest lift (rules_sorted is already ordered
# by descending lift).
print("Top 10 Association Rules :\n")
print(rules_sorted.iloc[:10])

Top 10 Association Rules :

                antecedents         consequents   support  confidence  \
23            (frankfurter)  (other vegetables)  0.005146    0.136283   
14              (chocolate)        (rolls/buns)  0.002807    0.118980   
25           (frozen meals)  (other vegetables)  0.002139    0.127490   
35                   (meat)  (other vegetables)  0.002139    0.126984   
30                    (ham)        (whole milk)  0.002740    0.160156   
3                 (berries)  (other vegetables)  0.002673    0.122699   
28  (fruit/vegetable juice)        (rolls/buns)  0.003743    0.110020   
6            (bottled beer)        (whole milk)  0.007151    0.157817   
42            (salty snack)  (other vegetables)  0.002205    0.117438   
10                  (candy)        (whole milk)  0.002139    0.148837   

        lift  
23  1.116150  
14  1.081592  
25  1.044134  
35  1.039991  
30  1.014142  
3   1.004899  
28  1.000136  
6   0.999330  
42  0.961807  
10  0.942468  
