In [12]:
import numpy as np
import pandas as pd 
# Lift pandas' display truncation so tables are rendered with every row and column.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
)

The dataset consists of transaction data: each row lists the items bought in one transaction.

### Data Preparation

In [13]:
# The file has no header row; every line is loaded into a single
# column named 'Grocery'.
df = pd.read_csv(
    "GroceryStoreDataSet.csv",
    header=None,
    names=['Grocery'],
)

In [14]:
# Preview the first five raw transactions.
df.head(n=5)

Unnamed: 0,Grocery
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"


In [16]:
# Number the transactions 0..len(df)-1 and use that number as the
# index, named 'trx_ID'.
df = df.assign(trx_ID=range(len(df))).set_index('trx_ID')

In [17]:
# Show the first ten transactions, now indexed by trx_ID.
df.head(n=10)

Unnamed: 0_level_0,Grocery
trx_ID,Unnamed: 1_level_1
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"
5,"BREAD,TEA,BOURNVITA"
6,"MAGGI,TEA,CORNFLAKES"
7,"MAGGI,BREAD,TEA,BISCUIT"
8,"JAM,MAGGI,BREAD,TEA"
9,"BREAD,MILK"


In [20]:
# Turn each comma-separated basket string into a list of item names.
data = [basket.split(",") for basket in df['Grocery']]

In [22]:
# Inspect the first ten baskets (each one is a list of item-name strings).
data[:10]

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK']]

In [23]:
from mlxtend.preprocessing import TransactionEncoder

# Encode the baskets as a table with one True/False column per item.
tencoder = TransactionEncoder()
tencoder.fit(data)
te_data = tencoder.transform(data)

# NOTE: `df` is rebound here -- from this point on it holds the encoded
# table rather than the raw transaction strings.
df = pd.DataFrame(te_data, columns=tencoder.columns_)

In [24]:
# Preview the first five rows of the encoded item table.
df.head(n=5)

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,True,False,True,False,False,False,False,False,True,False,False
1,True,False,True,False,False,True,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,False,True
3,False,False,True,False,False,False,True,True,True,False,False
4,True,False,False,False,False,False,False,True,False,False,True


### Support Analysis

Using a minimum support of 0.2, i.e. keeping only itemsets that appear in at least 20% of the transactions.

In [5]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine the frequent itemsets (min_support=0.2, labelled by item name) and
# put the 'itemsets' column before 'support'.
df1 = apriori(df, min_support=0.2, use_colnames=True)[['itemsets', 'support']]

# Display only: the sorted view is not assigned back to df1.
df1.sort_values('support', ascending=False)

Unnamed: 0,itemsets,support
2,(BREAD),0.65
3,(COFFEE),0.4
0,(BISCUIT),0.35
8,(TEA),0.35
4,(CORNFLAKES),0.3
7,(SUGER),0.3
5,(MAGGI),0.25
6,(MILK),0.25
1,(BOURNVITA),0.2
9,"(BISCUIT, BREAD)",0.2


In [6]:
# Record how many items each itemset contains, then order the table by
# itemset size and support, both descending.
df1 = (
    df1
    .assign(length=df1['itemsets'].apply(len))
    .sort_values(by=["length", "support"], ascending=False)
)
df1

Unnamed: 0,itemsets,support,length
9,"(BISCUIT, BREAD)",0.2,2
10,"(MILK, BREAD)",0.2,2
11,"(SUGER, BREAD)",0.2,2
12,"(TEA, BREAD)",0.2,2
13,"(COFFEE, CORNFLAKES)",0.2,2
14,"(COFFEE, SUGER)",0.2,2
15,"(TEA, MAGGI)",0.2,2
2,(BREAD),0.65,1
3,(COFFEE),0.4,1
0,(BISCUIT),0.35,1


In [7]:
# Keep only the itemsets made of exactly two items.
df1.loc[df1['length'].eq(2)]

Unnamed: 0,itemsets,support,length
9,"(BISCUIT, BREAD)",0.2,2
10,"(MILK, BREAD)",0.2,2
11,"(SUGER, BREAD)",0.2,2
12,"(TEA, BREAD)",0.2,2
13,"(COFFEE, CORNFLAKES)",0.2,2
14,"(COFFEE, SUGER)",0.2,2
15,"(TEA, MAGGI)",0.2,2


### Confidence & Lift Analysis

Using a minimum confidence of 0.5; the resulting rules are ranked by lift, then by confidence.

In [8]:
# Build association rules from the frequent itemsets, thresholding on
# confidence at 0.5.
df_association = association_rules(
    df1,
    metric='confidence',
    min_threshold=0.5,
)

# Display only: strongest rules first (by lift, ties broken by confidence).
df_association.sort_values(by=['lift', 'confidence'], ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
9,(MAGGI),(TEA),0.25,0.35,0.2,0.8,2.285714,0.1125,3.25
8,(TEA),(MAGGI),0.35,0.25,0.2,0.571429,2.285714,0.1125,1.75
5,(CORNFLAKES),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
7,(SUGER),(COFFEE),0.3,0.4,0.2,0.666667,1.666667,0.08,1.8
4,(COFFEE),(CORNFLAKES),0.4,0.3,0.2,0.5,1.666667,0.08,1.4
6,(COFFEE),(SUGER),0.4,0.3,0.2,0.5,1.666667,0.08,1.4
1,(MILK),(BREAD),0.25,0.65,0.2,0.8,1.230769,0.0375,1.75
2,(SUGER),(BREAD),0.3,0.65,0.2,0.666667,1.025641,0.005,1.05
0,(BISCUIT),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667
3,(TEA),(BREAD),0.35,0.65,0.2,0.571429,0.879121,-0.0275,0.816667


# Conclusion

Several item pairs show a strong association (they pass both the minimum support of 0.2 and the minimum confidence of 0.5):

- Maggi and Tea (lift = 2.29)
- Cornflakes and Coffee (lift = 1.67)
- Sugar and Coffee (lift = 1.67)
- Milk and Bread (lift = 1.23)

Possible follow-up actions based on these results:
- Build a recommendation system for targeted advertising
- Run bundling or discount campaigns for the strongly associated itemsets (this can lift the sales of every item in the itemset)
- Use the associations as a reference for store layout
- Use the associations as a reference for stock allocation