In [2]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

In [3]:
groceryData=pd.read_csv("data\GroceryStoreDataSet.csv",names=['Products'],header=None)

In [5]:
groceryData.head()

Unnamed: 0,Products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"


In [6]:
groceryData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Products  20 non-null     object
dtypes: object(1)
memory usage: 288.0+ bytes


In [7]:
items=[]
for i in groceryData.values:
    items.extend( i[0].split(",")  )
items=list(set(items))
items

['COCK',
 'BREAD',
 'BISCUIT',
 'MILK',
 'BOURNVITA',
 'MAGGI',
 'CORNFLAKES',
 'COFFEE',
 'TEA',
 'JAM',
 'SUGER']

In [8]:
df=pd.DataFrame(data=0,columns=items,index=range(len(groceryData)))
for i in df.columns:
    df[i] = groceryData['Products'].str.contains(i)
df

Unnamed: 0,COCK,BREAD,BISCUIT,MILK,BOURNVITA,MAGGI,CORNFLAKES,COFFEE,TEA,JAM,SUGER
0,False,True,True,True,False,False,False,False,False,False,False
1,False,True,True,True,False,False,True,False,False,False,False
2,False,True,False,False,True,False,False,False,True,False,False
3,False,True,False,True,False,True,False,False,False,True,False
4,False,False,True,False,False,True,False,False,True,False,False
5,False,True,False,False,True,False,False,False,True,False,False
6,False,False,False,False,False,True,True,False,True,False,False
7,False,True,True,False,False,True,False,False,True,False,False
8,False,True,False,False,False,True,False,False,True,True,False
9,False,True,False,True,False,False,False,False,False,False,False


The data frame is ready now. Lets dive into exploring the association between items. We will take 0.1 as min support value. That will eliminate any item has less than 0.1 support value.

In [9]:
df_freq = apriori(df, min_support = 0.1, use_colnames = True)
df_freq

Unnamed: 0,support,itemsets
0,0.15,(COCK)
1,0.65,(BREAD)
2,0.35,(BISCUIT)
3,0.25,(MILK)
4,0.2,(BOURNVITA)
5,0.25,(MAGGI)
6,0.3,(CORNFLAKES)
7,0.4,(COFFEE)
8,0.35,(TEA)
9,0.1,(JAM)


In [10]:
association_rules(df_freq, metric = "lift", min_threshold = 1).sort_values(by=['antecedent support','confidence'],ascending=False).reset_index(drop=True).head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(BREAD),(MILK),0.65,0.25,0.2,0.307692,1.230769,0.0375,1.083333
1,(BREAD),(SUGER),0.65,0.3,0.2,0.307692,1.025641,0.005,1.011111
2,(BREAD),(BOURNVITA),0.65,0.2,0.15,0.230769,1.153846,0.02,1.04
3,(BREAD),(JAM),0.65,0.1,0.1,0.153846,1.538462,0.035,1.063636
4,(BREAD),"(BISCUIT, MILK)",0.65,0.1,0.1,0.153846,1.538462,0.035,1.063636
5,(BREAD),"(TEA, BOURNVITA)",0.65,0.1,0.1,0.153846,1.538462,0.035,1.063636
6,(BREAD),"(MAGGI, JAM)",0.65,0.1,0.1,0.153846,1.538462,0.035,1.063636
7,(COFFEE),(CORNFLAKES),0.4,0.3,0.2,0.5,1.666667,0.08,1.4
8,(COFFEE),(SUGER),0.4,0.3,0.2,0.5,1.666667,0.08,1.4
9,(COFFEE),(COCK),0.4,0.15,0.15,0.375,2.5,0.09,1.36


# Result

We can see the item which is getting sold the most is in the "antecedents" column with its support value in the "antecedent support" column. So 1) BREAD 2) COFFEE 3) TEA are getting sold the most.
After getting the first item in the "antecedent"(with "antecedent support" frequency), in the "consequents" column we can see the item which are most probably will be sold together(with "confidence" value frequency)
"confidence" tells us how sure we are about selling the 2. item after selling the 1. item(that might(not) be the impact of first item , while "lift" tells how selling the 1. item affect the proabability of selling the 2. item.
So based on the result some strategical actions can be taken to increase selling items. Also new products can be tried and after analysing the results, the best ones can be replaced with the items which are not getting sold well.

# Bonus

We can also count the items and sort them based on the how frequently they have been sold as 2/3/4 items

In [11]:
df_freq['item_count'] = df_freq['itemsets'].apply(lambda x:len(x))

In [12]:
df_freq[(df_freq['item_count']==2) & (df_freq['support']>0.1)].sort_values(by='support',ascending=False)

Unnamed: 0,support,itemsets,item_count
14,0.2,"(BREAD, BISCUIT)",2
15,0.2,"(BREAD, MILK)",2
19,0.2,"(BREAD, TEA)",2
21,0.2,"(BREAD, SUGER)",2
30,0.2,"(MAGGI, TEA)",2
32,0.2,"(COFFEE, CORNFLAKES)",2
34,0.2,"(COFFEE, SUGER)",2
13,0.15,"(COCK, COFFEE)",2
16,0.15,"(BREAD, BOURNVITA)",2
17,0.15,"(MAGGI, BREAD)",2
