In [2]:
import pandas as pd
from csv import reader
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules



In [3]:
# Step 1: Read and process the data
with open('groceries.csv', 'r') as f:
    transactions = [line.strip().split(',') for line in f.readlines()]

# Step 2: Convert to one-hot encoded dataframe
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [4]:
df.head(10)


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
5,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [5]:
frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)

### 122 itemsets occurs atleast in 2% of transactions

In [6]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.033452,(UHT-milk)
1,0.052466,(beef)
2,0.033249,(berries)
3,0.026029,(beverages)
4,0.080529,(bottled beer)
...,...,...
117,0.032232,"(whipped/sour cream, whole milk)"
118,0.020742,"(whipped/sour cream, yogurt)"
119,0.056024,"(whole milk, yogurt)"
120,0.023183,"(other vegetables, root vegetables, whole milk)"


In [7]:
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.033452,(UHT-milk),1
1,0.052466,(beef),1
2,0.033249,(berries),1
3,0.026029,(beverages),1
4,0.080529,(bottled beer),1
...,...,...,...
117,0.032232,"(whipped/sour cream, whole milk)",2
118,0.020742,"(whipped/sour cream, yogurt)",2
119,0.056024,"(whole milk, yogurt)",2
120,0.023183,"(other vegetables, root vegetables, whole milk)",3


### Top 3 items with atleast 2% support

In [8]:
frequent_itemsets[ (frequent_itemsets['length'] == 1) &
                   (frequent_itemsets['support'] >= 0.02) ][0:3]

Unnamed: 0,support,itemsets,length
0,0.033452,(UHT-milk),1
1,0.052466,(beef),1
2,0.033249,(berries),1


### All frequent itemsets with minimum support of 4% support

In [9]:
frequent_itemsets[(frequent_itemsets['length'] > 1) & 
                  (frequent_itemsets['support'] >= 0.04)]

Unnamed: 0,support,itemsets,length
84,0.042603,"(other vegetables, rolls/buns)",2
85,0.047382,"(other vegetables, root vegetables)",2
91,0.074835,"(other vegetables, whole milk)",2
92,0.043416,"(other vegetables, yogurt)",2
103,0.056634,"(whole milk, rolls/buns)",2
106,0.048907,"(root vegetables, whole milk)",2
113,0.040061,"(whole milk, soda)",2
115,0.042298,"(tropical fruit, whole milk)",2
119,0.056024,"(whole milk, yogurt)",2


### All frequent itemsets of length 2 with minimum support of 2%

In [10]:
frequent_itemsets[(frequent_itemsets['length'] == 2) & 
                  (frequent_itemsets['support'] >= 0.04)]

Unnamed: 0,support,itemsets,length
84,0.042603,"(other vegetables, rolls/buns)",2
85,0.047382,"(other vegetables, root vegetables)",2
91,0.074835,"(other vegetables, whole milk)",2
92,0.043416,"(other vegetables, yogurt)",2
103,0.056634,"(whole milk, rolls/buns)",2
106,0.048907,"(root vegetables, whole milk)",2
113,0.040061,"(whole milk, soda)",2
115,0.042298,"(tropical fruit, whole milk)",2
119,0.056024,"(whole milk, yogurt)",2


### Finding top 10 association rules with minimum support of 2%

In [11]:
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.02)

print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

                    antecedents                     consequents   support  \
0                        (beef)                    (whole milk)  0.021251   
1                  (whole milk)                          (beef)  0.021251   
2                (bottled beer)                    (whole milk)  0.020437   
3                  (whole milk)                  (bottled beer)  0.020437   
4               (bottled water)              (other vegetables)  0.024809   
..                          ...                             ...       ...   
129  (other vegetables, yogurt)                    (whole milk)  0.022267   
130        (whole milk, yogurt)              (other vegetables)  0.022267   
131          (other vegetables)            (whole milk, yogurt)  0.022267   
132                (whole milk)      (other vegetables, yogurt)  0.022267   
133                    (yogurt)  (other vegetables, whole milk)  0.022267   

     confidence      lift  
0      0.405039  1.585180  
1      0.083168  1.

### Finding top 10 association rules with minimum support of 2% and having lift more than 1
#### This means those items occurs atleast in 2% of transactions and having good likelihood of being bought together

In [12]:

rules[(rules['support'] >= 0.02) &
      (rules['lift'] > 1.0)][0:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(beef),(whole milk),0.052466,0.255516,0.021251,0.405039,1.58518,1.0,0.007845,1.251315,0.389597,0.074113,0.200841,0.244103
1,(whole milk),(beef),0.255516,0.052466,0.021251,0.083168,1.58518,1.0,0.007845,1.033487,0.495856,0.074113,0.032402,0.244103
4,(bottled water),(other vegetables),0.110524,0.193493,0.024809,0.224471,1.160101,1.0,0.003424,1.039945,0.155154,0.088857,0.038411,0.176345
5,(other vegetables),(bottled water),0.193493,0.110524,0.024809,0.128219,1.160101,1.0,0.003424,1.020297,0.171116,0.088857,0.019894,0.176345
6,(bottled water),(rolls/buns),0.110524,0.183935,0.024199,0.218951,1.190373,1.0,0.00387,1.044832,0.1798,0.089541,0.042909,0.175258
7,(rolls/buns),(bottled water),0.183935,0.110524,0.024199,0.131564,1.190373,1.0,0.00387,1.024228,0.195974,0.089541,0.023655,0.175258
8,(bottled water),(soda),0.110524,0.174377,0.028978,0.26219,1.503577,1.0,0.009705,1.119017,0.376535,0.11323,0.106359,0.214185
9,(soda),(bottled water),0.174377,0.110524,0.028978,0.166181,1.503577,1.0,0.009705,1.06675,0.405656,0.11323,0.062573,0.214185
10,(bottled water),(whole milk),0.110524,0.255516,0.034367,0.310948,1.21694,1.0,0.006126,1.080446,0.200417,0.103617,0.074456,0.222724
11,(whole milk),(bottled water),0.255516,0.110524,0.034367,0.134501,1.21694,1.0,0.006126,1.027703,0.23945,0.103617,0.026956,0.222724


### Finding items with high likelihood of being bought together

In [13]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

                    antecedents                     consequents   support  \
0                        (beef)                    (whole milk)  0.021251   
1                  (whole milk)                          (beef)  0.021251   
2               (bottled water)              (other vegetables)  0.024809   
3            (other vegetables)                 (bottled water)  0.024809   
4               (bottled water)                    (rolls/buns)  0.024199   
..                          ...                             ...       ...   
121  (other vegetables, yogurt)                    (whole milk)  0.022267   
122        (whole milk, yogurt)              (other vegetables)  0.022267   
123          (other vegetables)            (whole milk, yogurt)  0.022267   
124                (whole milk)      (other vegetables, yogurt)  0.022267   
125                    (yogurt)  (other vegetables, whole milk)  0.022267   

     confidence      lift  
0      0.405039  1.585180  
1      0.083168  1.

### With confidence 10% to 30% 

### 11% of people bought bottled water
### 19.3% of people bought other vegetables
### Around 2.5% of customers buy both bottled water and other vegetables in the same transaction

In [16]:
rules[(rules['confidence']>=0.10) & (rules['confidence']<=0.30)][0:1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2,(bottled water),(other vegetables),0.110524,0.193493,0.024809,0.224471,1.160101,1.0,0.003424,1.039945,0.155154,0.088857,0.038411,0.176345
