In [45]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prettytable import PrettyTable
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth

In [5]:
data = pd.read_csv("Market_Basket_Optimisation.csv")
print(data.shape)
data.head()

(7500, 20)


Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


In [24]:
data = [[item for item in row if item is not np.nan][:-1] for row in data.values]

In [30]:
# The following instructions transform the dataset into the required format 
trans_encoder = TransactionEncoder() # Instanciate the encoder
trans_encoder_matrix = trans_encoder.fit_transform(data)
trans_encoder_matrix = pd.DataFrame(trans_encoder_matrix, columns=trans_encoder.columns_)

In [31]:
trans_encoder_matrix.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
## Apriori

min_support = 0.01
start_time = time.time()
rule_items = apriori(trans_encoder_matrix, 
               min_support=min_support, 
               use_colnames=True)
total_execution = time.time() - start_time
print(f"Computed Apriori, in time {total_execution} s")
rule_items

Computed Apriori, in time 0.2642984390258789 s


Unnamed: 0,support,itemsets
0,0.020267,(almonds)
1,0.033200,(avocado)
2,0.010800,(barbecue sauce)
3,0.014267,(black tea)
4,0.011467,(body spray)
...,...,...
254,0.011067,"(ground beef, milk, mineral water)"
255,0.017067,"(ground beef, mineral water, spaghetti)"
256,0.015733,"(milk, mineral water, spaghetti)"
257,0.010267,"(spaghetti, mineral water, olive oil)"


In [43]:
## FP growth
start_time = time.time()
rule_items = fpgrowth(trans_encoder_matrix, 
               min_support=min_support, 
               use_colnames=True)
total_execution = time.time() - start_time
print(f"Computed Fp Growth, in time {total_execution} s")
rule_items

Computed Fp Growth, in time 0.17557191848754883 s


Unnamed: 0,support,itemsets
0,0.179733,(eggs)
1,0.087200,(burgers)
2,0.020933,(meatballs)
3,0.062533,(turkey)
4,0.033200,(avocado)
...,...,...
254,0.014133,"(ground beef, olive oil)"
255,0.011333,"(frozen vegetables, olive oil)"
256,0.012000,"(eggs, olive oil)"
257,0.010800,"(pancakes, olive oil)"


In [49]:
rules = association_rules(rule_items, 
                          metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(eggs),(mineral water),0.179733,0.238267,0.050933,0.283383,1.189351,0.008109,1.062957,0.194090
1,(mineral water),(eggs),0.238267,0.179733,0.050933,0.213766,1.189351,0.008109,1.043286,0.209004
2,(eggs),(burgers),0.179733,0.087200,0.028800,0.160237,1.837585,0.013127,1.086974,0.555682
3,(burgers),(eggs),0.087200,0.179733,0.028800,0.330275,1.837585,0.013127,1.224782,0.499351
4,(mineral water),(burgers),0.238267,0.087200,0.024400,0.102406,1.174384,0.003623,1.016941,0.194936
...,...,...,...,...,...,...,...,...,...,...
403,"(spaghetti, olive oil)",(mineral water),0.022933,0.238267,0.010267,0.447674,1.878880,0.004802,1.379138,0.478747
404,"(mineral water, olive oil)",(spaghetti),0.027467,0.174133,0.010267,0.373786,2.146553,0.005484,1.318826,0.549222
405,(spaghetti),"(mineral water, olive oil)",0.174133,0.027467,0.010267,0.058959,2.146553,0.005484,1.033465,0.646759
406,(mineral water),"(spaghetti, olive oil)",0.238267,0.022933,0.010267,0.043089,1.878880,0.004802,1.021063,0.614084


In [74]:
df = pd.DataFrame([[1, 2, pd.NA, pd.NA], 
                   [1, 3, pd.NA, pd.NA], 
                   [2, 3, 4, 5],
                   [1, 4, 5, pd.NA]], columns=['item1','item2','item3','item4'])
df.head()

Unnamed: 0,item1,item2,item3,item4
0,1,2,,
1,1,3,,
2,2,3,4.0,5.0
3,1,4,5.0,


In [75]:
#dataset = [[int(item) for item in row if not np.isnan(item)] for row in df.values]
dataset = [[item for item in row if item is not pd.NA] for row in df.values]
dataset

[[1, 2], [1, 3], [2, 3, 4, 5], [1, 4, 5]]

In [64]:
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df, min_support=0.5, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.75,(1)
1,0.75,(2)
2,0.5,(3)
3,0.5,(4)
4,0.5,(5)
5,0.5,"(1, 2)"
6,0.5,"(2, 3)"
7,0.5,"(4, 5)"
