In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults for the notebook. The style sheet is applied first so the
# explicit overrides below are not reset by it.
# NOTE(review): seaborn's 'dark' palette on a dark background may be low-contrast — confirm.
plt.style.use('dark_background')
plt.rcParams.update({
    'axes.prop_cycle': plt.cycler(color = sns.color_palette('dark')),
    'figure.figsize': (20 , 20),
})

In [5]:
# Load the raw grocery baskets from CSV and preview the first rows.
# Per the preview: one row per basket, an "Item(s)" count followed by "Item N" columns.
df = pd.read_csv(filepath_or_buffer = './data/groceries.main.csv')
df.head(n = 5)

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


In [7]:
# Build transactions: one list of item names per basket.
# The first column holds the basket's item count, so only the columns after it are
# used; NaN padding in the unused item slots is dropped.
item_columns = df.iloc[: , 1:]
transactions = [
    [item for item in basket_row if pd.notna(item)]
    # plain tuples avoid constructing a Series per row (as iterrows does)
    for basket_row in item_columns.itertuples(index = False , name = None)
]
# A set comprehension counts distinct items in a single pass; the previous
# sum(transactions, []) re-copied the growing list for every basket (quadratic).
unique_items = {item for basket in transactions for item in basket}
print("Example basket 0:", transactions[0])
print("Total baskets  :", len(transactions))
print(f"{len(transactions)} loaded with {len(unique_items)} unique items.")

Example basket 0: ['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups']
Total baskets  : 9835
9835 loaded with 169 unique items.


In [10]:
# Number of items in each basket, then summary statistics of that distribution.
bsizes = list(map(len , transactions))
pd.Series(data = bsizes).describe()

count    9835.000000
mean        4.409456
std         3.589385
min         1.000000
25%         2.000000
50%         3.000000
75%         6.000000
max        32.000000
dtype: float64

In [14]:
# association-rule mining tools (basket encoder, Apriori, rule generation):
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori , association_rules

In [18]:
# One-hot encode the baskets: one row per basket, one boolean column per distinct item.
te = TransactionEncoder()
imat = te.fit(transactions).transform(transactions)
# te.columns_ holds the item names in the same order as the columns of imat.
idf = pd.DataFrame(data = imat , columns = te.columns_)
idf.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [26]:
# Item support: share of baskets that contain each item, most frequent first.
# The mean of a boolean column equals count(True) / number of rows. This replaces
# np.sum(idf), which forwards axis=None to DataFrame.sum — deprecated in recent
# pandas and slated to return a single scalar instead of per-column totals.
idf.mean().sort_values(ascending = False).round(3)

whole milk               0.256
other vegetables         0.193
rolls/buns               0.184
soda                     0.174
yogurt                   0.140
                         ...  
bags                     0.000
kitchen utensil          0.000
preservation products    0.000
baby food                0.000
sound storage medium     0.000
Length: 169, dtype: float64

In [32]:
# Mine frequent itemsets with Apriori. Thresholds are named so the printed
# summary below can never disagree with the value actually used.
MIN_SUPPORT = 0.02    # minimum share of baskets an itemset must appear in
MAX_ITEMSET_LEN = 3   # do not search for itemsets larger than this
fsets = apriori(idf , min_support = MIN_SUPPORT , use_colnames = True , max_len = MAX_ITEMSET_LEN , verbose = 1)
# Record how many items each itemset contains (handy for filtering by size later).
fsets['length'] = fsets['itemsets'].map(len)
print(f"{len(fsets)} frequent itemsets >= {MIN_SUPPORT * 100:g}% support")
fsets

Processing 630 combinations | Sampling itemset size 32
122 frequent itemsets >= 2% support


Unnamed: 0,support,itemsets,length
0,0.033452,(UHT-milk),1
1,0.052466,(beef),1
2,0.033249,(berries),1
3,0.026029,(beverages),1
4,0.080529,(bottled beer),1
...,...,...,...
117,0.032232,"(whipped/sour cream, whole milk)",2
118,0.020742,"(whipped/sour cream, yogurt)",2
119,0.056024,"(yogurt, whole milk)",2
120,0.023183,"(root vegetables, whole milk, other vegetables)",3


In [35]:
# High-confidence association rules: keep rules with confidence >= 0.4,
# then list them from most to least confident.
rulec = association_rules(fsets , metric = 'confidence' , min_threshold = 0.4)
rulec = rulec.sort_values(by = 'confidence' , ascending = False)
rulec

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
14,"(yogurt, other vegetables)",(whole milk),0.043416,0.255516,0.022267,0.512881,2.007235,1.0,0.011174,1.52834,0.524577,0.080485,0.345695,0.300014
1,(butter),(whole milk),0.055414,0.255516,0.027555,0.497248,1.946053,1.0,0.013395,1.480817,0.514659,0.097237,0.324697,0.302543
2,(curd),(whole milk),0.053279,0.255516,0.026131,0.490458,1.919481,1.0,0.012517,1.461085,0.505984,0.092446,0.315577,0.296363
13,"(root vegetables, other vegetables)",(whole milk),0.047382,0.255516,0.023183,0.48927,1.914833,1.0,0.011076,1.457687,0.501524,0.082879,0.313982,0.289999
12,"(root vegetables, whole milk)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,1.0,0.013719,1.53332,0.62223,0.105751,0.347821,0.296912
3,(domestic eggs),(whole milk),0.063447,0.255516,0.029995,0.472756,1.850203,1.0,0.013783,1.41203,0.490649,0.1038,0.2918,0.295073
10,(whipped/sour cream),(whole milk),0.071683,0.255516,0.032232,0.449645,1.759754,1.0,0.013916,1.352735,0.465077,0.109273,0.260757,0.287895
8,(root vegetables),(whole milk),0.108998,0.255516,0.048907,0.448694,1.756031,1.0,0.021056,1.350401,0.483202,0.154961,0.259479,0.320049
6,(root vegetables),(other vegetables),0.108998,0.193493,0.047382,0.434701,2.246605,1.0,0.026291,1.426693,0.622764,0.185731,0.299078,0.339789
4,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,1.0,0.008149,1.294636,0.418855,0.072172,0.227582,0.252466


In [36]:
# High-lift association rules: keep rules with lift >= 1.5,
# then list them from highest to lowest lift.
rulel = association_rules(fsets , metric = 'lift' , min_threshold = 1.5)
rulel = rulel.sort_values(by = 'lift' , ascending = False)
rulel

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
67,(root vegetables),"(whole milk, other vegetables)",0.108998,0.074835,0.023183,0.212687,2.842082,1.0,0.015026,1.175091,0.727435,0.144304,0.149002,0.261235
66,"(whole milk, other vegetables)",(root vegetables),0.074835,0.108998,0.023183,0.309783,2.842082,1.0,0.015026,1.290900,0.700572,0.144304,0.225347,0.261235
41,(tropical fruit),(pip fruit),0.104931,0.075648,0.020437,0.194767,2.574648,1.0,0.012499,1.147931,0.683297,0.127619,0.128868,0.232464
40,(pip fruit),(tropical fruit),0.075648,0.104931,0.020437,0.270161,2.574648,1.0,0.012499,1.226392,0.661650,0.127619,0.184600,0.232464
64,"(root vegetables, whole milk)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.449770,1.0,0.013719,1.533320,0.622230,0.105751,0.347821,0.296912
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,(pork),(whole milk),0.057651,0.255516,0.022166,0.384480,1.504719,1.0,0.007435,1.209520,0.355945,0.076171,0.173226,0.235614
23,(fruit/vegetable juice),(other vegetables),0.072293,0.193493,0.021047,0.291139,1.504653,1.0,0.007059,1.137751,0.361531,0.085999,0.121073,0.199957
22,(other vegetables),(fruit/vegetable juice),0.193493,0.072293,0.021047,0.108776,1.504653,1.0,0.007059,1.040936,0.415861,0.085999,0.039326,0.199957
3,(soda),(bottled water),0.174377,0.110524,0.028978,0.166181,1.503577,1.0,0.009705,1.066750,0.405656,0.113230,0.062573,0.214185


In [38]:
# Business-ready shortlist (up to 5 rules): a rule must satisfy all conditions jointly —
# confidence >= 0.40, support >= 0.03 and lift >= 1.6 — ranked by confidence, then lift.
candidateRules = association_rules(fsets , metric = 'confidence' , min_threshold = 0.40)
has_enough_support = candidateRules['support'] >= 0.03
has_enough_lift = candidateRules['lift'] >= 1.6
iitems = candidateRules.loc[has_enough_support & has_enough_lift].sort_values(by = ['confidence' , 'lift'] , ascending = False)
report_columns = ['antecedents' , 'consequents' , 'support' , 'confidence' , 'lift']
iitems[report_columns].head().reset_index(drop = True)

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(whipped/sour cream),(whole milk),0.032232,0.449645,1.759754
1,(root vegetables),(whole milk),0.048907,0.448694,1.756031
2,(root vegetables),(other vegetables),0.047382,0.434701,2.246605
