In [1]:
# import necessary libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Read the bakery transaction log; the file is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="breadbasket.csv", sep=",")
# Bare last expression -> rich (truncated) preview of the loaded frame.
df

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend
...,...,...,...,...,...
20502,9682,Coffee,09-04-2017 14:32,afternoon,weekend
20503,9682,Tea,09-04-2017 14:32,afternoon,weekend
20504,9683,Coffee,09-04-2017 14:57,afternoon,weekend
20505,9683,Pastry,09-04-2017 14:57,afternoon,weekend


In [3]:
# data preprocessing
# Built as one non-mutating chain so the cell can be re-run safely: the
# original assigned a column on `df`, which on a second run is a filtered
# slice and can trigger SettingWithCopyWarning.
df = (
    df
    # Drop missing items *before* casting: astype(str) would otherwise turn
    # NaN into the literal string "nan", which would pass the "NONE" filter
    # below and end up being mined as if it were a real product.
    .dropna(subset=["Item"])
    # Normalise labels: force string dtype and trim surrounding whitespace
    # so that e.g. "Coffee " and "Coffee" count as the same item.
    .assign(Item=lambda frame: frame["Item"].astype(str).str.strip())
    # "NONE" is a placeholder label, not a purchasable item.
    .loc[lambda frame: frame["Item"] != "NONE"]
)

In [None]:
# Sanity check after cleaning: print the (rows, columns) pair, then show the
# first five records as the cell's rich output.
print("Lines:", (len(df.index), len(df.columns)))
df.iloc[:5]

Lines: (20507, 5)


Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend


In [5]:
# prepare transactions for market basket analysis
# Build one basket (list of item labels) per transaction id.
#   * set(...)    -> an item bought several times in one transaction counts
#                    once; the encoding downstream only records presence.
#   * sorted(...) -> sets have no defined order and string hashing is
#                    randomised per Python process, so list(set(...)) could
#                    order the same basket differently on every kernel
#                    restart. Sorting makes `transactions` (and the example
#                    printed below) reproducible.
transactions = (
    df.groupby("Transaction")["Item"]
      .apply(lambda basket: sorted(set(basket)))
      .tolist()
)

print("Number of transactions:", len(transactions))
print("Example of transaction:", transactions[0])

Number of transactions: 9465
Example of transaction: ['Bread']


In [6]:
# One-hot encode the baskets: the encoder is first fitted on the
# transactions, then used to transform the same transactions into an array.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a DataFrame, using the encoder's fitted column
# labels (te.columns_) as headers.
df_hot = pd.DataFrame(data=te_ary, columns=te.columns_)
print("Shape one-hot:", df_hot.shape)

# Item popularity: column means of the encoded frame, largest first,
# limited to the ten top entries.
print(df_hot.mean().sort_values(ascending=False).iloc[:10])


Shape one-hot: (9465, 94)
Coffee           0.478394
Bread            0.327205
Tea              0.142631
Cake             0.103856
Pastry           0.086107
Sandwich         0.071844
Medialuna        0.061807
Hot chocolate    0.058320
Cookies          0.054411
Brownie          0.040042
dtype: float64


In [7]:
# Run Apriori on the one-hot frame. min_support=0.01 is the 1% support
# threshold; use_colnames=True reports itemsets by item label rather than
# by column index.
frequent_itemsets = apriori(df=df_hot, min_support=0.01, use_colnames=True)

# Report how many itemsets were found, then preview the first five.
print("Number of frequent itemsets:", frequent_itemsets.shape[0])
frequent_itemsets.head(5)


Number of frequent itemsets: 61


Unnamed: 0,support,itemsets
0,0.036344,(Alfajores)
1,0.016059,(Baguette)
2,0.327205,(Bread)
3,0.040042,(Brownie)
4,0.103856,(Cake)


In [8]:
# Derive association rules from the frequent itemsets, filtering on
# confidence with a threshold of 0.3, and order the result by lift,
# highest first.
rules = association_rules(
    df=frequent_itemsets,
    metric="confidence",
    min_threshold=0.3,
).sort_values(by="lift", ascending=False)

# Show the ten highest-lift rules, restricted to the columns of interest.
rules.loc[:, ["antecedents", "consequents", "support", "confidence", "lift"]].head(10)


Unnamed: 0,antecedents,consequents,support,confidence,lift
15,(Toast),(Coffee),0.023666,0.704403,1.472431
13,(Spanish Brunch),(Coffee),0.010882,0.598837,1.251766
7,(Medialuna),(Coffee),0.035182,0.569231,1.189878
9,(Pastry),(Coffee),0.047544,0.552147,1.154168
0,(Alfajores),(Coffee),0.019651,0.540698,1.130235
6,(Juice),(Coffee),0.020602,0.534247,1.11675
10,(Sandwich),(Coffee),0.038246,0.532353,1.112792
3,(Cake),(Coffee),0.054728,0.526958,1.101515
11,(Scone),(Coffee),0.018067,0.522936,1.093107
4,(Cookies),(Coffee),0.028209,0.518447,1.083723
