In [7]:
'''
What Is Association Rule Learning?
It identifies patterns like: “If a customer buys bread, they’re likely to buy butter.”

Common algorithms: Apriori, FP-Growth

Key metrics: Support, Confidence, Lift
'''
'''
Association Rule Learning:
Discovering relationships and dependencies between variables in large datasets.
Market Basket Analysis: Identifying frequently co-occurring items in transactional data 
                        (e.g., "customers who buy bread also tend to buy milk").
Recommendation Systems: Suggesting items or content to users based on their past behavior 
                        and the preferences of similar users.
'''

'\nAssociation Rule Learning:\nDiscovering relationships and dependencies between variables in large datasets.\nMarket Basket Analysis: Identifying frequently co-occurring items in transactional data \n                        (e.g., "customers who buy bread also tend to buy milk").\nRecommendation Systems: Suggesting items or content to users based on their past behavior \n                        and the preferences of similar users.\n'

In [1]:
# Step 1: Install required libraries
!pip install mlxtend

Defaulting to user installation because normal site-packages is not writeable
Collecting mlxtend
  Obtaining dependency information for mlxtend from https://files.pythonhosted.org/packages/4c/43/2fc7f76c8891aef148901f1ba3dee65c1cbac00a85ae5ee0dabc2b861256/mlxtend-0.23.4-py3-none-any.whl.metadata
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.4 MB 991.0 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.4 MB 2.5 MB/s eta 0:00:01
   ----------------- ---------------------- 0.6/1.4 MB 5.4 MB/s eta 0:00:01
   ----------------------- ---------------- 0.8/1.4 MB 6.1 MB/s eta 0:00:01
   ---------------------------------------  1.4/1.4 MB 7.1 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 7.1 MB/s et

In [23]:
# Step 2: Import libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [31]:
# Step 3: Load and preprocess data
# Dataset URL -- https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset/data
# Working reference code -- https://www.kaggle.com/code/mohbabaqi/apriori-algorithm-a-simple-intuitive-guide
import os
from pathlib import Path

# Location of the groceries CSV. Set the GROCERIES_CSV environment variable to
# point at your own copy of the dataset; when it is unset, the original
# hard-coded local path is used so existing setups keep working.
DATA_PATH = Path(os.environ.get(
    'GROCERIES_CSV',
    'D:\\Downloads\\Groceries_dataset\\Groceries_dataset.csv',
))

df = pd.read_csv(DATA_PATH)
print(df.head())

   Member_number        Date   itemDescription
0           1808  21-07-2015    tropical fruit
1           2552  05-01-2015        whole milk
2           2300  19-09-2015         pip fruit
3           1187  12-12-2015  other vegetables
4           3037  01-02-2015        whole milk


In [32]:
# Build one basket per (member, date) pair: every item description recorded
# for the same member on the same date is collected into a single list.
basket = (
    df.groupby(['Member_number', 'Date'])['itemDescription']
      .apply(list)
      .rename('Items')
      .reset_index()
)

# Plain list of lists, one inner list per basket
transactions = list(basket['Items'])
print("Total transactions:", len(transactions))
print("Sample transaction:", transactions[0])

'''
We create one list of items for each unique customer-date pair.

transactions is now a list of lists, ready for encoding.
'''

Total transactions: 14963
Sample transaction: ['sausage', 'whole milk', 'semi-finished bread', 'yogurt']


'\nWe create one list of items for each unique customer-date pair.\n\ntransactions is now a list of lists, ready for encoding.\n'

In [34]:
# Step 4: One-hot encode the items
# Apriori requires a Boolean (True/False) matrix of item presence,
# so the list-of-lists is converted into that matrix here.
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
# Learn the set of distinct items and encode the transactions in one call
te_ary = te.fit_transform(transactions)
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

print(df_encoded.shape)
print(df_encoded.iloc[0].head())
'''
Explanation:

Each column is an item; each row is a transaction.

True indicates the item was bought in that transaction.

'''

(14963, 167)
Instant food products    False
UHT-milk                 False
abrasive cleaner         False
artif. sweetener         False
baby cosmetics           False
Name: 0, dtype: bool


'\nExplanation:\n\nEach column is an item; each row is a transaction.\n\nTrue indicates the item was bought in that transaction.\n\n'

In [35]:
# Step 5: Apply Apriori algorithm
# Mine the frequent itemsets from the one-hot encoded transactions
from mlxtend.frequent_patterns import apriori

# Keep only itemsets whose support is at least 1% of all transactions
frequent_itemsets = apriori(df_encoded, use_colnames=True, min_support=0.01)
# Record how many items each itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
# Show the ten itemsets with the highest support
print(frequent_itemsets.sort_values(by='support', ascending=False).iloc[:10])
'''
Explanation:

{whole milk} appears in ~15.8% of transactions.

We keep all itemsets whose support ≥1%

'''

     support            itemsets  length
62  0.157923        (whole milk)       1
40  0.122101  (other vegetables)       1
46  0.110005        (rolls/buns)       1
52  0.097106              (soda)       1
63  0.085879            (yogurt)       1
47  0.069572   (root vegetables)       1
57  0.067767    (tropical fruit)       1
5   0.060683     (bottled water)       1
49  0.060349           (sausage)       1
15  0.053131      (citrus fruit)       1


'\nExplanation:\n\n{whole milk} appears in ~15.8% of transactions.\n\nWe keep all itemsets whose support ≥1%\n\n'

In [38]:
# Step 6: Generate association rules
from mlxtend.frequent_patterns import association_rules

# Derive rules from the frequent itemsets, keeping only those whose
# confidence is at least 0.1
rules = association_rules(frequent_itemsets,
                          metric="confidence",
                          min_threshold=0.1)

# Rank rules by confidence, using lift only to break ties (both descending),
# then display the ten highest-ranked rules
rules = rules.sort_values(['confidence', 'lift'], ascending=False)
print(rules[['antecedents','consequents','support','confidence','lift']].head(10))

'''
Explanation of metrics:

Support: Fraction of transactions containing both antecedent and consequent.

Confidence: P(consequent | antecedent). E.g., 12.15% of baskets containing other vegetables also contain whole milk.

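Lift: How much more likely the co-occurrence is than if the items were bought independently. Lift > 1 means a positive association; lift < 1 (as for every rule found here) means the items appear together less often than independence would predict.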

'''

          antecedents   consequents   support  confidence      lift
3            (yogurt)  (whole milk)  0.011161    0.129961  0.822940
1        (rolls/buns)  (whole milk)  0.013968    0.126974  0.804028
0  (other vegetables)  (whole milk)  0.014837    0.121511  0.769430
2              (soda)  (whole milk)  0.011629    0.119752  0.758296


'\nExplanation of metrics:\n\nSupport: Fraction of transactions containing both antecedent and consequent.\n\nConfidence: P(consequent | antecedent). E.g.,14.8% of baskets with other vegetables also have whole milk.\n\nLift: How much more likely the co-occurrence is versus random. Lift >1 means positive association.\n\n'