1. Import the provided groceries.csv dataset.

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# Read the dataset
with open('groceries.csv', 'r') as file:
    transactions = [line.strip().split(',') for line in file]

# Convert to one-hot encoded DataFrame
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Show the first few rows
print(df.head())


         Instant food products  UHT-milk  abrasive cleaner  artif. sweetener  \
0  True                  False     False             False             False   
1  True                  False     False             False             False   
2  True                  False     False             False             False   
3  True                  False     False             False             False   
4  True                  False     False             False             False   

   baby cosmetics  baby food   bags  baking powder  bathroom cleaner  ...  \
0           False      False  False          False             False  ...   
1           False      False  False          False             False  ...   
2           False      False  False          False             False  ...   
3           False      False  False          False             False  ...   
4           False      False  False          False             False  ...   

   turkey  vinegar  waffles  whipped/sour cream  whisky 

 2. Explore the dataset and build the frequent-item DataFrame.

In [None]:
from mlxtend.frequent_patterns import apriori

# Explore the dataset
print(f"Number of transactions: {len(df)}")
print("Top 10 most frequent items:")
print(df.sum().sort_values(ascending=False).head(10))

# Build frequent itemsets with minimum support = 0.01 
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)

# View top rows of frequent itemsets
print("\nFrequent itemsets (support > 1%):")
print(frequent_itemsets.head())


Number of transactions: 9835
Top 10 most frequent items:
                    9834
whole milk          2513
other vegetables    1903
rolls/buns          1809
soda                1715
yogurt              1372
bottled water       1087
root vegetables     1072
tropical fruit      1032
shopping bags        969
dtype: int64

Frequent itemsets (support > 1%):
    support         itemsets
0  0.999898               ()
1  0.033452       (UHT-milk)
2  0.017692  (baking powder)
3  0.052466           (beef)
4  0.033249        (berries)


3.	Apply the Apriori algorithm to find item sets with support > 8%.

In [5]:
from mlxtend.frequent_patterns import apriori

# Apply Apriori with minimum support of 8%
frequent_itemsets_08 = apriori(df, min_support=0.08, use_colnames=True)

# Display the results
print("Frequent itemsets with support > 8%:")
print(frequent_itemsets_08.sort_values(by='support', ascending=False))


Frequent itemsets with support > 8%:
     support              itemsets
0   0.999898                    ()
12  0.255516          (whole milk)
25  0.255414        (, whole milk)
4   0.193493    (other vegetables)
17  0.193391  (, other vegetables)
6   0.183935          (rolls/buns)
19  0.183833        (, rolls/buns)
10  0.174377                (soda)
23  0.174377              (, soda)
13  0.139502              (yogurt)
26  0.139400            (, yogurt)
2   0.110524       (bottled water)
15  0.110524     (, bottled water)
7   0.108998     (root vegetables)
20  0.108897   (, root vegetables)
11  0.104931      (tropical fruit)
24  0.104830    (, tropical fruit)
9   0.098526       (shopping bags)
22  0.098526     (, shopping bags)
8   0.093950             (sausage)
21  0.093849           (, sausage)
5   0.088968              (pastry)
18  0.088968            (, pastry)
3   0.082766        (citrus fruit)
16  0.082664      (, citrus fruit)
1   0.080529        (bottled beer)
14  0.080529      

4. Generate association rules using the lift metric.

In [22]:
from mlxtend.frequent_patterns import apriori, association_rules

# reduce the threshold
frequent_itemsets = apriori(df, min_support=0.03, use_colnames=True)

# Generate rules , confidence > 0.4
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)

# Drop meaningless association rules
rules = rules[(rules['antecedents'].apply(lambda x: len(x) > 0)) &
              (rules['consequents'].apply(lambda x: len(x) > 0))]

rules = rules[rules['lift'] > 1.2]

# Display rules
print("Association Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False))



Association Rules:
               antecedents           consequents   support  confidence  \
44       (root vegetables)    (other vegetables)  0.047382    0.434701   
52     (, root vegetables)    (other vegetables)  0.047280    0.434174   
54       (root vegetables)  (, other vegetables)  0.047280    0.433769   
47    (whipped/sour cream)          (whole milk)  0.032232    0.449645   
72  (, whipped/sour cream)          (whole milk)  0.032130    0.448864   
45       (root vegetables)          (whole milk)  0.048907    0.448694   
74    (whipped/sour cream)        (, whole milk)  0.032130    0.448227   
65     (, root vegetables)          (whole milk)  0.048805    0.448179   
67       (root vegetables)        (, whole milk)  0.048805    0.447761   
46        (tropical fruit)          (whole milk)  0.042298    0.403101   
69      (, tropical fruit)          (whole milk)  0.042196    0.402522   
71        (tropical fruit)        (, whole milk)  0.042196    0.402132   
48                (

 6. How many rules satisfy both lift > 4 and confidence > 0.8?

In [None]:
# Filter rules
strong_rules = rules[(rules['lift'] > 4) & (rules['confidence'] > 0.8)]

# Display the count 
print(f"Number of rules with lift > 4 and confidence > 0.8: {len(strong_rules)}")
print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])



Number of rules with lift > 4 and confidence > 0.8: 0
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []
