In [6]:
import pandas as pd

df = pd.read_excel('Online retail.xlsx', header=None)

df.columns = ['Items'] # Column rename
df.dropna(inplace=True) # # Drop blank transactions
transactions = df['Items'].apply(lambda x: x.strip().split(',')) # Convert comma-separated items into list
transactions.head()

Unnamed: 0,Items
0,"[shrimp, almonds, avocado, vegetables mix, gre..."
1,"[burgers, meatballs, eggs]"
2,[chutney]
3,"[turkey, avocado]"
4,"[mineral water, milk, energy bar, whole wheat ..."


# **Transaction Encoding (One-Hot)**

In [7]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions) # Fit & transform the transaction data
df_encoded = pd.DataFrame(te_array, columns=te.columns_) # Convert to DataFrame
df_encoded.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


# **Find Frequent Itemsets**

In [9]:
from mlxtend.frequent_patterns import apriori

frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True) # Apply Apriori algorithm to find frequent itemsets
frequent_itemsets.sort_values(by='support', ascending=False).head(10) # Sort by support descending

Unnamed: 0,support,itemsets
46,0.238368,(mineral water)
19,0.179709,(eggs)
63,0.17411,(spaghetti)
24,0.170911,(french fries)
13,0.163845,(chocolate)
32,0.132116,(green tea)
45,0.129583,(milk)
33,0.098254,(ground beef)
30,0.095321,(frozen vegetables)
53,0.095054,(pancakes)


# **Generate Association Rules**

In [12]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0) # Generate rules using confidence & lift
rules.sort_values(by='lift', ascending=False).head(10) # Sort rules by lift

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
214,(herb & pepper),(ground beef),0.04946,0.098254,0.015998,0.32345,3.291994,1.0,0.011138,1.33286,0.73246,0.121457,0.249734,0.243136
215,(ground beef),(herb & pepper),0.098254,0.04946,0.015998,0.162822,3.291994,1.0,0.011138,1.13541,0.772094,0.121457,0.119261,0.243136
385,(ground beef),"(spaghetti, mineral water)",0.098254,0.059725,0.017064,0.173677,2.907928,1.0,0.011196,1.137902,0.727602,0.121097,0.12119,0.229696
384,"(spaghetti, mineral water)",(ground beef),0.059725,0.098254,0.017064,0.285714,2.907928,1.0,0.011196,1.262445,0.697788,0.121097,0.207886,0.229696
397,(olive oil),"(spaghetti, mineral water)",0.065858,0.059725,0.010265,0.15587,2.609786,1.0,0.006332,1.113898,0.660314,0.089017,0.102252,0.163873
396,"(spaghetti, mineral water)",(olive oil),0.059725,0.065858,0.010265,0.171875,2.609786,1.0,0.006332,1.128021,0.656007,0.089017,0.113491,0.163873
193,(tomatoes),(frozen vegetables),0.068391,0.095321,0.016131,0.235867,2.474464,1.0,0.009612,1.18393,0.639616,0.109304,0.155355,0.202549
192,(frozen vegetables),(tomatoes),0.095321,0.068391,0.016131,0.169231,2.474464,1.0,0.009612,1.121381,0.658656,0.109304,0.108243,0.202549
188,(shrimp),(frozen vegetables),0.071457,0.095321,0.016664,0.233209,2.446574,1.0,0.009853,1.179825,0.636767,0.111012,0.152417,0.204017
189,(frozen vegetables),(shrimp),0.095321,0.071457,0.016664,0.174825,2.446574,1.0,0.009853,1.125268,0.653563,0.111012,0.111323,0.204017


# **Analyze Top Rules**

In [11]:
# Display key columns
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(
    by='lift', ascending=False
).head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
214,(herb & pepper),(ground beef),0.015998,0.32345,3.291994
215,(ground beef),(herb & pepper),0.015998,0.162822,3.291994
385,(ground beef),"(spaghetti, mineral water)",0.017064,0.173677,2.907928
384,"(spaghetti, mineral water)",(ground beef),0.017064,0.285714,2.907928
397,(olive oil),"(spaghetti, mineral water)",0.010265,0.15587,2.609786
396,"(spaghetti, mineral water)",(olive oil),0.010265,0.171875,2.609786
193,(tomatoes),(frozen vegetables),0.016131,0.235867,2.474464
192,(frozen vegetables),(tomatoes),0.016131,0.169231,2.474464
188,(shrimp),(frozen vegetables),0.016664,0.233209,2.446574
189,(frozen vegetables),(shrimp),0.016664,0.174825,2.446574


## **Interview Questions: Association Rule Mining**


### 1. What is Lift and Why is it Important in Association Rules?

**Lift** measures how much more likely items A and B are bought together than if they were independent.

- **Lift > 1** → Positive association  
- **Lift = 1** → No association  
- **Lift < 1** → Negative association

> It's important because it helps us identify rules that are not just frequent but truly **interesting and non-random**.



### 2. What is Support and Confidence? How do You Calculate Them?

**Support**:

\[
\text{support}(A → B) = \frac{\text{Transactions containing both A and B}}{\text{Total transactions}}
\]

**Confidence**:

\[
\text{confidence}(A → B) = \frac{\text{Transactions containing both A and B}}{\text{Transactions containing A}}
\]

- **Support** shows how frequent an itemset occurs in the dataset.
- **Confidence** indicates the likelihood of buying B given that A is bought.
italicized text

### 3. What are Some Limitations or Challenges of Association Rule Mining?

- Generates **too many rules**, making it difficult to filter meaningful ones.
- Requires **manual tuning** for support, confidence, and lift thresholds.
- Doesn’t consider the **order or timing** of purchases.
- Can be **computationally expensive** on large datasets.
- **Ignores contextual factors** like customer demographics, time, and seasonality.
