# -------------- Association Rules ----------------

In [29]:
# Import necessary libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import numpy as np

## --- 1. DATA LOADING ---

In [30]:
df=pd.read_excel('Online retail.xlsx',sheet_name='Sheet1',header=None)
df.head()

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


## --- 2. DATA PREPROCESSING (Cleaning and Formatting) ---

In [31]:
df.shape

(7501, 1)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7501 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


In [33]:
# Extract the single column (which contains the entire transaction string)
transactions_series = df.iloc[:, 0].astype(str)

In [34]:
# Convert each transaction string into a list of individual items
# 1. Strip quotes and leading/trailing spaces
# 2. Split the string by the comma (',')
# 3. Filter out any empty strings that may result from extra spaces
transactions = []
for transaction_str in transactions_series:
    # Handle both double quotes and standard CSV formatting
    cleaned_str = transaction_str.strip('\"').strip()
    items = [item.strip() for item in cleaned_str.split(',') if item.strip()]
    if items:
        transactions.append(items)

print("First 3 Parsed Transactions:")
print(transactions[:3])

First 3 Parsed Transactions:
[['shrimp', 'almonds', 'avocado', 'vegetables mix', 'green grapes', 'whole weat flour', 'yams', 'cottage cheese', 'energy drink', 'tomato juice', 'low fat yogurt', 'green tea', 'honey', 'salad', 'mineral water', 'salmon', 'antioxydant juice', 'frozen smoothie', 'spinach', 'olive oil'], ['burgers', 'meatballs', 'eggs'], ['chutney']]


In [35]:
## Convert the list of lists (transactions) into the required DataFrame format (One-Hot Encoded)
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
basket_sets = pd.DataFrame(te_ary, columns=te.columns_)

print("\nOne-Hot Encoded Basket Head:")
print(basket_sets.head())


One-Hot Encoded Basket Head:
   almonds  antioxydant juice  asparagus  avocado  babies food  bacon  \
0     True               True      False     True        False  False   
1    False              False      False    False        False  False   
2    False              False      False    False        False  False   
3    False              False      False     True        False  False   
4    False              False      False    False        False  False   

   barbecue sauce  black tea  blueberries  body spray  ...  turkey  \
0           False      False        False       False  ...   False   
1           False      False        False       False  ...   False   
2           False      False        False       False  ...   False   
3           False      False        False       False  ...    True   
4           False      False        False       False  ...   False   

   vegetables mix  water spray  white wine  whole weat flour  \
0            True        False       False    

## --- 3. Association Rule Mining (Apriori) ---

In [36]:
## 3.1 Run Apriori Algorithm to Find Frequent Itemsets
# Start with a suitable minimum support (e.g., 0.01 or 1% of transactions)
min_support = 0.01
frequent_itemsets = apriori(basket_sets, min_support=min_support, use_colnames=True)

print(f"\nFrequent Itemsets (min_support={min_support}, Top 10):")
print(frequent_itemsets.head(10))


Frequent Itemsets (min_support=0.01, Top 10):
    support          itemsets
0  0.020397         (almonds)
1  0.033329         (avocado)
2  0.010799  (barbecue sauce)
3  0.014265       (black tea)
4  0.011465      (body spray)
5  0.033729        (brownies)
6  0.087188         (burgers)
7  0.030129          (butter)
8  0.081056            (cake)
9  0.015331         (carrots)


In [38]:
## 3.2 Generate Association Rules
# Generate rules using lift as the primary metric, requiring it to be > 1.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

## --- 4. Analysis and Interpretation ---

In [39]:
## 4.1 Set Appropriate Thresholds for Meaningful Rules

# Filter for rules that are strong (high confidence) AND interesting (high lift)
meaningful_rules = rules[ (rules['lift'] >= 1.2) &
                          (rules['confidence'] >= 0.6) ]

# Sort the rules by Lift (descending) to find the strongest relationships.
meaningful_rules = meaningful_rules.sort_values(by='lift', ascending=False).reset_index(drop=True)

print("\nMeaningful Rules (Lift >= 1.2, Confidence >= 0.6):")
print(meaningful_rules.head(10))



Meaningful Rules (Lift >= 1.2, Confidence >= 0.6):
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [40]:
## 4.2 Interpretation and Insights

if not meaningful_rules.empty:
    top_rule = meaningful_rules.iloc[0]
    antecedents = set(top_rule['antecedents'])
    consequents = set(top_rule['consequents'])
    confidence = top_rule['confidence'] * 100
    lift = top_rule['lift']

    print(f"\n--- Customer Purchasing Insight (Top Rule) ---")
    print(f"Rule: {antecedents} -> {consequents}")
    print(f"Confidence: {confidence:.2f}% | Lift: {lift:.2f}")

    print(f"\nInsight: Customers who purchase {antecedents} have a **{confidence:.2f}%** chance of also buying {consequents}. The **Lift of {lift:.2f}** shows this co-occurrence is **{lift:.2f} times** more frequent than pure chance, indicating a strong complementary or habitual purchase pattern. This insight is useful for optimizing store layout or creating bundled promotions.")
else:
    print("\nNo rules met the stringent thresholds (Lift >= 1.2 and Confidence >= 0.6). Try lowering the minimum support or minimum confidence to extract more rules.")



No rules met the stringent thresholds (Lift >= 1.2 and Confidence >= 0.6). Try lowering the minimum support or minimum confidence to extract more rules.


# Interview Questions

## 1. What is Lift and why is it important in Association rules?
Lift measures how much more likely the antecedent (item A) and consequent (item B) are to be purchased together than would be expected if they were independent purchases.

Lift= 
Support (B)
Confidence (A→B)
​
 = 
P(A)×P(B)
P(A∩B)
​
 
**Importance of Lift:**
**Identifies True Relationships:** Lift helps differentiate between truly related items and items that are simply popular. A rule might have high support and confidence just because the consequent (B) is bought frequently by everyone, regardless of whether the antecedent (A) is present.

**Measures Strength:**

Lift = 1: The occurrence of A and B is independent. The rule is no better than chance.

Lift > 1: A and B are positively correlated (bought together more often than expected). This indicates a useful rule for cross-selling.

Lift < 1: A and B are negatively correlated (one discourages the purchase of the other).



## 2. What are Support and Confidence? How do you calculate them?
Support
Support is an indication of how frequently the itemset appears in the dataset. It is the proportion of total transactions that contain the specified itemset.

**Calculation:**

Support (A)= 
Total number of transactions
Number of transactions containing A
​
 
$$$$For an itemset {A,B}:

Support (A∪B)= 
Total number of transactions
Number of transactions containing both A and B
​
 
$$$$

**Role:** Used in the Apriori algorithm to filter out infrequent itemsets early on, saving computation time.

**Confidence**
Confidence is a measure of the reliability of the rule. It is the conditional probability that the consequent (B) will be purchased given that the antecedent (A) has been purchased.

**Calculation:**

Confidence (A→B)= 
Support (A)
Support (A∪B)
​
 
$$$$

**Role:** Indicates the strength of the association in one direction (A→B). A high confidence suggests that the rule is likely to hold true in future transactions.

## 3. What are some limitations or challenges of Association rules mining?
The primary limitations and challenges of Association Rule Mining, particularly with the Apriori algorithm, fall into three main categories: scalability, threshold sensitivity, and data restriction. The biggest hurdle is computational scalability, known as the combinatorial explosion problem. As the number of unique items (N) in a dataset increases (common in large retail), the number of potential itemsets grows exponentially (2 
N
 ). This makes finding all frequent itemsets extremely time- and memory-intensive. Second, the quality of the rules is highly sensitive to the arbitrary selection of thresholds for minimum Support and Confidence. A threshold that is set too low generates a massive number of rules, many of which are trivial or obvious (e.g., "If someone buys a steering wheel, they buy a car"), burying the useful insights. Conversely, a threshold set too high might filter out rare but potentially highly profitable relationships. Finally, the technique is fundamentally designed for binary data (an item is present or absent in a transaction). Handling quantitative information (e.g., the specific quantity or price of an item) requires additional, complex preprocessing steps like discretization, which can lead to a loss of valuable data and context.