In [1]:
import pandas as pd
file_path = "D:\\Data Science\\assignments\\Online retail.xlsx"
df = pd.read_excel(file_path, sheet_name=0)
df

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"


In [2]:
print("Shape:", df.shape)

Shape: (7500, 1)


In [3]:
print("Columns:", df.columns.tolist())

Columns: ['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil']


In [4]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                           Non-Null Count  Dtype 
---  ------                                                                                                                                                                                                                           --------------  ----- 
 0   shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil  7500 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


None

In [5]:
# Data Preprocessing:

# Step 1: Rename column
df.columns = ['Transactions']

In [6]:
# Step 2: Convert transaction strings to lists of items 
transactions = df['Transactions'].apply(lambda x: [item.strip() for item in x.split(',')])
transactions

0                              [burgers, meatballs, eggs]
1                                               [chutney]
2                                       [turkey, avocado]
3       [mineral water, milk, energy bar, whole wheat ...
4                                        [low fat yogurt]
                              ...                        
7495                    [butter, light mayo, fresh bread]
7496    [burgers, frozen vegetables, eggs, french frie...
7497                                            [chicken]
7498                                [escalope, green tea]
7499    [eggs, frozen smoothie, yogurt cake, low fat y...
Name: Transactions, Length: 7500, dtype: object

In [7]:
# Step 3: Remove duplicates (if any)
transactions = transactions.drop_duplicates().tolist()

In [11]:
# Step 4: One-hot encode the data
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

In [12]:
df_encoded.head()

Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Association Rule Mining:

from mlxtend.frequent_patterns import apriori, association_rules

# Apply Apriori algorithm
# Set a minimum support threshold (e.g., 0.03 = item appears in at least 3% of transactions)
frequent_itemsets = apriori(df_encoded, min_support=0.03, use_colnames=True)

# Sort by support
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print("Frequent Itemsets (Top 10):")
print(frequent_itemsets.head(10))

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Sort rules by lift and confidence
rules = rules.sort_values(by=["lift", "confidence"], ascending=[False, False])
print("\nAssociation Rules (Top 10):")
print(rules.head(10))


Frequent Itemsets (Top 10):
     support             itemsets
32  0.299710      (mineral water)
41  0.229565          (spaghetti)
12  0.208116               (eggs)
8   0.205217          (chocolate)
16  0.192657       (french fries)
31  0.170048               (milk)
23  0.169082          (green tea)
24  0.135845        (ground beef)
21  0.129855  (frozen vegetables)
35  0.125217           (pancakes)

Association Rules (Top 10):
            antecedents          consequents  antecedent support  \
11        (ground beef)          (spaghetti)            0.135845   
10          (spaghetti)        (ground beef)            0.229565   
65          (olive oil)          (spaghetti)            0.087536   
64          (spaghetti)          (olive oil)            0.229565   
53               (soup)      (mineral water)            0.070918   
52      (mineral water)               (soup)            0.299710   
49  (frozen vegetables)               (milk)            0.129855   
48               (milk)  

In [None]:
"""
Analysis and Interpretation:

1. Overview

-> Using the Apriori algorithm, we mined association rules from the Online Retail dataset to find relationships among 
   frequently purchased items.
-> The analysis helps uncover patterns in customer purchase behavior — such as which items are often bought together — 
   useful for marketing, product placement, and recommendation systems.

2. Observations from Frequent Itemsets

     ITEMSET                        SUPPORT                         INTERPRETATION
(mineral water)     	            0.2997          	Most frequently purchased item; a staple produ
(spaghetti)                         0.2296              Popular grocery item, often paired with cooking ingredients
(eggs),(chocolate),(milk)          0.17–0.21            Common daily-use items appearing frequently in baskets
(ground beef),(frozen vegetables)  0.13–0.14            Typically used together in meal preparation

Insight:
Basic household items like mineral water, milk, and eggs dominate purchases, while meal-prep items 
(spaghetti, beef, frozen vegetables) suggest planned cooking.

3. Observations from Association Rules

ANTECEDENT → CONSEQUENT	            CONFIDENCE     LIFT	                INTERPRETATION
(ground beef) → (spaghetti)           0.411        1.79    Customers buying beef are 1.79× more likely to buy spaghetti — 
                                                           classic pasta pairing
(olive oil) → (spaghetti)             0.37         1.61    Italian cuisine relationship — olive oil often used in pasta 
                                                           dishes
(frozen vegetables) → (milk)          0.26         1.54    Indicates healthy meal shoppers also buying essentials
(soup) → (mineral water)              0.47         1.57    Suggests ready-meal buyers often purchase water alongside
(eggs) → (burgers)                    0.31         1.52    Customers may be planning quick, protein-rich meals

Insight:
High lift (>1) across rules shows strong positive correlations. These patterns can guide:
-> Product bundling: (Spaghetti + Olive oil + Ground beef)
-> Store placement: Items that appear together should be nearby.
-> Cross-selling: Recommend "spaghetti" to buyers of "olive oil."

4. Customer Behavior Insights

-> Customers tend to buy in themes — e.g., ingredients for a specific meal rather than random grocery items.

-> Health-conscious shoppers often buy frozen vegetables, milk, and mineral water.

-> Convenience-focused buyers choose ready foods like soup, burgers, and water.

5. Key Learnings

-> Association Rule Mining reveals hidden co-purchase patterns.

-> Lift helps identify meaningful relationships beyond random chance.

-> The discovered rules can directly improve:

    -> Product recommendations

    -> Inventory management

    -> Promotional offers

"""

In [None]:
"""

Interview Questions:

1.	What is lift and why is it important in Association rules?

-> Lift measures how much more likely two items are purchased together compared to if they were independent.

-> It shows the strength of association between products.

-> If lift = 1 - No association (A and B are independent)
   If lift > 1 - Positive association (A and B are often bought together)
   If lift < 1 - Negative association (A and B rarely bought together)

-> Lift is important in Association rules, because it helps identify truly meaningful relationships — not just frequent 
   co-occurrences.

2.	What is support and Confidence. How do you calculate them?

Support:

-> Support shows how frequently an item or itemset occurs in all transactions.

Support(A) = Number of transactions containing A / Total transactions

Example:
If “milk” appears in 30 out of 100 baskets → Support = 0.30 (30%).

Confidence:

-> Confidence indicates how often items in B appear in transactions that contain A.

Confidence(A → B) = Support(A ∩ B) / Support(A)

Example:
If 20 of those 30 "milk" baskets also include "bread",
→ Confidence = 20/30 = 0.67 (67%).

Why important:
-> Support filters out rare rules.
-> Confidence measures rule reliability.


3.	What are some limitations or challenges of Association rules mining?

a. Large number of rules:

  -> Even small datasets can generate thousands of rules — many of which are redundant or uninteresting.

b. Threshold sensitivity:

  -> Results depend heavily on chosen support, confidence, and lift thresholds.

  -> Too high -> miss interesting rules.

  -> Too low -> get noisy, irrelevant rules.

c. Does not capture causation:

  -> Just because two items appear together doesn’t mean one causes the other.

d. Scalability:

  -> Apriori can be computationally expensive for large datasets due to multiple database scans.

e. Lack of temporal context:

  -> Doesn't consider when items were bought — only that they co-occurred.

"""