In [1]:
!pip install mlxtend



In [2]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Preprocessing:

In [3]:
# Importing the dataset
data = pd.read_excel('./Online Retail.xlsx')
data.head()

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


In [4]:
# make the copy of original dataset so it should not get effercted by the differnt operations on the dataset 
retail_data = data.copy()

## Spliting of dataset 

In [5]:
# Split the items in each transaction into a list format
retail_data['Transactions'] = retail_data.iloc[:, 0].apply(lambda x: x.split(','))

In [6]:
print(len(retail_data['Transactions']))

7500


## Null values 

In [7]:
retail_data.isnull().sum()
# null values are not present in our dataset 

shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil    0
Transactions                                                                                                                                                                                                                       0
dtype: int64

## Duplicates vlaues handling 

In [8]:
#since the duplicates values will not provide in values to our model we will simply remove it 
# Remove duplicates
retail_data.drop_duplicates(subset=['Transactions'], inplace=True)

## One-hot encoding 

In [9]:
from mlxtend.preprocessing import TransactionEncoder

In [10]:
# Extract the transactions column and convert it into the list format for the formation of association rule 
transactions = retail_data['Transactions'].tolist()

In [11]:
# One-hot encode the transaction data
te = TransactionEncoder()
te_data = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_data, columns=te.columns_)

In [12]:
df_encoded

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5170,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5171,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
5172,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5173,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Association Rule

In [13]:
from mlxtend.frequent_patterns import apriori, association_rules

In [14]:
# Generate frequent itemsets with a minimum support threshold
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Generate association rules with a minimum confidence threshold
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

# Filter rules by lift
rules = rules[rules['lift'] >= 1.2]

# Sort rules by confidence in descending order
rules_sorted = rules.sort_values(by='confidence', ascending=False)

rules_sorted

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
8,"(milk, soup)",(mineral water),0.021449,0.29971,0.012367,0.576577,1.923781,0.005939,1.653876,0.490716
3,"(frozen vegetables, ground beef)",(mineral water),0.024541,0.29971,0.013333,0.543307,1.812775,0.005978,1.533393,0.45964
9,"(spaghetti, soup)",(mineral water),0.020676,0.29971,0.010821,0.523364,1.746235,0.004624,1.469236,0.436362
6,"(pancakes, ground beef)",(mineral water),0.02087,0.29971,0.010821,0.518519,1.730067,0.004566,1.454448,0.430982
0,"(chocolate, chicken)",(mineral water),0.021256,0.29971,0.011014,0.518182,1.728943,0.004644,1.453432,0.430768
7,"(milk, olive oil)",(mineral water),0.024155,0.29971,0.012367,0.512,1.708317,0.005128,1.43502,0.424892
4,"(frozen vegetables, ground beef)",(spaghetti),0.024541,0.229565,0.01256,0.511811,2.22948,0.006927,1.578149,0.565339
1,"(chocolate, olive oil)",(mineral water),0.023575,0.29971,0.011981,0.508197,1.695627,0.004915,1.423923,0.420153
5,"(milk, ground beef)",(mineral water),0.031691,0.29971,0.016039,0.506098,1.688623,0.006541,1.417871,0.421148
2,"(eggs, ground beef)",(mineral water),0.028792,0.29971,0.014493,0.503356,1.679475,0.005863,1.410043,0.41657


##### Insights from Association Rule Mining

##### Common Consequent
- **Mineral Water** frequently appears as a consequent in the association rules, indicating it is often purchased alongside other items.

##### Top Associations
- **Milk & Soup**: Customers who buy these are likely to also purchase mineral water.
- **Frozen Vegetables & Ground Beef**: Customers who buy these are likely to also purchase mineral water.
- **frozen vegetables, ground beef**: It is the only rule where custerm likely to purchess the spaghetti with with these items.

##### Lift and Confidence
- All identified rules have:
  - **Lift**: Greater than 1.2
  - **Confidence**: At least 0.5
- This indicates strong and meaningful associations between the products.


# Analysis and Interpretation of Association Rules

#### Frequent Co-Purchases

1. **Milk and Soup with Mineral Water**:
   - Suggests that customers often buy these items together, indicating they might be preparing meals or stocking up on essentials.

2. **Frozen Vegetables and Ground Beef with Mineral Water**:
   - Implies that customers are likely preparing balanced meals, combining protein and vegetables, and prefer mineral water as a beverage.

3. **Spaghetti and Soup with Mineral Water**:
   - Indicates that these items are part of a quick meal plan, with mineral water being a preferred drink option.

#### Customer Preferences

- **Health-Conscious Choices**:
  - The frequent appearance of mineral water suggests a preference for healthier options over sugary drinks. This is a valuable insight for promoting health-related products.

- **Meal Planning**:
  - The combinations of items indicate that customers are planning meals. Retailers could leverage this by creating meal kits or offering discounts on these combinations to encourage bulk buying.

#### Insights into Customer Purchasing Behavior

- **Health and Convenience**:
  - Customers prefer healthy options like mineral water and are likely looking for convenient, balanced meal solutions.




# Interview Questions:

#### Q.1) What is lift and why is it important in Association rules?
- **Lift** is a measure of how much more likely two items are to be bought together than if they were bought independently. It is calculated as:

- forlmula = **lift(A->B) = confidence(A->b)/support(B)**

- **Importance:** Lift helps to identify strong associations between items. A lift value greater than 1 indicates a positive association, meaning the items are more likely to be bought together than separately. A lift value less than 1 indicates a negative association, and a lift value equal to 1 indicates no association.

#### Q.2) What is support and confidence? How do you calculate them?

* **Support**: Support measures how frequently an itemset appears in the dataset. It is calculated as:
        
     - **support(A = Number of transactions containing A / Total number of transactions**
     - Example: If "milk" appears in 10 out of 100 transactions, the support for "milk" is  10%.

* **Confidence:** Confidence measures the likelihood that item B is purchased when item A is purchased. It is calculated as:

     - **Confidence(A->B) = confidence(A U b)/support(A)**
     - Example: If "milk and bread" appear together in 50 out of 100 transactions that contain "milk", the confidence for "milk -> bread" is 50%.

#### Q.3)What are some limitations or challenges of Association rules mining?
- **Scalability:** Association rule mining can be computationally expensive, especially with large datasets, as the number of possible itemsets grows exponentially.
- **Sparsity:** In datasets with many items, the data can be very sparse, making it difficult to find meaningful associations.
- **Interpretability:** The large number of generated rules can be overwhelming, and not all rules are useful or actionable.
- **Threshold Setting:** Choosing appropriate thresholds for support, confidence, and lift can be challenging and may require domain knowledge.
- **Redundancy:** Many rules can be redundant or trivial, providing little new information.
- **Overfitting:** Rules may fit the training data well but may not generalize to new data, leading to overfitting.