# Association Rule Mining

In [1]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pdb # for debugging

# Install a blanket 'ignore' filter (no category or message restriction is given),
# so warnings raised by later cells are suppressed as well.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load Dataset

# Read the market-basket CSV straight from its GitHub raw URL and preview the first rows
market_basket_url = 'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/market-basket.csv'
df = pd.read_csv(market_basket_url)
df.head()

Unnamed: 0,Item1,Item2,Item3,Item4
0,Milk,Bread,Eggs,
1,Milk,Diapers,Beer,Eggs
2,Milk,Bread,Diapers,Beer
3,Bread,Diapers,Eggs,
4,Milk,Bread,Diapers,Eggs


In [3]:
# Data Preprocessing
# The Apriori algorithm works on a one-hot encoded table (one column per item,
# one row per transaction), so the raw baskets are reshaped first.
# This is similar in spirit to pd.get_dummies, but it encodes whole baskets
# rather than the categories of a single column.

from mlxtend.preprocessing import TransactionEncoder

# Build one list of items per basket; empty (NaN) item slots are discarded
transactions = df.apply(lambda basket: basket.dropna().tolist(), axis = 1).tolist()

# Learn the set of distinct items, then encode every basket against it
encoder = TransactionEncoder()
transaction_matrix = encoder.fit(transactions).transform(transactions)

# Wrap the encoded array in a DataFrame labelled with the item names, and display it
transaction_df = pd.DataFrame(transaction_matrix, columns = encoder.columns_)
transaction_df

Unnamed: 0,Beer,Bread,Diapers,Eggs,Milk
0,False,True,False,True,True
1,True,False,True,True,True
2,True,True,True,False,True
3,False,True,True,True,False
4,False,True,True,True,True


In [4]:
# Applying the Apriori Algorithm
# The transactions are now cleaned and one-hot encoded, ready for frequent-itemset mining

from mlxtend.frequent_patterns import apriori, association_rules

# Mine the frequent itemsets:
#   - min_support: minimum support threshold; itemsets with support greater than
#     or equal to this value are returned
#   - use_colnames = True: report item names in the output instead of column indices
frequent_itemsets = apriori(
    transaction_df,
    min_support = 0.5,
    use_colnames = True,
)

In [5]:
# View Frequent Itemsets

# Suppress all warnings before displaying the table.
# The filter is installed with the action only: a second positional argument to
# warnings.filterwarnings is a *message regex* (matched against the start of the
# warning text), so passing 'all' there would NOT mean "all warnings".
import warnings
warnings.filterwarnings('ignore')

# Display the mined itemsets together with their support values
frequent_itemsets

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.8,(Bread)
1,0.8,(Diapers)
2,0.8,(Eggs)
3,0.8,(Milk)
4,0.6,"(Diapers, Bread)"
5,0.6,"(Bread, Eggs)"
6,0.6,"(Milk, Bread)"
7,0.6,"(Diapers, Eggs)"
8,0.6,"(Diapers, Milk)"
9,0.6,"(Milk, Eggs)"


In [6]:
# Generate Association Rules

# Let pandas show up to 100 columns so wide rule tables are not truncated
pd.set_option('display.max_columns', 100)

# Derive rules from the frequent itemsets, filtering on the confidence metric
# with a threshold of 0.7; num_itemsets is given the number of encoded transactions
n_transactions = len(transaction_df)
rules = association_rules(
    frequent_itemsets,
    num_itemsets = n_transactions,
    metric = "confidence",
    min_threshold = 0.7,
)

# Show every column from the first one up to and including 'lift'
rules.loc[:, :'lift']

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(Diapers),(Bread),0.8,0.8,0.6,0.75,0.9375
1,(Bread),(Diapers),0.8,0.8,0.6,0.75,0.9375
2,(Bread),(Eggs),0.8,0.8,0.6,0.75,0.9375
3,(Eggs),(Bread),0.8,0.8,0.6,0.75,0.9375
4,(Milk),(Bread),0.8,0.8,0.6,0.75,0.9375
5,(Bread),(Milk),0.8,0.8,0.6,0.75,0.9375
6,(Diapers),(Eggs),0.8,0.8,0.6,0.75,0.9375
7,(Eggs),(Diapers),0.8,0.8,0.6,0.75,0.9375
8,(Diapers),(Milk),0.8,0.8,0.6,0.75,0.9375
9,(Milk),(Diapers),0.8,0.8,0.6,0.75,0.9375


In [7]:
# INTERPRETATIONS:

# RULE 1: (Diapers) --> (Bread)

# Antecedent Support: 0.8 (Diapers appear in 80% of all transactions)

# Consequent Support: 0.8 (Bread appears in 80% of all transactions)

# Support: 0.6 (Diapers and Bread are bought together in 60% of transactions)

# Confidence: 0.75 (If someone buys Diapers, there’s a 75% chance they’ll also buy Bread)

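# Lift: 0.9375 (Slightly less than 1, suggesting no strong dependence between Diapers and Bread;
# if anything, Diapers buyers are marginally less likely to buy Bread (75%) than shoppers overall (80%)).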

  and should_run_async(code)


In [8]:
# RULE 3: (Bread) --> (Eggs)

# Antecedent Support: 0.8 (Bread appears in 80% of all transactions)

# Consequent Support: 0.8 (Eggs appear in 80% of all transactions)

# Support: 0.6 (Bread and Eggs are bought together in 60% of transactions)

# Confidence: 0.75 (75% of Bread buyers also buy Eggs)

# Lift: 0.9375 (Again, the lift is slightly below 1, meaning that while Bread and Eggs co-occur often,
# this may be due to their individual popularity rather than a dependency).

  and should_run_async(code)
