# Association Rules

**Definition**: a concept/algorithm for uncovering interesting patterns, relationships, or associations between items within large datasets.

In general, association rules describe an A -> B relationship (written $X \rightarrow Y$ in the formulas below, where X is the antecedent A and Y is the consequent B) using these 3 parameters:

**Support measures how frequently the itemset appears in the dataset**<br>
**Support**: $\frac{\text{freq}(X, Y)}{N}$, where $N$ is the total number of transactions


**Confidence measures how often the items in B appear in transactions that contain A**<br>
**Confidence**: $\frac{\text{freq}(X, Y)}{\text{freq}(X)}$

**Lift measures how much more likely B is to occur when A has occurred, compared to the occurrence of B in a random transaction**<br>
**Lift**: $\frac{\text{Confidence}(X \rightarrow Y)}{\text{Support}(Y)}$

In [11]:
# Import Library

import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

In [12]:
# Load Data: read the transaction CSV, decoding the file as Latin-1

df_2 = pd.read_csv(
    './Downloads/DS Dasar Dataset/Association Rule Data.csv',
    encoding='latin-1',
)

In [13]:
# Check Data Types and Missing Values
# (info() prints each column's dtype and non-null count)

df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19504 entries, 0 to 19503
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   eff_dt       19504 non-null  object 
 1   merchant_nm  19504 non-null  object 
 2   cust_no      19504 non-null  float64
dtypes: float64(1), object(2)
memory usage: 457.2+ KB


In [14]:
# Check Duplicate Values: count rows that repeat an earlier row in every column

duplicate_count = df_2.duplicated().sum()
print('Number of Duplicate Data:', duplicate_count)

# Keep only the first occurrence of each row
df_2 = df_2.drop_duplicates()
print('Duplicate Data have been deleted')


Number of Duplicate Data: 1321
Duplicate Data have been deleted


In [15]:
# Parse the month/day/year strings in eff_dt into a new datetime column, Date
# (eff_dt itself is left unchanged)

df_2 = df_2.assign(Date=pd.to_datetime(df_2['eff_dt'], format='%m/%d/%Y'))

In [16]:
# Preview the records from newest to oldest (display only; the result is not assigned back to df_2)
df_2.sort_values(by='Date', ascending=False)

Unnamed: 0,eff_dt,merchant_nm,cust_no,Date
9689,8/31/2017,SATE KHAS SENAYAN,2158692.2,2017-08-31
18321,8/31/2017,KOPI KENANGAN,1285188.2,2017-08-31
6294,8/31/2017,SUSHI MOI,1276820.2,2017-08-31
3022,8/31/2017,SOGO,341016.0,2017-08-31
3023,8/31/2017,SOGO,823718.0,2017-08-31
...,...,...,...,...
17180,8/1/2017,COCO ICHIBANYA,2336960.4,2017-08-01
15172,8/1/2017,IKKUDO ICHI,59603.6,2017-08-01
12056,8/1/2017,H&M,688391.6,2017-08-01
4960,8/1/2017,DUNIA ANAK,969001.8,2017-08-01


In [17]:
# Build one basket per (Date, cust_no): the group's merchant names joined into a
# single ';'-separated string

df_1 = (
    df_2
    .groupby(['Date', 'cust_no'])['merchant_nm']
    .apply(';'.join)
    .reset_index()
)

In [18]:
# Split each ';'-joined basket string back into a list of merchant names
data = [basket.split(";") for basket in df_1["merchant_nm"]]

# One-hot encode the baskets: one row per basket, one boolean column per merchant.
# The frame is deliberately left as bool: mlxtend's apriori expects boolean input,
# and replacing False with 0 only produced mixed 0/True columns, which mlxtend
# processes on a slower path with a deprecation warning. Supports are unchanged.
encoder = TransactionEncoder()
data_transform = encoder.fit(data).transform(data)
df = pd.DataFrame(data_transform, columns=encoder.columns_)

# Get the support value of each frequent combination (support >= 0.001)
df_apriori = apriori(df, min_support = 0.001, use_colnames = True, verbose = 1)



# Derive rules (with lift, confidence, ...) keeping those with confidence >= 0.15
df_ar = association_rules(df_apriori, metric = "confidence", min_threshold = 0.15)

# Sort rules by confidence, highest first
rules = df_ar.sort_values(by='confidence', ascending=False)




Processing 1779 combinations | Sampling itemset size 32


In [19]:
# Display the mined rules in their current order (sorted by confidence, descending)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
8,(MANGO),(H&M),0.008525,0.053373,0.002001,0.234783,4.398913,0.001546,1.23707,0.779315
9,(MANGO),(ZARA),0.008525,0.051964,0.002001,0.234783,4.518142,0.001558,1.23891,0.785365
10,(STRADIVARIUS),(ZARA),0.010897,0.051964,0.00215,0.197279,3.796423,0.001583,1.181027,0.744709
1,(TOUS LE JOUS),(AEON SUPERMARKET),0.00897,0.073684,0.001705,0.190083,2.579693,0.001044,1.143716,0.617899
6,(G 2000),(ZARA),0.00682,0.051964,0.00126,0.184783,3.555945,0.000906,1.162924,0.723716
0,(BEST DENKI),(ACE HARDWARE),0.006227,0.034248,0.001038,0.166667,4.866522,0.000825,1.158903,0.799493
3,(G 2000),(FOREVER 21),0.00682,0.012602,0.001112,0.163043,12.93798,0.001026,1.179748,0.929044
4,(FOREVER 21),(H&M),0.012602,0.053373,0.002001,0.158824,2.975735,0.001329,1.125361,0.672422
7,(GUARDIAN),(GIANT SUPERMARKET),0.02083,0.114603,0.003188,0.153025,1.335256,0.0008,1.045363,0.256421
5,(FOREVER 21),(ZARA),0.012602,0.051964,0.001927,0.152941,2.94319,0.001273,1.119209,0.668659


In [20]:
# Visualize Rules

def draw_graph_interactive(rules, rules_to_show):
    """Draw the leading rows of ``rules`` as an interactive pyvis network.

    Every drawn rule becomes a node named ``"R<i>"`` (``i`` is the row's
    position in ``rules``). Each antecedent item gets an edge
    ``item -> rule node`` and each consequent item an edge
    ``rule node -> item``; edges carry the rule's confidence as ``weight``
    and as a tooltip.

    Parameters
    ----------
    rules : pandas.DataFrame
        Rules table providing the columns 'antecedents', 'consequents',
        'support', 'confidence' and 'lift'. Rows are drawn in their existing
        order, so sort the frame beforehand to choose which rules appear.
    rules_to_show : int
        Maximum number of leading rows to draw. Values larger than
        ``len(rules)`` are capped instead of raising ``IndexError``.

    Returns
    -------
    The value returned by ``net.show('association_rules.html')``; the graph is
    written to that HTML file.
    """
    G = nx.DiGraph()

    # Ids of the synthetic rule nodes, used later to tell them apart from item nodes
    rule_nodes = set()

    # Never index past the end of the rules table
    n_rules = min(rules_to_show, len(rules))

    # Add nodes and edges with attributes
    for i in range(n_rules):
        rule = rules.iloc[i]  # look the row up once instead of once per attribute
        confidence = rule['confidence']
        edge_title = f"Confidence: {confidence:.2f}"

        rule_node = "R" + str(i)
        rule_nodes.add(rule_node)
        G.add_node(rule_node, title=f"Rule {i}", support=rule['support'], confidence=confidence, lift=rule['lift'])

        for antecedent in rule['antecedents']:
            G.add_node(antecedent, title=f"Antecedent: {antecedent}")
            G.add_edge(antecedent, rule_node, weight=confidence, title=edge_title)

        for consequent in rule['consequents']:
            G.add_node(consequent, title=f"Consequent: {consequent}")
            G.add_edge(rule_node, consequent, weight=confidence, title=edge_title)

    # Create a pyvis network
    net = Network(notebook=True, height='750px', width='100%', bgcolor='white', font_color='black', cdn_resources='remote')

    # Load the networkx graph
    net.from_nx(G)

    # Customize the nodes
    for node in net.nodes:
        # Rule nodes are sized by their support; item nodes have no 'support'
        # attribute and fall back to 1
        node['value'] = G.nodes[node['id']].get('support', 1) * 1000
        # Membership test rather than startswith('R'), which would also colour
        # items whose names begin with "R" as rule nodes
        node['color'] = 'blue' if node['id'] in rule_nodes else 'green'

    # Display the network
    return net.show('association_rules.html')

# Example usage: draw the first 10 rules (rules is sorted by confidence, descending)
draw_graph_interactive(rules, 10)

association_rules.html
