In [1]:
!pip install mlxtend # its a ml library for association library

Collecting mlxtend
[?25l  Downloading https://files.pythonhosted.org/packages/16/e6/30e50ed9c053a1530c83149090e1f5fd9fccc8503dca2ecce1bb52f34de0/mlxtend-0.15.0.0-py2.py3-none-any.whl (1.3MB)
[K    100% |████████████████████████████████| 1.3MB 31kB/s ta 0:00:011
Installing collected packages: mlxtend
Successfully installed mlxtend-0.15.0.0


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import networkx as nx

In [15]:
# Read the sample transactions; the preview below shows a TID column and a
# DESC column holding each basket as one comma-separated string.
transaction = pd.read_csv(filepath_or_buffer='./sample_transaction.csv')
transaction.head(n=5)

Unnamed: 0,TID,DESC
0,101,"Milk,Onion,Nutmeg,Kidney Beans,Eggs,Yogurt"
1,102,"Dill,Onion,Nutmeg,Kidney Beans,Eggs,Yogurt"
2,103,"Milk,Apple,Kidney Beans,Eggs"
3,104,"Milk,Unicorn,Corn,Kidney Beans,Yogurt"
4,105,"Corn,Onion,Onion,Kidney Beans,Ice cream,Eggs"


In [16]:
def intolist(desc):
    """Split a comma-separated item description into a list of item names.

    Whitespace around each item is stripped and empty entries (from an empty
    string, a doubled comma, or a trailing comma) are dropped, so that
    "Milk, Eggs" and "Milk,Eggs" produce the same items instead of being
    counted as different products. Item order and duplicates are preserved.

    Parameters
    ----------
    desc : str
        Comma-separated item names, e.g. "Milk,Onion,Eggs".

    Returns
    -------
    list of str
        The cleaned item names; an empty list when `desc` holds no items.
    """
    # Strip first, then filter, so whitespace-only entries are discarded too.
    stripped_items = (item.strip() for item in desc.split(','))
    return [item for item in stripped_items if item]

In [17]:
# Turn each DESC string into a list of items. The column is overwritten in place
# (later cells read transaction['DESC']), so entries that are already lists are
# passed through unchanged; this lets the cell be re-run without raising
# AttributeError on the previously converted values.
transaction['DESC'] = transaction['DESC'].apply(
    lambda desc: desc if isinstance(desc, list) else intolist(desc)
)
transaction.head()

Unnamed: 0,TID,DESC
0,101,"[Milk, Onion, Nutmeg, Kidney Beans, Eggs, Yogurt]"
1,102,"[Dill, Onion, Nutmeg, Kidney Beans, Eggs, Yogurt]"
2,103,"[Milk, Apple, Kidney Beans, Eggs]"
3,104,"[Milk, Unicorn, Corn, Kidney Beans, Yogurt]"
4,105,"[Corn, Onion, Onion, Kidney Beans, Ice cream, ..."


# One-hot encoding with boolean values

In [18]:
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder() # encodes a list of transactions as a one-hot boolean array (wrapped in a DataFrame in the next cell)

In [19]:
# Fit the encoder on the item lists and one-hot encode them in a single step,
# then wrap the result in a DataFrame: one row per transaction ID (TID),
# one True/False column per item name learned by the encoder.
bolean_tr = te.fit_transform(transaction['DESC'])
df = pd.DataFrame(
    data=bolean_tr,
    index=transaction['TID'],
    columns=te.columns_,
)
df

Unnamed: 0_level_0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
TID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101,False,False,False,True,False,True,True,True,True,False,True
102,False,False,True,True,False,True,False,True,True,False,True
103,True,False,False,True,False,True,True,False,False,False,False
104,False,True,False,False,False,True,True,False,False,True,True
105,False,True,False,True,True,True,False,False,True,False,False


In [20]:
# Association rules , Apriori
from mlxtend.frequent_patterns import apriori , association_rules

In [21]:
# Mine frequent itemsets with Apriori, using a minimum support of 0.5.
# use_colnames=True reports itemsets by item name, as seen in the output below.
df_ap = apriori(
    df,
    min_support=0.5,
    use_colnames=True,
)
df_ap

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Kidney Beans, Milk)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


In [22]:
# Derive association rules from the frequent itemsets, filtering on confidence
# with a minimum threshold of 0.6.
#
# Reading the result:
#   antecedents = the "if" side of the rule (input)
#   consequents = the "then" side of the rule (output)
#   lift = 1 -> antecedent and consequent are independent
#   lift > 1 -> positively associated (appear together more often than chance)
#   lift < 1 -> negatively associated (appear together less often than chance)
df_ar = association_rules(
    df_ap,
    metric='confidence',
    min_threshold=0.6,
)
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
1,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
2,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Kidney Beans),(Milk),1.0,0.6,0.6,0.6,1.0,0.0,1.0
5,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
6,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
7,(Kidney Beans),(Onion),1.0,0.6,0.6,0.6,1.0,0.0,1.0
8,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
9,(Kidney Beans),(Yogurt),1.0,0.6,0.6,0.6,1.0,0.0,1.0


In [26]:
# Derive association rules again, this time filtering on the lift metric
# with a minimum threshold of 1.1.
df_lf = association_rules(
    df_ap,
    metric='lift',
    min_threshold=1.1,
)
df_lf

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
1,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Onion),"(Kidney Beans, Eggs)",0.6,0.8,0.6,1.0,1.25,0.12,inf
5,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6


In [18]:
# Keep only the two sides of each lift-filtered rule (antecedent -> consequent).
tree = df_lf[
    ['antecedents', 'consequents']
]
tree

Unnamed: 0,antecedents,consequents
0,(Eggs),(Onion)
1,(Onion),(Eggs)
2,"(Eggs, Kidney Beans)",(Onion)
3,"(Kidney Beans, Onion)",(Eggs)
4,(Eggs),"(Kidney Beans, Onion)"
5,(Onion),"(Eggs, Kidney Beans)"
