In [7]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import transactionencoder

In [14]:
# Sample transactions: every inner list holds the items that occur
# together in a single record.
data_raw = [
    [
        'science',
        'engineering',
        'medical',
        'management',
        'arts',
        'agriculture',
    ],
    [
        'commerce',
        'engineering',
        'medical',
        'management',
        'arts',
        'agriculture',
    ],
    [
        'commerce',
        'hotel management',
        'management',
        'arts',
    ],
    [
        'science',
        'pharmacy',
        'law',
        'arts',
        'agriculture',
    ],
    [
        'law',
        'engineering',
        'management',
        'physic',
        'arts',
    ],
]

In [15]:
# Encode the ragged transaction lists as a fixed-width item matrix
# so they can be loaded into a DataFrame.
te = transactionencoder.TransactionEncoder()
te.fit(data_raw)  # learn the item labels present in the transactions
te_ary = te.transform(data_raw)

In [16]:
# Wrap the encoded matrix in a DataFrame, labelling each column
# with the item name recorded by the encoder.
dataset = pd.DataFrame(
    data=te_ary,
    columns=te.columns_,
)
dataset.head()

Unnamed: 0,agriculture,arts,commerce,engineering,hotel management,law,management,medical,pharmacy,physic,science
0,True,True,False,True,False,False,True,True,False,False,True
1,True,True,True,True,False,False,True,True,False,False,False
2,False,True,True,False,True,False,True,False,False,False,False
3,True,True,False,False,False,True,False,False,True,False,True
4,False,True,False,True,False,True,True,False,False,True,False


In [17]:
# Mine the itemsets whose support is at least 0.6
# (with the five transactions above: present in 3 or more of them).
frequent_itemset = apriori(
    dataset,
    use_colnames=True,  # report item names rather than column indices
    min_support=0.6,
)
frequent_itemset

Unnamed: 0,support,itemsets
0,0.6,(agriculture)
1,1.0,(arts)
2,0.6,(engineering)
3,0.8,(management)
4,0.6,"(arts, agriculture)"
5,0.6,"(arts, engineering)"
6,0.8,"(arts, management)"
7,0.6,"(management, engineering)"
8,0.6,"(management, arts, engineering)"


In [18]:
# Derive association rules from the frequent itemsets, keeping only
# the rules whose confidence is at least 0.5, then display them.
result = association_rules(
    frequent_itemset,
    min_threshold=0.5,
    metric="confidence",
)
result

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(arts),(agriculture),1.0,0.6,0.6,0.6,1.0,0.0,1.0
1,(agriculture),(arts),0.6,1.0,0.6,1.0,1.0,0.0,inf
2,(arts),(engineering),1.0,0.6,0.6,0.6,1.0,0.0,1.0
3,(engineering),(arts),0.6,1.0,0.6,1.0,1.0,0.0,inf
4,(arts),(management),1.0,0.8,0.8,0.8,1.0,0.0,1.0
5,(management),(arts),0.8,1.0,0.8,1.0,1.0,0.0,inf
6,(management),(engineering),0.8,0.6,0.6,0.75,1.25,0.12,1.6
7,(engineering),(management),0.6,0.8,0.6,1.0,1.25,0.12,inf
8,"(arts, management)",(engineering),0.8,0.6,0.6,0.75,1.25,0.12,1.6
9,"(management, engineering)",(arts),0.6,1.0,0.6,1.0,1.0,0.0,inf


In [19]:
# Keep only the columns needed to read the rules at a glance.
result_simplify = result.loc[
    :,
    ['antecedents', 'consequents', 'support', 'confidence', 'lift'],
]
result_simplify

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(arts),(agriculture),0.6,0.6,1.0
1,(agriculture),(arts),0.6,1.0,1.0
2,(arts),(engineering),0.6,0.6,1.0
3,(engineering),(arts),0.6,1.0,1.0
4,(arts),(management),0.8,0.8,1.0
5,(management),(arts),0.8,1.0,1.0
6,(management),(engineering),0.6,0.75,1.25
7,(engineering),(management),0.6,1.0,1.25
8,"(arts, management)",(engineering),0.6,0.75,1.25
9,"(management, engineering)",(arts),0.6,1.0,1.0
