# Association mining using Apriori

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from mlxtend import frequent_patterns

%matplotlib inline

In [58]:
# Load the transaction table; the preview below shows a TID column and a
# comma-separated "List" column holding each transaction's items.
df = pd.read_csv("./dataset.csv")

In [59]:
# Preview the first rows to confirm the column layout before mining.
df.head()

Unnamed: 0,TID,List
0,T100,"I1, I2, I5"
1,T200,"I2, I4"
2,T300,"I2, I3"
3,T400,"I1, I2, I4"
4,T500,"I1, I3"


In [60]:
def convertToOnehot(df, col):
    """Replace a comma-separated item column with one indicator column per item.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the transaction column.
    col : str
        Name of the column whose string values list items separated by ", ".

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column except ``col``, plus one 0/1
        column per distinct item. ``df`` itself is not modified.
    """
    indicators = df[col].str.get_dummies(", ")
    # Drop the column that was actually encoded (previously the name "List" was
    # hard-coded, which raised KeyError for any other column name).
    return df.join(indicators).drop([col], axis=1)

In [61]:
def getCount(df, col):
    """Return how many rows of ``df`` have the value 1 in column ``col``."""
    is_present = df[col].eq(1)
    matching_rows = df.loc[is_present]
    return len(matching_rows)

In [62]:
def combineCols(df, col1, col2):
    """Return the element-wise AND of columns ``col1`` and ``col2`` of ``df``."""
    left = df[col1]
    right = df[col2]
    return left & right

In [63]:
def checkForCombinations(combList, combTuple, r):
    """Tell whether every ``r``-sized combination of ``combTuple`` is in ``combList``.

    Combinations are generated with ``itertools.combinations`` and therefore are
    tuples in the order of ``combTuple``; the check stops at the first one that
    is missing from ``combList``.
    """
    return all(subset in combList for subset in combinations(combTuple, r))

In [64]:
def apriori(df, col, s_count):
    """Mine frequent itemsets from a comma-separated transaction column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the transactions; it is not modified.
    col : str
        Column whose string values list the items of a transaction separated
        by ", ".
    s_count : int
        Minimum support count; an itemset is kept when it occurs in at least
        this many rows.

    Returns
    -------
    dict
        Maps each frequent itemset to its support count. Keys are the item
        names joined with "_" in column order of the one-hot encoding (e.g.
        ``"I1_I2"``); single items use the bare name. Item names that contain
        "_" themselves would make keys ambiguous.

    Notes
    -----
    The frequent single items are printed before larger itemsets are mined.
    Candidate sizes run up to the number of frequent items, and each level is
    pruned against the frequent itemsets of the previous level only.
    """
    # One boolean column per item; encoded here directly so that any column
    # name works, not just "List".
    onehot = df[col].str.get_dummies(", ").astype(bool)

    item_totals = onehot.sum()
    countDict = {
        item: int(total) for item, total in item_totals.items() if total >= s_count
    }
    print(countDict)

    items = list(countDict)
    # Row masks of the frequent itemsets found at the previous level, keyed by
    # the itemset tuple; they are reused to build the next level's masks.
    prev_masks = {(item,): onehot[item] for item in items}

    # Include len(items) itself so the largest possible itemset is considered.
    for size in range(2, len(items) + 1):
        if not prev_masks:
            # No frequent itemsets of the previous size: nothing larger can be frequent.
            break
        next_masks = {}
        for comb in combinations(items, size):
            # Apriori pruning: every (size-1)-subset must itself be frequent.
            if not all(sub in prev_masks for sub in combinations(comb, size - 1)):
                continue
            mask = prev_masks[comb[:-1]] & onehot[comb[-1]]
            support = int(mask.sum())
            if support >= s_count:
                next_masks[comb] = mask
                countDict["_".join(str(c) for c in comb)] = support
        # Fresh dict per level, so levels never mix.
        prev_masks = next_masks
    return countDict

In [65]:
def getConfidence(items_given, support_items, countDict):
    """Return the confidence of the rule ``items_given -> support_items``.

    Parameters
    ----------
    items_given : sequence of str
        Antecedent items.
    support_items : sequence of str
        Consequent items.
    countDict : dict
        Support counts keyed by item names joined with "_"; keys are looked up
        with the items sorted, so they must have been built in sorted order.

    Returns
    -------
    float or int
        ``count(antecedent + consequent) / count(antecedent)``, or 0 when
        either itemset is missing from ``countDict`` or the antecedent count
        is zero.
    """
    # Work on sorted copies: the caller's lists must not be reordered.
    antecedent = sorted(items_given)
    union = sorted(list(support_items) + list(items_given))

    antecedent_key = "_".join(antecedent)
    union_key = "_".join(union)

    if antecedent_key not in countDict or union_key not in countDict:
        return 0
    antecedent_count = countDict[antecedent_key]
    if antecedent_count == 0:
        # Confidence is undefined without antecedent support; report it as 0
        # like any other unknown rule instead of dividing by zero.
        return 0
    return countDict[union_key] / antecedent_count

In [66]:
# Mine frequent itemsets from the "List" column with a minimum support count of 2.
countDict = apriori(df, 'List', 2)
# Spot-check a single rule: confidence of {I1, I2} -> {I5}.
getConfidence(["I2", "I1"], ["I5"], countDict)

{'I1': 6, 'I2': 7, 'I3': 6, 'I4': 2, 'I5': 2}


0.5

In [67]:
# Keep only itemsets with two or more items. Multi-item keys are item names joined
# by "_", so test for the separator rather than for the key's string length (which
# only worked while every item name was at most two characters long).
elements = [s.split("_") for s in countDict if "_" in s]

In [68]:
# Confidence cut-off applied when filtering association rules.
minConfidence = 0.7

In [70]:
def getAllConfidence(elements, countDict, minConfidence):
    """Print every association rule whose confidence exceeds ``minConfidence``.

    Parameters
    ----------
    elements : iterable of list of str
        Itemsets (each a list of item names) to derive rules from.
    countDict : dict
        Support counts as consumed by ``getConfidence``.
    minConfidence : float
        Rules are printed only when their confidence is strictly greater
        than this value.

    Each itemset is split into antecedent and consequent in every possible way
    (all non-empty proper subsets as antecedent), so itemsets of four or more
    items also yield rules such as two items -> two items. Each rule is
    printed as ``antecedent -> consequent confidence``. Nothing is returned.
    """
    for element in elements:
        size = len(element)
        # Walk antecedent sizes from size-1 down to half the itemset; the
        # mirrored (smaller) antecedents are covered by the reverse rule below.
        for antecedent_size in range(size - 1, (size - 1) // 2, -1):
            for comb in combinations(element, antecedent_size):
                comb = list(comb)
                # Preserve the itemset's own ordering for a stable printout.
                remaining = [item for item in element if item not in comb]

                forward = getConfidence(comb, remaining, countDict)
                if forward > minConfidence:
                    print(comb, "->", remaining, forward)

                # When both halves have equal size the reverse rule is
                # produced by another combination; skip it to avoid duplicates.
                if 2 * antecedent_size != size:
                    backward = getConfidence(remaining, comb, countDict)
                    if backward > minConfidence:
                        print(remaining, "->", comb, backward)

In [71]:
# Use the shared cut-off constant instead of repeating the literal 0.7.
getAllConfidence(elements, countDict, minConfidence)

['I5'] -> ['I1'] 1.0
['I4'] -> ['I2'] 1.0
['I5'] -> ['I2'] 1.0
['I5'] -> ['I1', 'I2'] 1.0
['I1', 'I5'] -> ['I2'] 1.0
['I2', 'I5'] -> ['I1'] 1.0


In [22]:
# Build the one-hot item matrix for the mlxtend comparison below.
oneHotDf = convertToOnehot(df, 'List')
# Item columns of this dataset; they are cast from 0/1 to bool.
cols = ['I1', 'I2', 'I3', 'I4', 'I5']
oneHotDf[cols] = oneHotDf[cols].astype('bool')

In [23]:
# Remove the transaction id so only item columns remain.
# NOTE: this rebinds oneHotDf, so running the cell a second time raises KeyError
# because 'TID' has already been dropped.
oneHotDf = oneHotDf.drop(['TID'], axis=1)

In [24]:
# Reference result from mlxtend. min_support is a fraction of transactions here,
# whereas the hand-written apriori above takes an absolute count
# (presumably 0.20 was chosen to match the count of 2 — TODO confirm).
freq_itemsets = frequent_patterns.apriori(oneHotDf, min_support=0.20, use_colnames=True)

In [25]:
# Generate rules at the shared minConfidence cut-off so mlxtend considers the same
# candidates as getAllConfidence; the previous min_threshold=1 kept only
# confidence-1.0 rules, which turned the filter below into a no-op.
rules = frequent_patterns.association_rules(freq_itemsets, metric="confidence", min_threshold=minConfidence)
# Strict ">" matches the comparison used by getAllConfidence.
rules[rules.confidence > minConfidence]

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(I5),(I1),0.222222,0.666667,0.222222,1.0,1.5,0.074074,inf
1,(I4),(I2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
2,(I5),(I2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
3,"(I1, I5)",(I2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
4,"(I2, I5)",(I1),0.222222,0.666667,0.222222,1.0,1.5,0.074074,inf
5,(I5),"(I1, I2)",0.222222,0.444444,0.222222,1.0,2.25,0.123457,inf


In [73]:
# Replace every False with the string '?' before exporting
# (presumably the marker an external tool expects for "absent" — TODO confirm).
# NOTE: this rebinds oneHotDf, so the frame is no longer purely boolean afterwards.
oneHotDf = oneHotDf.replace([False], '?')

In [27]:
# Write the one-hot table (row index included) next to the input data.
oneHotDf.to_csv("./dataset_onehot.csv")

## Using Monkey dataset

In [36]:
# Load the second transaction table and discard the transaction id column.
df2 = pd.read_csv('./dataset2.csv')
df2 = df2.drop(['TID'], axis=1)

In [42]:
# Mine frequent itemsets from the second dataset with a minimum support count of 3.
countDict2 = apriori(df2, 'List', 3)

{'E': 4, 'K': 4, 'M': 3, 'O': 3, 'Y': 3}


In [43]:
# Keep only itemsets with two or more items. Multi-item keys are item names joined
# by "_", so test for the separator rather than for the key's string length.
elements2 = [s.split("_") for s in countDict2 if "_" in s]

In [72]:
# Use the shared cut-off constant instead of repeating the literal 0.7.
getAllConfidence(elements2, countDict2, minConfidence)

['E'] -> ['K'] 1.0
['K'] -> ['E'] 1.0
['E'] -> ['O'] 0.75
['O'] -> ['E'] 1.0
['K'] -> ['O'] 0.75
['O'] -> ['K'] 1.0
['E', 'K'] -> ['O'] 0.75
['O'] -> ['E', 'K'] 1.0
['E', 'O'] -> ['K'] 1.0
['K'] -> ['E', 'O'] 0.75
['K', 'O'] -> ['E'] 1.0
['E'] -> ['K', 'O'] 0.75


In [74]:
# Show the support count of every frequent itemset found in the second dataset.
countDict2

{'E': 4,
 'E_K': 4,
 'E_K_O': 3,
 'E_O': 3,
 'K': 4,
 'K_O': 3,
 'M': 3,
 'O': 3,
 'Y': 3}