In [1]:
import pandas as pd
import numpy as np
from statistics import median
from sklearn.preprocessing import OneHotEncoder
import sys
np.set_printoptions(threshold=sys.maxsize)
from apyori import apriori
import pyfpgrowth

In [2]:
# Load the chess data set: every row of the file becomes one string in the
# single 'Items' column (tab is the field separator, as with read_table).
data12 = pd.read_csv('chess.dat', sep='\t', names=['Items'])
data12

Unnamed: 0,Items
0,1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 ...
1,1 3 5 7 9 12 13 15 17 19 21 23 25 27 29 31 34 ...
2,1 3 5 7 9 12 13 16 17 19 21 23 25 27 29 31 34 ...
3,1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 ...
4,1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 ...
...,...
3191,2 4 5 7 9 11 13 16 17 19 21 23 26 27 29 33 34 ...
3192,2 4 5 7 9 11 13 16 17 19 21 23 26 27 29 33 34 ...
3193,2 4 5 7 9 11 13 16 17 19 21 23 26 27 29 31 34 ...
3194,2 4 5 8 9 11 13 16 17 19 21 23 26 27 30 33 35 ...


In [3]:
# Turn comma separators inside each transaction string into spaces.
# Series.replace(',', ' ') only swaps cells whose *entire* value equals ',',
# so it never touches commas embedded in a longer string; the .str accessor
# performs the substring replacement. regex=False keeps ',' a literal.
data12['Items'] = data12['Items'].str.replace(',', ' ', regex=False)
data12

Unnamed: 0,Items
0,1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 ...
1,1 3 5 7 9 12 13 15 17 19 21 23 25 27 29 31 34 ...
2,1 3 5 7 9 12 13 16 17 19 21 23 25 27 29 31 34 ...
3,1 3 5 7 9 11 13 15 17 20 21 23 25 27 29 31 34 ...
4,1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 34 ...
...,...
3191,2 4 5 7 9 11 13 16 17 19 21 23 26 27 29 33 34 ...
3192,2 4 5 7 9 11 13 16 17 19 21 23 26 27 29 33 34 ...
3193,2 4 5 7 9 11 13 16 17 19 21 23 26 27 29 31 34 ...
3194,2 4 5 8 9 11 13 16 17 19 21 23 26 27 30 33 35 ...


In [7]:
# Read the spreadsheet of transactions. This rebinds `data12`, so the frame
# loaded from chess.dat earlier in the notebook is no longer reachable by name.
data12 = pd.read_excel(io="transaction_data.xlsx")
data12

Unnamed: 0,Transaction ID,Items
0,T1,"Apple, Orange, Grape"
1,T2,"Apple, Orange"
2,T3,"Mango, Pineapple, Papaya"
3,T4,"Apple, Papaya"
4,T5,"Grape, Orange, Pineapple"


In [4]:
def fpgrowthFromFile(fname, minSupRatio, minConf):
    """Run FP-growth on the transactions stored in ``fname``.

    Parameters
    ----------
    fname :
        Passed unchanged to ``getFromFile``, which must return the pair
        ``(itemSetList, frequency)``.
    minSupRatio :
        Multiplied by the number of item sets to obtain the absolute minimum
        support ``minSup``.
    minConf :
        Passed unchanged to ``associationRule``.

    Returns
    -------
    (freqItems, rules)
        ``freqItems`` is the list filled by ``mineTree`` (empty when no item
        reaches ``minSup``); ``rules`` is whatever ``associationRule`` returns.

    NOTE(review): ``getFromFile`` and ``associationRule`` are not defined in
    the visible notebook cells; they must exist before this can run.
    """
    itemSetList, frequency = getFromFile(fname)
    minSup = len(itemSetList) * minSupRatio
    _, headerTable = constructTree(itemSetList, frequency, minSup)

    freqItems = []
    # constructTree returns (None, None) when nothing reaches minSup; handing
    # that None to mineTree would fail on headerTable.items(), so skip mining.
    if headerTable is not None:
        mineTree(headerTable, minSup, set(), freqItems)
    # NOTE(review): assumes associationRule accepts an empty freqItems list - confirm.
    rules = associationRule(freqItems, itemSetList, minConf)
    return freqItems, rules

In [5]:
def constructTree(itemSetList, frequency, minSup):
    """Build an FP-tree and its header table from weighted item sets.

    Parameters
    ----------
    itemSetList :
        Sequence of item collections (transactions or prefix paths). Items
        must be hashable.
    frequency :
        ``frequency[i]`` is the count added for every item of
        ``itemSetList[i]``.
    minSup :
        Items whose accumulated count is below this value are dropped.

    Returns
    -------
    (fpTree, headerTable)
        ``fpTree`` is the root ``Node`` (named ``'Null'``); ``headerTable``
        maps each kept item to ``[support, first linked node]``.
        ``(None, None)`` is returned when no item reaches ``minSup``.
    """
    # Accumulate item support in a plain dict: the notebook never imports
    # collections.defaultdict, so relying on it raised NameError.
    support = {}
    for idx, itemSet in enumerate(itemSetList):
        for item in itemSet:
            support[item] = support.get(item, 0) + frequency[idx]

    # HeaderTable column [Item: [frequency, headNode]], frequent items only
    headerTable = {item: [sup, None] for item, sup in support.items() if sup >= minSup}
    if not headerTable:
        return None, None

    # One global ranking (support descending, ties by first appearance).
    # Sorting each item set by support alone lets equally frequent items end
    # up in different orders in different transactions, which splits their
    # shared prefix and makes mining undercount or miss itemsets.
    ranked = sorted(headerTable, key=lambda item: headerTable[item][0], reverse=True)
    rank = {item: position for position, item in enumerate(ranked)}

    # Init Null head node
    fpTree = Node('Null', 1, None)
    # Insert every cleaned, globally ordered item set as a root-to-leaf path
    for idx, itemSet in enumerate(itemSetList):
        ordered = sorted((item for item in itemSet if item in rank), key=rank.__getitem__)
        currentNode = fpTree
        for item in ordered:
            currentNode = updateTree(item, currentNode, headerTable, frequency[idx])

    return fpTree, headerTable

def updateTree(item, treeNode, headerTable, frequency):
    """Advance one step down the tree for ``item`` and return the child node.

    If ``treeNode`` already has a child keyed by ``item`` its count is raised
    by ``frequency``; otherwise a new ``Node`` is created under ``treeNode``
    and registered in ``headerTable`` via ``updateHeaderTable``.
    """
    children = treeNode.children
    if item not in children:
        # First time this item appears below treeNode: open a new branch
        # and hook it into the header table's chain for this item.
        branch = Node(item, frequency, treeNode)
        children[item] = branch
        updateHeaderTable(item, branch, headerTable)
        return branch

    # Item already present on this path: just bump its count.
    existing = children[item]
    existing.increment(frequency)
    return existing

def updateHeaderTable(item, targetNode, headerTable):
    """Append ``targetNode`` to the chain of nodes recorded for ``item``.

    ``headerTable[item][1]`` holds the first node of the chain; later nodes
    are reached through each node's ``next`` attribute.
    """
    entry = headerTable[item]
    head = entry[1]
    if head is None:
        # Chain is empty: the target becomes its first node.
        entry[1] = targetNode
        return

    # Walk to the current last node and attach the target behind it.
    tail = head
    while tail.next is not None:
        tail = tail.next
    tail.next = targetNode

In [6]:
def mineTree(headerTable, minSup, preFix, freqItemList):
    """Recursively grow frequent patterns from ``headerTable``.

    Every item of ``headerTable`` is added to a copy of ``preFix`` and the
    resulting set is appended to ``freqItemList``. A conditional tree is then
    built from that item's prefix paths and mined with the grown set as the
    new prefix. Nothing is returned; results accumulate in ``freqItemList``.
    """
    # Visit items from the lowest support to the highest
    ascending = sorted(headerTable.items(), key=lambda entry: entry[1][0])
    for item, _ in ascending:
        # Suffix pattern = current prefix extended by this item
        grown = preFix.copy()
        grown.add(item)
        freqItemList.append(grown)

        # Conditional pattern base: all prefix paths leading to this item
        patternBase, counts = findPrefixPath(item, headerTable)
        # Conditional FP-tree built from that pattern base
        _, condHeader = constructTree(patternBase, counts, minSup)
        if condHeader is not None:
            # Keep mining on the conditional tree
            mineTree(condHeader, minSup, grown, freqItemList)

def findPrefixPath(basePat, headerTable):
    """Collect the prefix paths of every node linked for ``basePat``.

    Returns ``(condPats, frequency)``: for each linked node that has at least
    one ancestor below the root, ``condPats`` gets the ancestor item names
    (nearest ancestor first) and ``frequency`` gets that node's ``count``.
    """
    condPats, frequency = [], []
    # Start at the first node of the chain stored in the header table
    node = headerTable[basePat][1]
    while node is not None:
        path = []
        # Climb from this node towards the root, recording item names
        ascendFPtree(node, path)
        # path[0] is the node's own item; what follows are its ancestors
        ancestors = path[1:]
        if ancestors:
            condPats.append(ancestors)
            frequency.append(node.count)
        # Follow the link to the next node for the same item
        node = node.next
    return condPats, frequency

def ascendFPtree(node, prefixPath):
    """Append item names from ``node`` up to, but excluding, the root.

    The root is recognised by having no parent; ``prefixPath`` is extended
    in place and nothing is returned.
    """
    current = node
    while current.parent is not None:
        prefixPath.append(current.itemName)
        current = current.parent

In [8]:
# NOTE(review): minSupRatio is multiplied by the number of transactions inside
# fpgrowthFromFile, so 10 demands a support ten times the data set size; this
# looks like it was meant to be a fraction (e.g. 0.1) - confirm.
fpgrowthFromFile(fname='transaction_data.xlsx', minSupRatio=10, minConf=0.8)

NameError: name 'getFromFile' is not defined