In [1]:
import csv
import copy
from itertools import combinations

In [2]:
class Node:
    """
    A single node of an FP-Tree.

    Parameters
    ----------
    item : the item stored in this node (``None`` by default).
    count : how many transactions pass through this node.
    parent : the parent ``Node`` (``None`` for a node without parent).
    child : optional iterable of already-built child ``Node`` objects.
        The iterable is copied, so the caller's list is never shared with
        the node. The ``parent`` attribute of the supplied children is not
        modified here.

    Attributes
    ----------
    child : list of child nodes.
    child_item : list of the items held by the children, kept index-aligned
        with ``child`` (``child_item[k] == child[k].item``), e.g. ['a','b'].
    """
    def __init__(self,item=None,count=0,parent=None,child=None):
        self.item = item # an item
        self.count = count # counter of item
        self.parent = parent # indicate the parent node
        # ``None`` instead of a mutable ``[]`` default; always store a private copy
        self.child = list(child) if child is not None else []
        # differs from child: it records which items the children hold, index-aligned with child
        self.child_item = [node.item for node in self.child]

class FP_Tree:
    """
    A class to construct an FP-Tree and mine frequent itemsets from it.

    Parameters
    ----------
    data : the transactions; an iterable of iterables of items. It is
        traversed twice (once for counting, once for building the tree), so
        it must be re-iterable (e.g. a list of lists). Items must be hashable
        and mutually orderable because ties in frequency are broken by item.
    minsup : absolute minimum support count (an itemset is frequent when its
        count is >= minsup).

    Attributes
    ----------
    root : the root ``Node`` of the tree.
    dictionary : {item: frequency} of the frequent items, ordered by
        frequency descending, ties broken by item descending.
    dict_head : head table, {item: [every tree node holding that item]}.
    """
    def __init__(self,data,minsup = 2500):
        self.data = data #Store transaction data
        self.minsup = minsup
        self.root = Node(item='root') #define a root
        self.dictionary = {} # dictionary to record set of items and their frequency
        self.dict_head = {} # head table

    def construct_dict(self):
        """
        Step 1: Count every item over all transactions, drop the items whose
        frequency is < minsup and store the survivors in ``self.dictionary``
        ordered by (frequency, item) descending.
        """
        counts = {}
        for transaction in self.data: #for each line of data
            for element in transaction:
                counts[element] = counts.get(element, 0) + 1
        frequent = [(key, val) for key, val in counts.items() if val >= self.minsup] # prune
        frequent.sort(key=lambda kv: (kv[1], kv[0]), reverse=True) # sort
        self.dictionary = dict(frequent)

    def sort_item_in_each_transaction(self,item):
        """
        Step 2(a): Given a record, delete items with frequency < minsup and
        return the rest as a new list ordered exactly like ``self.dictionary``
        (frequency descending, ties broken by item descending).

        The tie-break matters: if equally frequent items kept their
        per-transaction order, the same itemset could end up on different
        tree paths and its support would be split and under-counted.
        """
        frequency = self.dictionary
        kept = [i for i in item if i in frequency] #keep only frequent items
        kept.sort(key=lambda i: (frequency[i], i), reverse=True)
        return kept

    def construct_fp_tree(self):
        """
        Step 2(b): Build the FP-Tree and the head table from ``self.data``.

        The tree is rebuilt from scratch on every call. The head table is
        recreated here, so keeping an old root would leave previously created
        nodes out of the head table and corrupt the mining step.
        """
        self.root = Node(item='root') #start from an empty tree
        self.dict_head = {key:[] for key in self.dictionary} #create head table
        for transaction in self.data:
            current = self.root
            for i in self.sort_item_in_each_transaction(transaction): #call the 2(a) function
                if i in current.child_item:
                    #the branch already exists, just add 1
                    following = current.child[current.child_item.index(i)]
                    following.count += 1
                else:
                    #create a new node, record its parent and register it everywhere
                    following = Node(item=i,count=1,parent=current,child=[])
                    current.child_item.append(i) #record this child item
                    current.child.append(following) #record this child node
                    self.dict_head[i].append(following) #record the new node in head table
                current = following #descend to the next node

    def retrieve_fp_tree_and_count_frequent_itemset(self):
        """
        Step 3/4: For every item of the head table (bottom-up), collect the
        paths from each of its nodes up to the root, and count the itemsets
        that contain this item together with items found on those paths.

        Returns
        -------
        dict mapping each frequent single item to its count, and each
        frequent itemset of size >= 2 to its count. Such an itemset is keyed
        by a tuple whose first element is the item being processed, followed
        by the other items in the order they were met walking up the tree.
        """
        frequent_itemset = {} #record output frequent itemset
        for item in reversed(list(self.dict_head)): #from the bottom of the head table
            # One entry per tree node holding `item`: ({items on the way up}, node count)
            paths = []
            for node in self.dict_head[item]:
                sup = node.count #every item on this path co-occurs `sup` times with `item`
                prefix = {}
                # Stop on the root object itself, not on its label, so that a
                # real item called 'root' is handled like any other item.
                while node is not None and node is not self.root:
                    prefix[node.item] = sup
                    node = node.parent
                paths.append((prefix, sup))
            if not paths:
                continue

            #Merge all the paths to get the count of each item conditioned on `item`
            merged = {}
            for prefix, sup in paths:
                for key in prefix:
                    merged[key] = merged.get(key, 0) + sup

            # Only reachable if the tree and the dictionary were built from different data
            if merged[item] < self.minsup:
                continue

            #record this one-item first
            frequent_itemset[item] = merged[item]

            #According to minsup, keep only the co-occurring items that are frequent
            others = [key for key, val in merged.items() if key != item and val >= self.minsup]
            keep = set(others)
            path_sets = [(keep.intersection(prefix), sup) for prefix, sup in paths]

            # `item` lies on every path, so an itemset containing it is
            # (item,) + a combination of the other surviving items.
            for size in range(1, len(others) + 1):
                found = False
                for rest in combinations(others, size):
                    rest_set = set(rest)
                    support = sum(sup for members, sup in path_sets if rest_set <= members)
                    if support >= self.minsup: #if the count meets the threshold
                        frequent_itemset[(item,) + rest] = support #output
                        found = True
                if not found:
                    # Support never grows when an itemset grows, so no larger
                    # itemset can be frequent either.
                    break

        return frequent_itemset

    def find_frequent_itemset_in_descending_order(self):
        """
        Main entry point of the class: run the whole FP-Tree frequent pattern
        mining procedure and print every frequent itemset with its count,
        in descending order of count.
        """
        self.construct_dict() #First Scan
        self.construct_fp_tree() #Second Scan
        frequent_itemset = self.retrieve_fp_tree_and_count_frequent_itemset() #Retrieve Pattern
        sorted_frequent_itemset = dict(sorted(frequent_itemset.items(), key=lambda item: item[1],reverse=True)) #Sort in desc order
        for key in sorted_frequent_itemset: #Print
            print(key,':',sorted_frequent_itemset[key])

In [3]:
def load_transactions(path):
    """
    Read a CSV file where each row is one transaction.

    Empty cells (e.g. the one produced by a trailing comma) are dropped from
    every row; a row without any empty cell is accepted as is.

    Parameters
    ----------
    path : path of the CSV file.

    Returns
    -------
    A list of lists, one inner list of item strings per CSV row.
    """
    # `with` guarantees the file is closed; newline='' is what the csv module expects
    with open(path, "r", newline="") as csv_file:
        return [[cell for cell in row if cell != ''] for row in csv.reader(csv_file)]


if __name__ == '__main__':
#     test = [['a','b','c','d','e','f','g','h'],
#        ['a','f','g'],
#        ['b','d','e','f','j'],
#        ['a','b','d','i','k'],
#        ['a','b','e','g']] #test data set

    ###############Data Preparation#############
    data = load_transactions("DataSetA.csv") #data is a list of lists records each transaction
    ###############Data Preparation End#############

#     t = FP_Tree(test,minsup=3)
    t = FP_Tree(data,minsup=2500)
    t.find_frequent_itemset_in_descending_order()

Milk : 5526
Ghee : 5510
Coffee Powder : 5509
Yougurt : 5503
Bread : 5484
Sweet : 5483
Sugar : 5482
Butter : 5481
Cheese : 5476
Panner : 5444
Lassi : 5432
Tea Powder : 5383
('Coffee Powder', 'Ghee') : 2578
('Lassi', 'Sweet') : 2576
('Butter', 'Sugar') : 2571
('Sugar', 'Milk') : 2563
('Yougurt', 'Coffee Powder') : 2555
('Panner', 'Bread') : 2550
('Butter', 'Sweet') : 2543
('Lassi', 'Milk') : 2539
('Sweet', 'Bread') : 2539
('Cheese', 'Yougurt') : 2532
('Cheese', 'Bread') : 2530
('Butter', 'Ghee') : 2530
('Butter', 'Yougurt') : 2529
('Sugar', 'Yougurt') : 2529
('Bread', 'Coffee Powder') : 2528
('Panner', 'Ghee') : 2523
('Coffee Powder', 'Milk') : 2518
('Cheese', 'Coffee Powder') : 2517
('Bread', 'Milk') : 2517
('Sugar', 'Ghee') : 2516
('Yougurt', 'Milk') : 2513
('Lassi', 'Coffee Powder') : 2512
('Sweet', 'Milk') : 2512
('Lassi', 'Ghee') : 2511
('Ghee', 'Milk') : 2511
('Bread', 'Yougurt') : 2507
('Lassi', 'Bread') : 2506
('Panner', 'Sugar') : 2505
('Panner', 'Sweet') : 2505
('Sweet', 'Ghee'