In [1]:
import itertools
import sys
import collections

In [2]:
def file_preprocessing(filename):
    """Read a tab-separated data file and turn every row into a set of items.

    Every column except the last one in a row is rewritten as
    ``"G<column>_<value>"`` (1-based column index) so that equal values in
    different columns become distinct items. The last column is kept
    verbatim (presumably a class/disease label -- confirm against the data).

    Blank lines are skipped so that a trailing newline in the file does not
    create a bogus transaction containing only the empty string.

    Parameters
    ----------
    filename : str
        Path of the tab-separated text file to read.

    Returns
    -------
    tuple
        ``(candidate_itemset_dictionary, gene_set_list)`` where the first
        element maps each item to the number of rows it occurs in and the
        second is a list with one ``set`` of items per row, in file order.
    """
    candidate_itemset_dictionary = dict()
    gene_set_list = []
    # The context manager guarantees the handle is closed, even on errors.
    with open(filename, "r") as f:
        for gene_row in f:
            gene_row = gene_row.strip("\n")
            if not gene_row:
                # Empty line: not a transaction.
                continue
            gene_array = gene_row.split("\t")
            last_index = len(gene_array) - 1
            for i in range(len(gene_array)):
                if i != last_index:
                    gene_array[i] = "G" + str(i + 1) + "_" + gene_array[i]
                item = gene_array[i]
                candidate_itemset_dictionary[item] = candidate_itemset_dictionary.get(item, 0) + 1
            gene_set_list.append(set(gene_array))
    return (candidate_itemset_dictionary, gene_set_list)

In [3]:
def candidate_itemset_generation(frequent_items, length):
    """Build the candidate itemsets of size ``length``.

    Each entry of ``frequent_items`` is a comma-joined itemset string. Two
    entries are merged whenever they share exactly ``length - 2`` items; the
    merged itemset is emitted as a comma-joined string with its items in
    sorted order.

    Parameters
    ----------
    frequent_items : list of str
        Comma-joined itemsets from the previous level.
    length : int
        Size of the itemsets being generated.

    Returns
    -------
    set of str
        The distinct merged itemsets.
    """
    # Parse every itemset string once instead of once per pair.
    parsed_itemsets = [set(entry.split(',')) for entry in frequent_items]
    required_overlap = length - 2
    return {
        ','.join(sorted(left | right))
        for left, right in itertools.combinations(parsed_itemsets, 2)
        if len(left & right) == required_overlap
    }


In [4]:
def frequent_itemset_generation(candidate_items, gene_set_list,length,support):
    """Keep the candidates that occur in enough transactions.

    Parameters
    ----------
    candidate_items : iterable of str
        Comma-joined candidate itemsets.
    gene_set_list : list of set
        One set of items per transaction.
    length : int
        Not used by this function; kept for interface compatibility.
    support : int or float
        Threshold compared directly against the raw occurrence count
        (a candidate is kept when ``count >= support``).

    Returns
    -------
    list of str
        The surviving candidates, without duplicates, in the order they
        were first encountered.
    """
    def occurrences(itemset_key):
        # Number of transactions that contain every item of the candidate.
        members = set(itemset_key.split(','))
        return sum(1 for transaction in gene_set_list if members.issubset(transaction))

    survivors = (key for key in candidate_items if occurrences(key) >= support)
    # dict.fromkeys drops repeats while preserving first-seen order.
    return list(dict.fromkeys(survivors))

In [5]:
def rules_generation(frequent_itemsets_dict, confidence, gene_set_data):
    """Generate association rules from the frequent itemsets.

    Walks the itemset levels from the largest down to size 2 (a single item
    cannot be split into two sides), counts how many transactions contain
    each itemset, and delegates the enumeration of rules for that itemset to
    ``itemset_rules_generation``.

    Parameters
    ----------
    frequent_itemsets_dict : dict
        Maps an itemset size to a list of comma-joined itemset strings.
        The sizes are expected to be the consecutive integers 1..N.
    confidence : int or float
        Minimum confidence, in percent, forwarded to the rule generator.
    gene_set_data : list of set
        One set of items per transaction.

    Returns
    -------
    set of str
        All rules accepted by ``itemset_rules_generation``.
    """
    rules_set = set()
    for i in range(len(frequent_itemsets_dict), 1, -1):
        # .get tolerates a missing level instead of raising KeyError.
        for itemset_string in frequent_itemsets_dict.get(i, ()):
            itemset_string_list = itemset_string.split(",")
            itemset_list_set = set(itemset_string_list)
            # The previous "itemset not in rules_set" guard compared a set of
            # items with a set of rule strings and therefore never filtered
            # anything; it has been dropped.
            support_count_numerator = sum(
                1 for gene_list_set in gene_set_data
                if itemset_list_set.issubset(gene_list_set)
            )
            if support_count_numerator:
                itemset_rules_generation(itemset_list_set, itemset_string_list, i - 1,
                                         support_count_numerator, confidence,
                                         gene_set_data, rules_set)
    return rules_set

In [6]:
def itemset_rules_generation(itemset_list_set,itemset_list, item_max_size, support_count_numerator, confidence, gene_set_data, rules_set):
    """Add every sufficiently confident rule of one itemset to ``rules_set``.

    For each left-hand side made of 1..``item_max_size`` items of
    ``itemset_list`` the remaining items form the right-hand side, giving the
    rule string ``"<left items>-><right items>"`` (items comma-joined). The
    right-hand side is emitted in sorted order so that the same rule always
    yields the same string, regardless of set iteration order.

    Parameters
    ----------
    itemset_list_set : set of str
        The items of the itemset.
    itemset_list : list of str
        The same items as an ordered list; left-hand sides follow this order.
    item_max_size : int
        Largest left-hand side to generate.
    support_count_numerator : int
        Number of transactions containing the whole itemset.
    confidence : int or float
        Minimum confidence in percent.
    gene_set_data : list of set
        One set of items per transaction.
    rules_set : set of str
        Accumulator; updated in place and also returned.

    Returns
    -------
    set of str
        ``rules_set`` with the accepted rules added.
    """
    for item_size in range(item_max_size, 0, -1):
        for combination in itertools.combinations(itemset_list, item_size):
            left_items = set(combination)
            # Sorted so the rule text is deterministic across runs.
            right_items = sorted(left_items.symmetric_difference(itemset_list_set))
            rule = ",".join(combination) + "->" + ",".join(right_items)
            if rule in rules_set:
                continue
            support_count_denominator = sum(
                1 for gene_list_set in gene_set_data
                if left_items.issubset(gene_list_set)
            )
            if support_count_denominator == 0:
                # Left-hand side never occurs: confidence is undefined.
                continue
            confidence_for_rule = (support_count_numerator / support_count_denominator) * 100
            if confidence_for_rule >= confidence:
                rules_set.add(rule)
    return rules_set

In [7]:
def general_template_helper(template_number,rules_set):
    """Interactively query ``rules_set`` with one of three templates.

    * ``"1"`` -- prompts for a rule part, a quantifier and an item list and
      delegates to ``template1_helper``.
    * ``"2"`` -- prompts for a rule part and a minimum size and delegates to
      ``template2_helper``.
    * ``"3"`` -- prompts for a combination such as ``1or2`` or ``2and1``,
      runs the two sub-templates in the order given (each one prompts for
      its own parameters) and returns the union (``or``) or the
      intersection (``and``) of their results.

    Parameters
    ----------
    template_number : str
        ``"1"``, ``"2"`` or ``"3"``.
    rules_set : set of str
        Rules of the form ``"<items>-><items>"``.

    Returns
    -------
    set of str
        The rules selected by the template.

    Raises
    ------
    ValueError
        If the template number or the template 3 expression is not
        recognised, or if the number entered for template 2 is not an
        integer.
    """
    if template_number == "1":
        rhb = input("Enter the first parameter - RULE|HEAD|BODY: ").strip().upper()
        ann = input("Enter the second parameter - ANY|NONE|NUMBER: ").strip().upper()
        raw_items = input("Enter the itemset - ITEM1,ITEM2,..,ITEMN: ").strip()
        # Tolerate blanks around the commas ("A, B").
        itemset = [item.strip() for item in raw_items.split(',')]
        return template1_helper(rhb, ann, itemset, rules_set)

    if template_number == "2":
        rhb = input("Enter the first parameter - RULE|HEAD|BODY: ").strip().upper()
        number = int(input("Enter the second parameter - Any valid number: "))
        return template2_helper(rhb, number, rules_set)

    if template_number == "3":
        expression = input("Enter the paramater for template 3: ").lower()
        # "or" is checked first, as before.
        if "or" in expression:
            keyword = "or"
        elif "and" in expression:
            keyword = "and"
        else:
            raise ValueError("Template 3 expects '<1|2>or<1|2>' or '<1|2>and<1|2>', got: " + repr(expression))

        template_set = [part.strip() for part in expression.split(keyword)]
        if len(template_set) != 2 or any(part not in ("1", "2") for part in template_set):
            raise ValueError("Template 3 expects '<1|2>or<1|2>' or '<1|2>and<1|2>', got: " + repr(expression))

        rule_selected1 = general_template_helper(template_set[0], rules_set)
        rule_selected2 = general_template_helper(template_set[1], rules_set)
        if keyword == "or":
            return rule_selected1.union(rule_selected2)
        return rule_selected1.intersection(rule_selected2)

    raise ValueError("Unknown template number: " + repr(template_number))
        
                
def template1_helper(rhb,ann,itemset,rules_set):
    """Select rules by how many of the given items appear in a rule part.

    Parameters
    ----------
    rhb : str
        ``"RULE"`` (both sides), ``"HEAD"`` (text before ``->``) or
        ``"BODY"`` (text after ``->``). Any other value selects an empty
        part.
    ann : str
        ``"ANY"`` (at least one item present), ``"NONE"`` (no item present)
        or a digit string (exactly that many items present). Any other
        value selects nothing.
    itemset : iterable of str
        Items to look for.
    rules_set : iterable of str
        Rules of the form ``"<items>-><items>"`` with comma-joined items.

    Returns
    -------
    set of str
        The matching rules.
    """
    wanted = set(itemset)
    matches = set()
    for rule in rules_set:
        sides = rule.split('->')
        before_arrow = set(sides[0].split(','))
        after_arrow = set(sides[1].split(','))
        part_choices = {
            "RULE": before_arrow | after_arrow,
            "BODY": after_arrow,
            "HEAD": before_arrow,
        }
        scope = part_choices.get(rhb, set())
        hits = len(wanted & scope)

        if ann == "ANY":
            keep = hits > 0
        elif ann == "NONE":
            keep = hits == 0
        elif ann.isdigit():
            keep = hits == int(ann)
        else:
            keep = False

        if keep:
            matches.add(rule)
    return matches

def template2_helper(rhb,number,rules_set):
    """Select rules whose chosen part holds at least ``number`` items.

    Parameters
    ----------
    rhb : str
        ``"RULE"`` (both sides), ``"HEAD"`` (text before ``->``) or
        ``"BODY"`` (text after ``->``). Any other value counts as an empty
        part (size 0).
    number : int
        Minimum number of distinct items in the chosen part.
    rules_set : iterable of str
        Rules of the form ``"<items>-><items>"`` with comma-joined items.

    Returns
    -------
    set of str
        The matching rules.
    """
    def part_size(rule):
        # Distinct item count of the requested part of one rule.
        sides = rule.split('->')
        before_arrow = set(sides[0].split(','))
        after_arrow = set(sides[1].split(','))
        if rhb == "RULE":
            return len(before_arrow | after_arrow)
        if rhb == "BODY":
            return len(after_arrow)
        if rhb == "HEAD":
            return len(before_arrow)
        return 0

    return {rule for rule in rules_set if part_size(rule) >= number}

    

In [8]:
def mainFunction():
    """Run the interactive frequent-itemset and rule-mining session.

    Prompts for the data file and a support threshold in percent, mines the
    frequent itemsets level by level until a level comes back empty, prints
    the number of itemsets per level, then prompts for a confidence
    threshold in percent, generates the rules and finally runs one template
    query against them.

    The support percentage is converted once into an absolute transaction
    count (``support * transactions / 100``) and that count is used for
    every level, so the threshold is correct for data sets of any size.
    """
    filename = input("Enter the filename including location: ")
    support = int(input("Enter Support Threshold: "))
    candidate_itemset_dictionary, gene_set_list = file_preprocessing(filename)

    # Percentage -> minimum number of transactions an itemset must occur in.
    min_support_count = support * len(gene_set_list) / 100

    frequent_itemset_dictionary = dict()
    frequent_itemset_dictionary[1] = [
        item for item, count in candidate_itemset_dictionary.items()
        if count >= min_support_count
    ]

    length = 1
    frequent_itemset = frequent_itemset_dictionary[1]
    # Grow the itemsets one item at a time; the first empty level is stored
    # too, and ends the loop.
    while len(frequent_itemset) > 0:
        length += 1
        candidate_itemset = candidate_itemset_generation(frequent_itemset_dictionary[length-1],length)
        # frequent_itemset_generation compares raw counts with its threshold,
        # so it receives the absolute count rather than the percentage.
        frequent_itemset = frequent_itemset_generation(candidate_itemset,gene_set_list,length,min_support_count)
        frequent_itemset_dictionary[length] = frequent_itemset

    totalItems = 0
    for itemset_length, itemsets in frequent_itemset_dictionary.items():
        print("The number of frequent items of length",itemset_length,"are: ",len(itemsets))
        totalItems += len(itemsets)
    print("The number of all length frequent items are: ",totalItems)

    confidence = int(input("Enter Confidence threshold: "))
    rules_set = rules_generation(frequent_itemset_dictionary, confidence, gene_set_list)
    print("Total number of rules for support " +str(support)+ "% and confidence " +str(confidence)+ "% : "+str(len(rules_set)))
    template_number = input("Enter the Template number: ")
    remaining_rules = general_template_helper(template_number,rules_set)
    print("The number of rules for template",template_number,"is",len(remaining_rules))
    print(remaining_rules)
    
     
        
        

In [9]:
# Start the interactive session: mainFunction() prompts on stdin for the data
# file, the support and confidence thresholds, and the template query.
mainFunction()

Enter the filename including location: association-rule-test-data.txt
Enter Support Threshold: 50
The number of frequent items of length 1 are:  109
The number of frequent items of length 2 are:  63
The number of frequent items of length 3 are:  2
The number of frequent items of length 4 are:  0
The number of all length frequent items are:  174
Enter Confidence threshold: 70
Total number of rules for support 50% and confidence 70% : 117
Enter the Template number: 2
Enter the first parameter - RULE|HEAD|BODY: body
Enter the second parameter - Any valid number: 1
The number of rules for template 2 is 117
{'G72_Up->G59_Up', 'G59_Up->G96_Down', 'G13_Down->G82_Down', 'G82_Down->G13_Down', 'G52_Down->G38_Down', 'G41_Down->G38_Down', 'G82_Down->G97_Down', 'G38_Down->G70_Down', 'G24_Down->G88_Down', 'G24_Down->G54_Up', 'G38_Down->G47_Up', 'G88_Down->G8_Up', 'G59_Up,G96_Down->G72_Up', 'G13_Down->G28_Down', 'G47_Up->G28_Down', 'G54_Up->G88_Down', 'G38_Down->G1_Up', 'G88_Down->G38_Down', 'G1_Up->