In [7]:
# Import the required libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, random
from copy import copy, deepcopy
import sys
from collections import Counter
import queue
from scipy import stats
from sklearn.metrics import precision_recall_fscore_support
# Seed both NumPy's global RNG and Python's `random` module with the same fixed
# value so that any randomised step in this notebook is repeatable across runs.
np.random.seed(21)
random.seed(21)

In [8]:
# Short alias for the built-in frozenset (an immutable, hashable set, so it can
# be used as a dictionary key).
fz = frozenset

In [5]:
def compute_entropy(labels):
    """Return the Shannon entropy of ``labels`` using the natural logarithm.

    Parameters
    ----------
    labels : numpy.ndarray
        Array of class labels. It is flattened to find the distinct labels,
        while ``len(labels)`` (the size of the first axis) is used as the
        number of samples.

    Returns
    -------
    float
        ``-sum(p * log(p))`` over the distinct labels, where ``p`` is the
        fraction of samples carrying that label. ``0.0`` when ``labels`` is
        empty.
    """
    n_total = len(labels)
    distinct_values = set(labels.reshape(-1))

    result = 0.0
    for value in distinct_values:
        fraction = np.sum(labels == value) / n_total
        # Skip (near-)zero probabilities so log() is never evaluated at 0.
        if not fraction > 1e-12:
            continue
        result = result - np.log(fraction) * fraction
    return result

In [38]:
def get_entropy(mask, labels):
    """Return the size-weighted sum of label entropies of the groups induced by ``mask``.

    Samples are grouped by their value in ``mask`` (for a boolean rule mask:
    covered vs. not covered). For each group the entropy of its labels is
    computed with ``compute_entropy`` and multiplied by the group size.

    The result is a *sum*, not an average: it is not divided by the total
    number of samples. Callers that need the weighted-average entropy must
    divide by ``len(mask)`` themselves.

    Parameters
    ----------
    mask : array-like
        One value per sample; flattened before use. Typically a boolean mask,
        but any array of discrete, hashable values works.
    labels : numpy.ndarray
        Class labels, indexable along the first axis by sample position.

    Returns
    -------
    float
        ``sum(len(group) * entropy(labels[group]))`` over all groups; ``0.0``
        when ``mask`` is empty.
    """
    # The original body read an undefined name (`data_i`) and ignored `mask`;
    # the grouping is driven by the `mask` argument itself.
    mask = np.asarray(mask).reshape(-1)

    attr_split_info = 0.0
    for attr_val in set(mask):
        ids = np.where(mask == attr_val)[0]
        attr_split_info += len(ids) * compute_entropy(labels[ids])
    return attr_split_info

In [18]:
def getBestRuleCN2(all_cond, dataset, labels):
    """Beam-search for the conjunction of conditions that best reduces label entropy.

    A condition is a pair ``(idx, val)`` and is satisfied by the samples where
    ``dataset[:, idx] == val``. A rule is a frozenset of such conditions (at
    most one per attribute) and covers the samples satisfying all of them.

    Starting from the empty rule, every rule in the beam is extended by each
    condition on an attribute it does not use yet. An extended rule is kept
    only if it covers more than ``min_significant`` samples. Its score is the
    weighted-average entropy of the covered / not-covered split, i.e.
    ``get_entropy(mask, labels) / n_samples``. The ``max_rules_count``
    lowest-scoring rules form the next beam. The search ends when no rule can
    be extended any further.

    Parameters
    ----------
    all_cond : iterable of (int, value)
        Candidate conditions; each must be hashable (e.g. a tuple).
    dataset : numpy.ndarray
        2-D array of attribute values, one row per sample.
    labels : numpy.ndarray
        Class labels aligned with the rows of ``dataset``.

    Returns
    -------
    (frozenset, numpy.ndarray) or None
        The best rule and its boolean coverage mask over ``dataset`` rows, or
        ``None`` when no significant rule scores strictly below the entropy of
        ``labels`` (i.e. no positive gain).
    """
    min_significant = 10   # a rule must cover MORE than this many samples
    max_rules_count = 10   # beam width

    n_samples = dataset.shape[0]
    old_entropy = compute_entropy(labels)
    min_entropy = float('inf')
    best_rule_set = None
    best_rule_mask = None

    # Beam entries have the form {rule: (score, mask)}. Start from the empty
    # rule, which covers every sample and has the baseline entropy.
    candidates = {frozenset(): (old_entropy, np.ones(n_samples, dtype=bool))}

    while candidates:
        next_candidates = {}
        for rule_fz, (_, rule_mask) in candidates.items():
            rule_attr = {cond[0] for cond in rule_fz}
            for new_cond in all_cond:
                # One condition per attribute: {A_i = v1, A_i = v2} can never
                # be satisfied.
                if new_cond[0] in rule_attr:
                    continue
                new_rule = rule_fz | {new_cond}
                # The same conjunction can be reached in a different order;
                # it has the same mask and score, so evaluate it only once.
                if new_rule in next_candidates:
                    continue
                # Parentheses are required: `&` binds tighter than `==`.
                new_rule_mask = rule_mask & (dataset[:, new_cond[0]] == new_cond[1])
                if np.sum(new_rule_mask) > min_significant:  # significance check
                    # get_entropy returns a count-weighted SUM; divide by the
                    # sample count so the score is comparable to old_entropy.
                    new_rule_entropy = get_entropy(new_rule_mask, labels) / n_samples
                    next_candidates[new_rule] = (new_rule_entropy, new_rule_mask)
                    # Track the best rule across all depths: a longer rule
                    # found later may still score lower.
                    if new_rule_entropy < min_entropy:
                        min_entropy = new_rule_entropy
                        best_rule_set = new_rule
                        best_rule_mask = new_rule_mask

        # Keep the lowest-scoring rules. Sort on the score only; sorting on the
        # whole (score, mask) tuple would compare arrays whenever scores tie.
        ranked = sorted(next_candidates, key=lambda rule: next_candidates[rule][0])
        candidates = {rule: next_candidates[rule] for rule in ranked[:max_rules_count]}

    if best_rule_set is not None and min_entropy < old_entropy:
        return best_rule_set, best_rule_mask
    return None
    
    
    

In [13]:
def CN2_train(dataset,labels,all_conds):
    """Learn an ordered list of rules by repeatedly covering the data.

    Each round asks ``getBestRuleCN2`` for the best rule on the samples that
    are still uncovered, records it together with the majority class of the
    samples it covers, and removes those samples. Training stops when no
    samples remain or when ``getBestRuleCN2`` returns ``None``.

    Parameters
    ----------
    dataset : numpy.ndarray
        Attribute values, one row per sample. Deep-copied; not modified.
    labels : numpy.ndarray
        Class labels aligned with ``dataset``. Deep-copied; not modified.
    all_conds : iterable
        Candidate conditions, passed through to ``getBestRuleCN2``.

    Returns
    -------
    list of (rule, majority_class)
        Rules in the order they were learned (ordered-rules semantics).
    """
    remaining_data = deepcopy(dataset)
    remaining_labels = deepcopy(labels)
    learned_rules = []

    while True:
        if len(remaining_labels.reshape(-1)) == 0:
            break

        found = getBestRuleCN2(all_conds, remaining_data, remaining_labels)
        if found is None:
            break

        covered = found[1]
        uncovered = np.logical_not(covered)

        # The rule predicts the most frequent class among the samples it covers.
        covered_labels = remaining_labels[covered]
        class_counts = Counter(covered_labels.reshape(-1))
        top_class = class_counts.most_common(1)[0][0]
        learned_rules.append((found[0], top_class))

        # Only still-uncovered samples take part in the next round.
        remaining_data = remaining_data[uncovered]
        remaining_labels = remaining_labels[uncovered]

    return learned_rules
        

In [14]:
def get_to_check_tuples(rile_list):
    """Convert learned rules into array form for fast matching.

    Parameters
    ----------
    rile_list : iterable of (rule, majority_class)
        Each ``rule`` is an iterable of ``(attribute_index, value)``
        conditions. (The parameter name is a historical misspelling of
        ``rule_list``; it is kept so existing calls keep working.)

    Returns
    -------
    list of (attr_list, val_list, majority_class)
        ``attr_list`` is an ``int32`` array of attribute indices and
        ``val_list`` the array of required values, position-aligned, so a
        sample ``s`` satisfies the rule when ``np.all(s[attr_list] == val_list)``.
    """
    to_check_tuples = []
    # Iterate the argument itself; the original body read an unrelated
    # `rule_list` name instead of this parameter.
    for rule, maj_class in rile_list:
        # Snapshot the conditions once so indices and values stay aligned.
        conds = list(rule)
        attr_list = np.asarray([cond[0] for cond in conds], dtype=np.int32)
        val_list = np.asarray([cond[1] for cond in conds])
        to_check_tuples.append((attr_list, val_list, maj_class))
    return to_check_tuples
        
            

In [15]:
def CN2_infer(dataset,rule_list, default_class = -1):
    """Predict a class for every row of ``dataset`` with an ordered rule list.

    Rules are tried in order and the first one whose conditions all hold
    decides the class of the sample. Samples matched by no rule keep
    ``default_class``.

    Parameters
    ----------
    dataset : numpy.ndarray
        Attribute values, one row per sample.
    rule_list : list of (rule, majority_class)
        Rules in priority order; converted with ``get_to_check_tuples``.
    default_class : int, optional
        Label for samples matched by no rule (default ``-1``).

    Returns
    -------
    numpy.ndarray
        ``int32`` array with one predicted label per row.
    """
    n_samples = dataset.shape[0]
    predictions = np.zeros((n_samples,), dtype = np.int32)
    predictions += default_class

    compiled_rules = get_to_check_tuples(rule_list)
    for row_no, row in enumerate(dataset):
        for attrs, vals, rule_class in compiled_rules:
            if not np.all(row[attrs] == vals):
                continue
            # First matching rule wins; later rules are not consulted.
            predictions[row_no] = rule_class
            break
    return predictions
  