In [1]:
import numpy as np
import pandas as pd

from ordinor.io import read_disco_csv

from ordinor.execution_context.rule_based import AtomicRule, Rule
from ordinor.execution_context.rule_based import NumericRuleGenerator, CategoricalRuleGenerator

# Rule generators used by find_split when searching for candidate splits:
# attributes typed 'numeric' go through `num_rule_gen`, all others through `cat_rule_gen`.
num_rule_gen = NumericRuleGenerator.HistogramSplit
cat_rule_gen = CategoricalRuleGenerator.RandomTwoSubsetPartition

from ordinor.execution_context.rule_based import impurity, _dispersal_event_pdist, _dispersal_node_pdist

KeyboardInterrupt: 

In [None]:
# Load the event log; the path is relative to the notebook's working directory.
# NOTE(review): presumably a Disco-style CSV export, given the reader's name - confirm.
el = read_disco_csv('data/processed/wabo.csv')
# Bare last expression so the loaded log is shown as the cell output.
el

In [None]:
# specification
# Candidate attributes that find_split may split a node on. Each entry has:
#   'attr'      - name of the event attribute, passed on to the rule generators
#   'attr_type' - 'numeric' selects num_rule_gen; any other value is handled as categorical
#   'attr_dim'  - 'CT', 'AT' or 'TT'; decides which type label of a child node is renewed
#                 by a split (evaluate_split raises ValueError for any other value)
#                 NOTE(review): presumably case / activity / time type - confirm.

all_cand_attrs = [
    {'attr': 'ct:channel', 'attr_type': 'categorical', 'attr_dim': 'CT'},
    #{'attr': 'concept:name', 'attr_type': 'categorical', 'attr_dim': 'AT'},
    {'attr': 'tt:weekday', 'attr_type': 'categorical', 'attr_dim': 'TT'}, 
    {'attr': 'tt:ampm', 'attr_type': 'categorical', 'attr_dim': 'TT'},
]

# Bare last expression: echo the specification as the cell output.
all_cand_attrs

In [None]:
# state used for scoring candidate splits

# resource of each event (treated as constant throughout the search)
m_event_r = el['org:resource'].copy()

# node membership of each event: everything starts in the root node (id 0)
m_event_node = pd.Series([0] * len(el), index=el.index)

# CT/AT/TT type labels per node; starts out empty
m_node_t = {}

# sources of fresh labels, each yielding 1, 2, 3, ...
from itertools import count

gen_node_label = count(1)
gen_ct_label = count(1)
gen_at_label = count(1)
gen_tt_label = count(1)

In [None]:
# module-level search state and the node queue

initial_impurity = impurity(m_event_node, m_event_r)
print(f'Initial impurity =\t{initial_impurity}')

# running scores of the current tree; a split's target must exceed val_target
val_target = -np.inf
val_dispersal = 0.0
val_impurity = initial_impurity

# dimensions (CT/AT/TT) on which a split has been applied
dims = set()

# the root node holds every event and carries label 0 in all three dimensions
root = {
    'event_ids': el.index,
    'rule': AtomicRule(),
    'node_id': 0,
    'CT': 0,
    'AT': 0,
    'TT': 0,
}
nodes = [root]
m_node_t[root['node_id']] = {dim: root[dim] for dim in ('CT', 'AT', 'TT')}

initial_dispersal2 = _dispersal_node_pdist(
    pd.DataFrame.from_dict(m_node_t, orient='index'),
    m_event_node,
    m_event_r,
)
print(f'Initial dispersal2 =\t{initial_dispersal2}')


In [None]:
# evaluation block
# evaluate candidate splits by applying the rules
def evaluate_split(node, cand_rules, attr_dim, log):
    """Score a candidate split of `node` without touching the real tree state.

    The split is applied to copies of the module-level `m_event_node` and
    `m_node_t`: the parent node is removed, and every rule in `cand_rules`
    creates one child node that inherits the parent's CT/AT/TT labels except
    for a fresh label in the dimension `attr_dim`.

    Parameters
    ----------
    node : dict
        Node being split; must provide 'node_id', 'CT', 'AT' and 'TT'.
    cand_rules : iterable
        Rules forming one candidate split; each rule yields one partition
        via `rule.apply(log, index_only=True)`.
    attr_dim : str
        Dimension of the splitting attribute: 'CT', 'AT' or 'TT'.
    log : pandas.DataFrame
        Events of `node`, to which the rules are applied.

    Returns
    -------
    tuple
        `(impurity, dispersal)` of the tree that would result from the split.

    Raises
    ------
    ValueError
        If `attr_dim` is not one of 'CT', 'AT', 'TT'.

    Notes
    -----
    Fresh labels are drawn from the shared module-level generators, so every
    evaluation advances them even though the candidate tree is discarded.
    """
    # Resolve the label generator once and fail fast, before any label is
    # consumed or any copy is made.
    label_generators = {'CT': gen_ct_label, 'AT': gen_at_label, 'TT': gen_tt_label}
    if attr_dim not in label_generators:
        raise ValueError(
            f"Unknown attribute dimension {attr_dim!r}; expected one of 'CT', 'AT', 'TT'"
        )
    dim_label_gen = label_generators[attr_dim]

    # Independent copies for evaluation. The dict copy is shallow, which is
    # sufficient because entries are only removed or added, never edited.
    cand_m_event_node = m_event_node.copy()
    cand_m_node_t = m_node_t.copy()

    # the parent node is replaced by its children
    del cand_m_node_t[node['node_id']]

    for rule in cand_rules:
        par = rule.apply(log, index_only=True)

        next_node_label = next(gen_node_label)
        cand_m_event_node.loc[par] = next_node_label

        # inherit from the parent node, then renew the label of the split dimension
        child_types = {'CT': node['CT'], 'AT': node['AT'], 'TT': node['TT']}
        child_types[attr_dim] = next(dim_label_gen)
        cand_m_node_t[next_node_label] = child_types

    imp = impurity(cand_m_event_node, m_event_r)
    dis2 = _dispersal_node_pdist(
        m_co_t=pd.DataFrame.from_dict(cand_m_node_t, orient='index'),
        m_event_co=cand_m_event_node,
        m_event_r=m_event_r,
    )

    return imp, dis2

In [None]:
# define optimization goal (Reduction Ratio; the amount of reduction - the larger the better)
def f_target(delta_dis, base_dis, delta_imp, base_imp):
    """Return the optimisation target of a split (larger is better).

    The target is the negated sum of two relative changes:
    `delta_dis / base_dis` and `delta_imp / base_imp`. When `base_dis`
    is zero, the dispersal ratio is fixed to 1 instead of dividing.

    Parameters
    ----------
    delta_dis : float
        Change in dispersal caused by the split.
    base_dis : float
        Dispersal before the split.
    delta_imp : float
        Change in impurity caused by the split.
    base_imp : float
        Impurity before the split; used as a divisor without a zero guard.

    Returns
    -------
    float
    """
    rr_dis = 1 if base_dis == 0 else delta_dis / base_dis
    rr_imp = delta_imp / base_imp
    return -1 * (rr_dis + rr_imp)

In [None]:
# find the next node from the node queue (best-first)

def find_next_node(q):
    """Pop and return the queued node whose best split scores highest (best-first).

    Every node in `q` is scored by calling `find_split`; nodes for which no
    split is found are skipped and stay in the queue. The best-scoring node
    is removed from `q` and returned only if its target is strictly greater
    than the module-level `val_target`.

    Parameters
    ----------
    q : list of dict
        Queue of candidate nodes; mutated in place when a node is selected.

    Returns
    -------
    dict or None
        The selected node, or None if the queue is empty, no node has a
        split, or no split beats `val_target`.
    """
    if len(q) == 0:
        return None

    best_target = -np.inf
    best_index = None
    for i, node in enumerate(q):
        split = find_split(node)
        if split is None:
            continue
        if split['target'] > best_target:
            best_target = split['target']
            best_index = i

    # best_index stays None when no queued node produced a split
    if best_index is not None and best_target > val_target:
        return q.pop(best_index)
    return None

In [None]:
# find a split on a single node

def find_split(node):
    """Find the best single-attribute split for `node`.

    For every attribute in the module-level `all_cand_attrs`, candidate rules
    are generated from the events of `node`, scored with `evaluate_split`, and
    turned into a target value with `f_target` relative to the current
    module-level `val_impurity` / `val_dispersal`.

    Parameters
    ----------
    node : dict
        Node to split; must provide 'event_ids' plus the keys read by
        `evaluate_split`.

    Returns
    -------
    dict or None
        The result with the largest target, with keys 'attr', 'attr_type',
        'attr_dim', 'rules', 'delta_impurity', 'delta_dispersal', 'target';
        None if no attribute yields a candidate split.
    """
    log = el.loc[node['event_ids']]

    results_cand_attrs = []

    for x in all_cand_attrs:
        attr = x['attr']
        attr_type = x['attr_type']
        attr_dim = x['attr_dim']

        cand_rules = None
        # (impurity, dispersal) of cand_rules when already known from the search below
        cand_scores = None

        # generate candidate splits
        if attr_type == 'numeric':
            # Generate rules from the node's own events, consistent with the
            # categorical branch and with the log the rules are evaluated on.
            cand_rules = num_rule_gen(attr, attr_dim, log)
        else:
            # evaluate over a sample of all possible two-subset partitionings
            # (x% of 2^(N-1) - 1 possibilities) and keep the best one
            curr_target = -np.inf
            curr_rules = None
            curr_scores = None
            for rules in cat_rule_gen(attr, attr_dim, log, n_sample=1):
                imp, dis = evaluate_split(node, rules, attr_dim, log)
                target = f_target(
                    dis - val_dispersal, val_dispersal,
                    imp - val_impurity, val_impurity,
                )
                # find the candidate to maximize target
                if target > curr_target:
                    curr_rules = rules
                    curr_target = target
                    curr_scores = (imp, dis)
            if curr_target > val_target:
                cand_rules = curr_rules
                cand_scores = curr_scores

        if cand_rules is None:
            # skip attribute if no candidate split can be generated
            continue

        # Reuse the scores of the winning categorical candidate; re-running the
        # evaluation would repeat the expensive scoring and consume more labels.
        if cand_scores is None:
            cand_scores = evaluate_split(node, cand_rules, attr_dim, log)
        imp, dis = cand_scores

        # NOTE: delta(y) = y_{t+1} - y_{t}
        # delta_impurity is expected to be negative (decreasing)
        delta_imp = imp - val_impurity
        # delta_dispersal is expected to be positive (increasing)
        delta_dis = dis - val_dispersal

        results_cand_attrs.append({
            'attr': attr,
            'attr_type': attr_type,
            'attr_dim': attr_dim,
            'rules': cand_rules,
            'delta_impurity': delta_imp,
            'delta_dispersal': delta_dis,
            'target': f_target(delta_dis, val_dispersal, delta_imp, val_impurity),
        })

    # select the attribute whose split leads to the largest target (Reduction Ratio);
    # a stable sort keeps the later attribute on ties
    if len(results_cand_attrs) > 0:
        return sorted(results_cand_attrs, key=lambda x: x['target'])[-1]
    else:
        return None

In [None]:
# main procedure
# Best-first growth of the tree: repeatedly take the most promising node from
# `nodes`, split it, register its children, and update the running scores.
# All state touched here (m_event_node, m_node_t, nodes, dims, val_impurity,
# val_dispersal and the label generators) is module-level and mutated in place,
# so re-running this cell without re-running the setup cells continues from the
# current state rather than starting over.

# NOTE(review): `eps` is not referenced anywhere in the visible cells.
eps = 0.1

while True:
    # find_next_node removes the returned node from `nodes`
    node = find_next_node(nodes)
    #print(node)
    
    if node is None:
        print('No next node to be split on')
        print('Procedure stops.')
        break
    # NOTE(review): find_next_node has already run find_split on this node; the
    # search is repeated here. If the rule generator is randomised (its name
    # suggests so), the split found now may differ from the one that was scored.
    split = find_split(node)
    
    if split is None:
        # the node has already left the queue, so it simply stays a leaf
        print("No split on this node could be found")
        print(f"No further expansion from leaf node-{node['node_id']}")
        continue
    else:   
        print(f"Split node-{node['node_id']} on attribute `{split['attr']}` ({split['attr_dim']})")
        
    dims.add(split['attr_dim'])
    # events of the node being split
    log = el.loc[node['event_ids']]
    
    for rule in split['rules']:
        # apply selected split
        
        # index of the events selected by this rule
        par = rule.apply(log, index_only=True)
        
        child_node_label = next(gen_node_label)
        
        # inherit from parent node
        m_node_t[child_node_label] = {'CT': node['CT'], 'AT': node['AT'], 'TT': node['TT']}
        
        # move the selected events from the parent to the new child node
        m_event_node.loc[par] = child_node_label
        # the child gets a fresh label only in the dimension that was split on
        attr_dim = split['attr_dim']
        if attr_dim == 'CT':
            ct_label = next(gen_ct_label)
            m_node_t[child_node_label]['CT'] = ct_label
        elif attr_dim == 'AT':
            at_label = next(gen_at_label)
            m_node_t[child_node_label]['AT'] = at_label
        elif attr_dim == 'TT':
            tt_label = next(gen_tt_label)
            m_node_t[child_node_label]['TT'] = tt_label
        else:
            raise ValueError
        
        # queue the child so that it can be split further
        child_node = {
            'event_ids': par, 
            'rule': rule, 
            'node_id': child_node_label, 
            'CT': m_node_t[child_node_label]['CT'], 
            'AT': m_node_t[child_node_label]['AT'], 
            'TT': m_node_t[child_node_label]['TT'], 
        }
        nodes.append(child_node)
    
    # the parent is no longer a leaf, so drop its entry
    del m_node_t[node['node_id']]
    
    print('Reduction Ratio\t\t=\t{:.3%}'.format(split['target']))
    
    # roll the deltas of the applied split into the running scores
    val_impurity += split['delta_impurity']
    val_dispersal += split['delta_dispersal']
    
    print('Dispersal\t\t=\t{:.3f}'.format(val_dispersal))
    print('Impurity\t\t=\t{:.3f}'.format(val_impurity))