In [130]:
import numpy as np
import pandas as pd
import math

In [154]:
# Alternative input for experiments: a random 1000 x 100 binary filter set.
# filters = np.random.randint(2, size=(1000, 100))

# Read the example filter set and discard the 'Unnamed: 0' column
# (presumably the row index that was written out with the CSV -- confirm).
filters = (
    pd.read_csv(r"C:\Users\joles\Downloads\ex_filter_set (1).csv")
    .drop(columns='Unnamed: 0')
)
filters

Unnamed: 0,s1,s2,s3,s4,s5,s6
0,1,0,1,0,1,0
1,0,1,0,1,0,0
2,1,1,1,1,0,0
3,1,0,0,0,1,1
4,0,1,0,1,1,0
5,1,0,0,0,1,0
6,1,1,1,1,1,0
7,1,1,1,1,1,1
8,0,0,1,0,0,1
9,0,1,0,0,0,1


In [260]:

class Node:
    '''
    Multi-criterion decision tree to search the filter set.

    A node holds a set of filters in ``theta`` (one filter per row, one 0/1
    element per column). ``build_tree`` recursively splits that set on the
    column with the largest information gain, sending rows whose value is 0 to
    ``left`` and rows whose value is 1 to ``right``. ``choose_branch`` scores
    the two children of a node against a query subset.
    '''


    def __init__(self, theta:pd.DataFrame, depth = None, max_depth = None, num_filters = None):
        '''
        Parameters
        ----------
        theta : pd.DataFrame
            Filter set held by this node (rows = filters, columns = elements).
        depth : int, optional
            Depth of this node in the tree; defaults to 0.
        max_depth : int, optional
            Nodes whose depth has reached this value are not split;
            defaults to 0.
        num_filters : int, optional
            Number of filters in the node; a node is split only when this is
            greater than 1. Defaults to the number of rows of ``theta``.
        '''
        # Defaulting to the row count keeps the value consistent with what
        # build_tree passes for child nodes (the size of each split).
        self.num_filters = theta.shape[0] if num_filters is None else num_filters
        # self.min_filters = min_filters if min_filters else 0

        self.depth = depth if depth else 0

        #Max depth = size of subset
        self.max_depth = max_depth if max_depth else 0

        #initialize left and right node to be empty
        self.left = None
        self.right = None

        #column chosen by build_tree; None until the tree has been built
        self.split_elem = None

        #filter set
        self.theta = theta

    def calc_weights(self, filters, depth):
        '''
        Return ``(w1, w2)`` for the column of ``filters`` labelled ``depth``.

        ``w1`` is the column sum divided by the number of rows (the fraction
        of ones for a 0/1 column) and ``w2`` is ``1 - w1``.
        '''
        column = filters[depth]
        w1 = sum(column)/(column.shape[0])
        w2 = 1 - w1
        return w1, w2

    #returns the 'average' filter
    def GET_avg(self):
        '''
        Return the column-wise mean of ``theta`` as a NumPy array
        (one entry per column, in column order).
        '''
        return self.theta.sum(axis=0).to_numpy()/(self.theta.shape[0])

    #calculate entropy on split element
    @staticmethod
    def calc_entropy(df):
        '''
        Sum of per-column binary entropies (natural log) of ``df``.

        For each column the counts of 0s and 1s are divided by the number of
        rows; columns in which either count is zero contribute nothing.
        An empty frame therefore has entropy 0.
        '''
        size = df.shape[0]

        s = 0
        for col in df.columns:
            zeros = (df[col] == 0).sum()
            ones = (df[col] == 1).sum()
            # Skip pure columns: they add no entropy and would need log(0).
            if zeros*ones > 0:
                s = s + (zeros/size*np.log(zeros/size)) + (ones/size*np.log(ones/size))
        return -s


    # information gain for split element

    def info_gains(self, data):
        '''
        Return a list with the information gain of splitting ``data`` on each
        of its columns (same order as ``data.columns``).

        The gain is the entropy of ``data`` minus the size-weighted entropy of
        the two parts (value 0 / value 1). For an empty frame every gain is 0.
        '''
        size = data.shape[0]
        if size == 0:
            # Nothing to split; also avoids dividing by a zero row count below.
            return [0.0 for _ in data.columns]

        entropy_before = self.calc_entropy(data)
        info_gains = []
        for col in data.columns:
            left, right = data[data[col] == 0], data[data[col] == 1]
            left_size, right_size = left.shape[0], right.shape[0]
            entropy_after = (left_size/size)*self.calc_entropy(left) + (right_size/size)*self.calc_entropy(right)
            info_gains.append(entropy_before - entropy_after)
        return info_gains


    def build_tree(self):
        '''
        Choose this node's split column and recursively build its children.

        ``split_elem`` is set to the column with the highest information gain
        (first one on ties). Children are created only while
        ``depth < max_depth`` and ``num_filters > 1``, and only when the split
        actually separates the filters: if every filter falls on the same
        side, the node stays a leaf instead of spawning an empty child.
        '''
        filters = self.theta

        # A node without rows or without columns has nothing to split on.
        if filters.shape[0] == 0 or filters.shape[1] == 0:
            self.split_elem = None
            return

        self.split_elem = filters.columns[np.argmax(self.info_gains(filters))]

        if not ((self.depth < self.max_depth) and (self.num_filters > 1)):
            return

        l_split = filters[filters[self.split_elem] == 0]
        r_split = filters[filters[self.split_elem] == 1]

        # Degenerate split (e.g. all remaining filters are identical on the
        # chosen column): recursing would only build an empty child.
        if l_split.shape[0] == 0 or r_split.shape[0] == 0:
            return

        self.left = Node(l_split,
                         self.depth + 1,
                         self.max_depth,
                         num_filters = l_split.shape[0])
        self.left.build_tree()

        self.right = Node(r_split,
                          self.depth + 1,
                          self.max_depth,
                          num_filters = r_split.shape[0])
        self.right.build_tree()

    def calc_mean_distance(self, subset, weights):
        '''
        Weighted distance between ``subset`` and this node's average filter.

        With ``p = GET_avg()``, element ``i`` contributes
        ``weights[i] * (p[i] + (1 - 2*p[i]) * subset[i])``, i.e. ``p[i]`` when
        ``subset[i]`` is 0 and ``1 - p[i]`` when it is 1.
        '''
        # Compute the average filter once instead of once per element.
        avg = self.GET_avg()
        return sum(weights[i] * (avg[i]+(1-2*avg[i])*subset[i]) for i in range(len(subset)))

    def calc_variance(self, weights):
        '''
        Return ``sum(weights[i]**2 * p[i] * (1 - p[i]))`` with ``p = GET_avg()``.
        '''
        avg = self.GET_avg()
        return sum((weights[i]**2) * avg[i]*(1-avg[i]) for i in range(len(avg)))

    def _expected_min_distance(self, subset, weights):
        '''
        Heuristic estimate of the smallest distance from ``subset`` to any
        filter in this node: ``mean - sqrt(variance) * sqrt(2 * ln(n))`` where
        ``n`` is the number of filters in the node.
        '''
        n = self.theta.shape[0]
        if n == 0:
            # No filters to be close to; also avoids log(0) / sqrt(-inf).
            return math.inf
        mean = self.calc_mean_distance(subset=subset, weights=weights)
        var = self.calc_variance(weights=weights)
        return mean - pow(var, 0.5)*math.sqrt(2*np.log(n))

    def choose_branch(self, subset, weights):
        '''
        Return the smaller of the two children's expected minimum distances
        to ``subset``.

        Raises
        ------
        AttributeError
            If this node is a leaf (it has no ``left``/``right`` children).
            The exception type matches what the unguarded attribute access on
            ``None`` used to raise, but the message now names the cause.
        '''
        if self.left is None or self.right is None:
            raise AttributeError(
                f"choose_branch() needs a node with two children, but the node at "
                f"depth {self.depth} is a leaf"
            )

        #calculate minimum expected distance to average filter
        ex_distance_left = self.left._expected_min_distance(subset, weights)
        ex_distance_right = self.right._expected_min_distance(subset, weights)

        return min(ex_distance_left , ex_distance_right)

In [261]:
# Small hand-made filter set: 5 filters (rows) over 4 elements (columns).
theta = pd.DataFrame(
    [
        [0, 1, 1, 0],
        [0, 0, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [0, 1, 1, 1],
    ]
)

# The root holds every filter; with max_depth=1 only the root may be split.
root = Node(theta=theta, max_depth=1, num_filters=len(theta))
root.build_tree()

In [247]:
# Per-element weights and the query subset (one 0/1 entry per element).
weights = np.array([0.5, 1, 1, 1])
subset = np.array([0, 1, 0, 0])

# Average filter of each child of the root.
r_avg = root.right.GET_avg()
l_avg = root.left.GET_avg()

# Mean of the last element among the filters in the right child.
r_avg[3]

0.0

In [251]:
# Heuristic approach based on a Gaussian approximation, worked by hand for the
# two children of the root (s10 = left child, s11 = right child).

# Mean distance from `subset` to each child's average filter.
mu_d_s10 = sum(w * (p + (1 - 2*p)*s) for w, p, s in zip(weights, l_avg, subset))
mu_d_s11 = sum(w * (p + (1 - 2*p)*s) for w, p, s in zip(weights, r_avg, subset))

# Variance of that distance, and its square root.
var_s10 = sum((w**2) * p*(1 - p) for w, p, _ in zip(weights, l_avg, subset))
var_s11 = sum((w**2) * p*(1 - p) for w, p, _ in zip(weights, r_avg, subset))
sigma_s10 = var_s10 ** 0.5
sigma_s11 = var_s11 ** 0.5

# Approximate expected minimum of 'n' independent identically distributed
# distances, where n is the number of filters in the child.
n_s10 = root.left.theta.shape[0]
n_s11 = root.right.theta.shape[0]
E_min_s10 = mu_d_s10 - sigma_s10 * math.sqrt(2*np.log(n_s10))
E_min_s11 = mu_d_s11 - sigma_s11 * math.sqrt(2*np.log(n_s11))
del n_s10, n_s11  # temporaries only; keep the notebook namespace unchanged

var_s11

0.25

In [264]:
# NOTE: this call fails with AttributeError. The tree was built with
# max_depth=1, so root.left is a leaf: its own `left`/`right` are still None
# and there are no children for choose_branch to compare.
root.left.choose_branch(subset=subset, weights=weights)

AttributeError: 'NoneType' object has no attribute 'calc_mean_distance'

In [256]:
# Step-by-step version of Node.choose_branch for the root's two children.

# Left child: mean distance, variance, expected minimum distance.
left_avg = root.left.calc_mean_distance(subset=subset, weights=weights)
left_var = root.left.calc_variance(weights=weights)
left_scale = math.sqrt(2*np.log(root.left.theta.shape[0]))
ex_distance_left = left_avg - (left_var ** 0.5)*left_scale

# Right child: same three quantities.
right_avg = root.right.calc_mean_distance(subset=subset, weights=weights)
right_var = root.right.calc_variance(weights=weights)
right_scale = math.sqrt(2*np.log(root.right.theta.shape[0]))
ex_distance_right = right_avg - (right_var ** 0.5)*right_scale

del left_scale, right_scale  # temporaries only

ex_distance_right

0.41129498874226267

In [257]:
def choose_branch(self, subset, weights):
    """Return the smaller expected minimum distance among ``self``'s children.

    For each child the estimate is
    ``mean_distance - sqrt(variance) * sqrt(2 * ln(n))`` where ``n`` is the
    number of rows in the child's ``theta``.

    Parameters
    ----------
    self : node-like
        Object with ``left`` and ``right`` children, each exposing
        ``calc_mean_distance``, ``calc_variance`` and ``theta``.
    subset, weights
        Passed through to the children's ``calc_mean_distance`` /
        ``calc_variance``.

    Raises
    ------
    AttributeError
        If ``self`` has no ``left`` or ``right`` child.
    """
    # Use the node that was passed in, not the notebook-global `root`:
    # otherwise every call silently returns the answer for the root.
    if self.left is None or self.right is None:
        raise AttributeError("choose_branch() needs a node with two children, but got a leaf")

    def expected_min(child):
        mean = child.calc_mean_distance(subset=subset, weights=weights)
        var = child.calc_variance(weights=weights)
        return mean - pow(var, 0.5)*math.sqrt(2*np.log(child.theta.shape[0]))

    return min(expected_min(self.left), expected_min(self.right))

In [259]:
# Evaluate the standalone helper on the root; the result should equal the
# smaller of ex_distance_left / ex_distance_right computed by hand above.
choose_branch(root, subset=subset, weights=weights)

0.41129498874226267