In [2]:
import numpy as np
import pandas as pd
import math

In [3]:
# filters = np.random.randint(2, size= (1000, 100))
# filters = pd.read_csv(r"C:\Users\joles\Downloads\ex_filter_set (1).csv").drop('Unnamed: 0', axis = 1)
# filters

In [14]:

class Node:
    """Node of a binary decision tree used to search a set of binary filters.

    Each row of ``theta`` is one filter and each column one element of the
    filter; the code compares entries against 0 and 1. ``build_tree``
    recursively splits the rows on the column with the largest information
    gain, and ``traverse`` walks down the tree towards the branch expected to
    contain the closest filter, then scans that leaf exhaustively.

    Attributes:
        theta: The filters held by this node.
        depth: Depth of this node (root is 0).
        max_depth: Depth at which splitting and descent stop.
        num_filters: Number of filters assumed to be in ``theta``; a node with
            at most one filter is never split.
        left: Child holding rows whose split column equals 0, or ``None``.
        right: Child holding rows whose split column equals 1, or ``None``.
        split_elem: Column label this node was split on, or ``None`` for a
            leaf.
    """

    def __init__(self, theta: pd.DataFrame, depth=None, max_depth=None, num_filters=None):
        """Create a node.

        Args:
            theta: DataFrame of filters (rows) by elements (columns).
            depth: Depth of this node; ``None`` (or 0) means root.
            max_depth: Maximum depth of the tree; ``None`` (or 0) means 1.
            num_filters: Number of filters in ``theta``. When omitted, the
                row count of ``theta`` is used so that the tree can actually
                be built without the caller repeating that number.
        """
        self.num_filters = theta.shape[0] if num_filters is None else num_filters

        self.depth = depth if depth else 0

        # Max depth = size of subset
        self.max_depth = max_depth if max_depth else 1

        # Children are only created by build_tree().
        self.left = None
        self.right = None
        self.split_elem = None

        # filter set
        self.theta = theta

    def calc_weights(self, filters, depth):
        """Return ``(w1, w2)`` for column ``depth`` of ``filters``.

        ``w1`` is the column sum divided by the number of rows (the fraction
        of ones for a 0/1 column) and ``w2`` is ``1 - w1``.
        """
        column = filters[depth]
        w1 = column.sum() / column.shape[0]
        return w1, 1 - w1

    def GET_avg(self):
        """Return the 'average' filter: the per-column mean of ``theta``."""
        return self.theta.sum(axis=0).to_numpy() / self.theta.shape[0]

    @staticmethod
    def calc_entropy(df):
        """Return the summed binary entropy (in nats) of every column of ``df``.

        Columns in which 0 or 1 never occurs contribute nothing, which also
        makes an empty frame evaluate to 0.
        """
        size = df.shape[0]
        total = 0.0
        for col in df.columns:
            zeros = (df[col] == 0).sum()
            ones = (df[col] == 1).sum()
            if zeros > 0 and ones > 0:
                p0, p1 = zeros / size, ones / size
                total += p0 * np.log(p0) + p1 * np.log(p1)
        # Avoid returning -0.0 when nothing contributed.
        return -total if total else 0.0

    def info_gains(self, data):
        """Return the information gain of splitting ``data`` on each column.

        The result is a list aligned with ``data.columns``. An empty frame
        yields a gain of 0 for every column instead of dividing by zero.
        """
        size = data.shape[0]
        if size == 0:
            return [0.0] * data.shape[1]

        entropy_before = self.calc_entropy(data)
        gains = []
        for col in data.columns:
            zeros_part = data[data[col] == 0]
            ones_part = data[data[col] == 1]
            entropy_after = (
                (zeros_part.shape[0] / size) * self.calc_entropy(zeros_part)
                + (ones_part.shape[0] / size) * self.calc_entropy(ones_part)
            )
            gains.append(entropy_before - entropy_after)
        return gains

    def build_tree(self):
        """Recursively split this node on the most informative column.

        A node stays a leaf when it has reached ``max_depth``, holds at most
        one filter, has no rows or columns, or when the best split would leave
        one side empty (e.g. all rows are identical).
        """
        filters = self.theta

        if self.depth >= self.max_depth or self.num_filters <= 1:
            return
        if filters.shape[0] == 0 or filters.shape[1] == 0:
            return

        split_elem = filters.columns[np.argmax(self.info_gains(filters))]
        l_split = filters[filters[split_elem] == 0]
        r_split = filters[filters[split_elem] == 1]

        # A one-sided split carries no information and would create an empty
        # child that cannot be processed further.
        if l_split.shape[0] == 0 or r_split.shape[0] == 0:
            return

        self.split_elem = split_elem

        self.left = Node(l_split,
                         self.depth + 1,
                         self.max_depth,
                         num_filters=l_split.shape[0])
        self.left.build_tree()

        self.right = Node(r_split,
                          self.depth + 1,
                          self.max_depth,
                          num_filters=r_split.shape[0])
        self.right.build_tree()

    # The next methods choose a branch using a Gaussian approach on 'n' iid
    # filters. Filters may not be iid; a more general method for choosing the
    # correct branch may be needed.
    def calc_mean_distance(self, subset, weights):
        """Return the weighted distance term between ``subset`` and the average filter.

        For each element the term is ``avg + (1 - 2*avg) * subset``, which is
        ``avg`` where ``subset`` is 0 and ``1 - avg`` where it is 1.
        """
        avg = self.GET_avg()  # computed once instead of once per element
        subset = np.asarray(subset, dtype=float)
        weights = np.asarray(weights, dtype=float)
        return float(np.sum(weights * (avg + (1 - 2 * avg) * subset)))

    def calc_variance(self, weights):
        """Return ``sum(weights**2 * avg * (1 - avg))`` over the average filter."""
        avg = self.GET_avg()  # computed once instead of once per element
        weights = np.asarray(weights, dtype=float)
        return float(np.sum(weights ** 2 * avg * (1 - avg)))

    def _expected_min_distance(self, subset, weights):
        """Return mean distance minus ``sqrt(var) * sqrt(2 * ln(n))`` for this node."""
        mean = self.calc_mean_distance(subset=subset, weights=weights)
        var = self.calc_variance(weights=weights)
        return mean - math.sqrt(var) * math.sqrt(2 * math.log(self.theta.shape[0]))

    def choose_branch(self, subset, weights):
        """Return the child with the smaller expected minimum distance.

        Ties go to the left child.

        Raises:
            ValueError: If this node has no children.
        """
        if self.left is None or self.right is None:
            raise ValueError("choose_branch called on a leaf node; call build_tree() first")

        ex_distance_left = self.left._expected_min_distance(subset, weights)
        ex_distance_right = self.right._expected_min_distance(subset, weights)

        if ex_distance_left <= ex_distance_right:
            return self.left
        return self.right

    def traverse(self, subset, weights):
        """Descend to a leaf and report its filter closest to ``subset``.

        Descent stops at ``max_depth`` or at the first node without children.
        The leaf is then scanned for the row with the smallest weighted L1
        distance to ``subset`` (the first such row on ties). The result is
        printed and also returned.

        Returns:
            Tuple ``(best_filter, best_distance)``. For a leaf with no rows
            this is a zero vector and ``inf``.
        """
        node = self
        while (node.depth < node.max_depth
               and node.left is not None
               and node.right is not None):
            node = node.choose_branch(subset, weights)

        subset = np.asarray(subset, dtype=float)
        weights = np.asarray(weights, dtype=float)

        if node.theta.shape[0] == 0:
            best_filter = np.zeros(node.theta.shape[1])
            best_dist = float("inf")
        else:
            # Weighted L1 distance of every row to the query, in one pass.
            dists = (np.abs(node.theta.to_numpy() - subset) * weights).sum(axis=1)
            best_idx = int(np.argmin(dists))
            best_filter = node.theta.iloc[best_idx]
            best_dist = float(dists[best_idx])

        print("Best Matched Filter: \n{}\n  with weighted L1 distance of: {}".format(best_filter, best_dist) )
        return best_filter, best_dist

In [79]:
# Small 4-element example; defined for experimentation and not used by the
# run at the bottom of this cell.
theta = pd.DataFrame([[0, 1, 1, 0],
                      [0, 0, 1, 1],
                      [1, 1, 0, 0],
                      [1, 1, 1, 0],
                      [0, 1, 1, 1]])
subset = np.array([0, 1, 0, 0])
weights = np.array([0.5, 1, 1, 1])


# 5-element example: seven filters (rows), a query subset and per-element
# weights for the weighted L1 distance.
theta2 = pd.DataFrame([[1, 1, 0, 0, 0],
                       [1, 1, 1, 0, 0],
                       [0, 1, 1, 0, 0],
                       [0, 1, 1, 1, 0],
                       [0, 0, 0, 1, 1],
                       [0, 0, 1, 1, 0],
                       [0, 0, 1, 1, 1]])

subset2 = np.array([0, 1, 1, 1, 1])
weights2 = np.array([1, 0.5, 1, 1, 0.3])

# num_filters must describe the frame the tree is built on (theta2, not theta).
root = Node(theta=theta2, max_depth=1, num_filters=theta2.shape[0])
root.build_tree()
root.traverse(subset2, weights2)