In [1]:
import operator as op
import numpy as np 
import sklearn as skl 
import sklearn.tree as tree 

In [2]:
def estimate_gini_impurity(feature_values, threshold, labels, polarity):
    """Estimate the Gini impurity of one side of a threshold split.

    Parameters
    ----------
    feature_values : array-like
        Values of a single feature, one entry per sample.
    threshold : scalar
        Split point the feature values are compared against.
    labels : array-like
        Class labels aligned with ``feature_values``. Only samples labelled
        ``+1`` or ``-1`` are counted.
    polarity : callable
        Comparison called as ``polarity(feature_values, threshold)`` that
        returns a boolean mask selecting one side of the split
        (for example ``operator.le`` or ``operator.gt``).

    Returns
    -------
    float or int
        ``p1 * (1 - p1) + p2 * (1 - p2)`` where ``p1``/``p2`` are the
        fractions of ``+1``/``-1`` labels on the selected side. Returns ``1``
        when the selected side contains no ``+1``/``-1`` samples (which
        includes the case of an empty side).
    """
    feature_values = np.asarray(feature_values)
    labels = np.asarray(labels)

    selected = polarity(feature_values, threshold)
    side_labels = labels[selected]

    # Count class membership from the labels alone. The feature values must
    # not take part in the count: a feature value of exactly 0 is falsy, so
    # and-ing it with the label test would silently drop that sample.
    class1_length = np.count_nonzero(side_labels == +1)
    class2_length = np.count_nonzero(side_labels == -1)
    class_length = class1_length + class2_length

    # Nothing to estimate from: avoid the 0/0 division.
    if class_length == 0:
        return 1

    pr1 = class1_length / class_length  # probability of class +1
    pr2 = class2_length / class_length  # probability of class -1

    return (pr1 * (1 - pr1)) + (pr2 * (1 - pr2))

In [3]:
def estimate_gini_impurity_expectation(feature_values, threshold, labels):
    """Return the size-weighted Gini impurity of a threshold split.

    The impurity of the ``<= threshold`` side and of the ``> threshold`` side
    are each obtained from ``estimate_gini_impurity`` and combined, weighted
    by the fraction of samples that falls on the respective side.

    Parameters
    ----------
    feature_values : array
        Values of a single feature, one entry per sample.
    threshold : scalar
        Split point.
    labels : array
        Class labels aligned with ``feature_values``.

    Returns
    -------
    float
        ``gini_left * w + gini_right * (1 - w)`` with ``w`` the fraction of
        samples whose feature value is ``<= threshold``.
    """
    # Impurity of the lower-or-equal side first, then the strictly-greater side.
    gini_by_side = [
        estimate_gini_impurity(feature_values, threshold, labels, side)
        for side in (op.le, op.gt)
    ]

    # Fraction of samples that ends up on the lower-or-equal side.
    below_mask = feature_values <= threshold
    left_weight = len(feature_values[below_mask]) / len(feature_values)

    return gini_by_side[0] * left_weight + gini_by_side[1] * (1 - left_weight)

In [4]:
def midpoint(x):
    """Return the average of every pair of neighbouring entries of ``x``.

    Entry ``i`` of the result is ``(x[i + 1] + x[i]) / 2``, so a non-empty
    array yields one entry fewer than it has along its first axis.
    """
    following = x[1:]
    preceding = x[:-1]
    return (following + preceding) / 2

In [5]:
def grid_search_split_midpoint(X, y):
    """Grid-search the (feature, threshold) split with the lowest expected Gini impurity.

    Candidate thresholds for each feature are the midpoints between
    consecutive values of that feature after sorting.

    Parameters
    ----------
    X : array-like, two-dimensional
        One row per sample, one column per feature. At least two rows are
        expected so that at least one midpoint exists.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    grid : list of list
        ``grid[row][col]`` is the expected Gini impurity of splitting feature
        ``col`` at its ``row``-th candidate threshold.
    best_feature : integer
        Column index of the feature with the lowest score in ``grid``.
    best_threshold : scalar
        Candidate threshold at which that lowest score is reached.
    """
    X = np.asarray(X)

    # Sort each column independently and take midpoints of neighbouring values.
    X_sorted = np.sort(X, axis=0)
    thresholds = np.apply_along_axis(midpoint, 0, X_sorted)

    n_candidates = X.shape[0] - 1
    n_features = X.shape[1]

    grid = [
        [
            estimate_gini_impurity_expectation(X[:, col], thresholds[row, col], y)
            for col in range(n_features)
        ]
        for row in range(n_candidates)
    ]

    scores = np.asarray(grid)
    best_row, best_feature = np.unravel_index(
        np.argmin(scores, axis=None), scores.shape
    )

    # Look the winner up in `thresholds`: `scores` holds impurities, so
    # indexing it would return the minimal impurity instead of the split point.
    best_threshold = thresholds[best_row, best_feature]

    return grid, best_feature, best_threshold

In [None]:
def test_egi():
    