In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("Iris.csv")
df = df.drop("Id", axis=1)
df = df.rename(columns={"species": "label"})

In [5]:
# regressor = DecisionTreeClassifier(min_samples_leaf=15, min_samples_split=100)
# regressor.fit(df.iloc[:, :-1].values, df.iloc[:, -1].values)

In [6]:
# regressor.tree_.feature

In [7]:
# regressor.tree_.threshold

In [8]:
# df.head()

In [3]:
def get_potential_splits(data):
    """
    Get all potential splits for each feature
    Splits can be made on each unique value
    Can essentially make a split at each unique value

    """
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1): # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)

        potential_splits[column_index] = unique_values

    return potential_splits

In [3]:
def f_gini(data):
    label_column = data[:, -2]
    data_sample_weight =  data[:, -1]
    _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
    # Get summed weights for each class
    class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])

    weighted_classes = counts * class_weights
    normalized_weighted_classes = weighted_classes / sum(weighted_classes)
    #return 1.0 - (np.sum(counts**2) / np.sum(class_weights)**2)
    return 1.0 - sum(normalized_weighted_classes**2)

In [5]:
def gini_sk(data):
    if len(data) <= 0:
        return None
    label_column = data[:, -2]
    data_sample_weight =  data[:, -1]
    _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
    #print(f"labels: {label_column}")
    #print(f"For jizz: {[np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]) for value_index in value_indexes]}")
    class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])
#     cw = np.sum(class_weights**2)
#     wn = np.sum(data_sample_weight)**2

    #gini = 1.0 - (cw/wn)
    cw = np.sum(class_weights**2)
    wn = np.sum(data_sample_weight)**2
    gini = 1.0 - (cw/wn)
    print(f"cw: {cw}, wn: {wn} ---> 1.0 - (cw/wn) == {1.0} - {cw}/{wn}")
    return gini

In [4]:
def calculate_entropy(data_group):
    if len(data) <= 0:
        return None
    _, counts = np.unique(data_group[:, -2], return_counts=True)
    
    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
    
    return entropy

In [6]:
# def gini(data):
#     label_column = data[:, -2]
#     data_sample_weight =  data[:, -1]
#     _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
#     # Get summed weights for each class
#     class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])

#     weighted_classes = counts * class_weights
#     normalized_weighted_classes = weighted_classes / sum(weighted_classes)
#     #return 1.0 - (np.sum(counts**2) / np.sum(class_weights)**2)
#     #return 1.0 - sum(normalized_weighted_classes**2)
    
#     #_, value_indexes, counts = np.unique(data, return_counts=True, return_index=True)
#     gini = len(counts)
#     for count in counts:
#         gini *= (count/len(data))
#     return gini

In [6]:
def gini(data):
    label_column = data[:, -2]
    data_sample_weight =  data[:, -1]
    _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
    # Get summed weights for each class
    class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])

    weighted_classes = counts * class_weights
    normalized_weighted_classes = weighted_classes / sum(weighted_classes)
    #return 1.0 - (np.sum(counts**2) / np.sum(class_weights)**2)
    #return 1.0 - sum(normalized_weighted_classes**2)
    
    #_, value_indexes, counts = np.unique(data, return_counts=True, return_index=True)
    gini = len(counts)
    for count, class_weight in zip(counts, class_weights):
        gini *= (class_weight)
    return gini

In [6]:
def calculate_weighted_overall_metric(data, left_node_data, right_node_data, ml_task, metric_function):
    """
    Generalized impurity metric, computes weighted overall
    impurity/error w.r.t left and right nodes
    """
    # Labels
    left_label_column = left_node_data[:, -2]
    right_label_column = right_node_data[:, -2]
    parent_label_column = data[:, -2]
    # Sample weights
    left_sample_weight = left_node_data[:, -1]
    right_sample_weight = right_node_data[:, -1]
    parent_sample_weight = data[:, -1]

    if ml_task == 'classification':
        _, left_value_indexes, left_counts = np.unique(left_label_column, return_counts=True, return_index=True)
        _, right_value_indexes, right_counts = np.unique(right_label_column, return_counts=True, return_index=True)
        _, parent_value_indexes, parent_counts = np.unique(parent_label_column, return_counts=True, return_index=True)

        left_class_weights = np.array([np.take(left_sample_weight, np.where(left_label_column == left_label_column[value_index])[0]).sum() for value_index in left_value_indexes])
        right_class_weights = np.array([np.take(right_sample_weight, np.where(right_label_column == right_label_column[value_index])[0]).sum() for value_index in right_value_indexes])
        parent_class_weights = np.array([np.take(parent_sample_weight, np.where(parent_label_column == parent_label_column[value_index])[0]).sum() for value_index in parent_value_indexes])

        # class count * class weight, for respective classes
        left_weighted_classes = left_counts * left_class_weights
        right_weighted_classes = right_counts * right_class_weights
        parent_weighted_classes = parent_counts * parent_class_weights

#             weighted_prob_node_left = np.divide(np.sum(left_weighted_classes), np.sum(parent_weighted_classes))
#             weighted_prob_node_right = np.divide(np.sum(right_weighted_classes), np.sum(parent_weighted_classes))

        #weighted_prob_node_left = np.sum(left_weighted_classes / np.sum(parent_class_weights))
        #weighted_prob_node_right = np.sum(right_weighted_classes / np.sum(parent_class_weights))

        weighted_prob_node_left = np.sum(left_class_weights)# / np.sum(parent_weighted_classes)
        weighted_prob_node_right = np.sum(right_class_weights)# / np.sum(parent_weighted_classes)

    else:
        total_parent_sample_weight = np.sum(np.sum(left_sample_weight), np.sum(right_sample_weight))
        # Weighted probabilities of left and right node
        weighted_prob_node_left = np.sum(left_sample_weight) / total_parent_sample_weight
        weighted_prob_node_right = np.sum(right_sample_weight) / total_parent_sample_weight
    
    left_impurity = metric_function(left_node_data)
    right_impurity = metric_function(right_node_data)
    
    if left_impurity != None and right_impurity != None:
        overall_metric =  weighted_prob_node_left * left_impurity + weighted_prob_node_right * right_impurity

        # Compute final overall metric
        #print(f'Left gini: {metric_function(left_node_data)}, Right gini: {metric_function(right_node_data)}')
        #overall_metric = (weighted_prob_node_left * metric_function(left_node_data)) + (weighted_prob_node_right * metric_function(right_node_data))
        print(f'weighted_prob_node_left * w_i(left_node_data)): {weighted_prob_node_left} * {left_impurity} = {(weighted_prob_node_left * left_impurity)}')
        print(f'weighted_prob_node_right * w_i(right_node_data)): {weighted_prob_node_right} * {right_impurity} = {(weighted_prob_node_right * right_impurity)}')
        return overall_metric
    else:
        return None

In [6]:
def information_gain(parent_data_group, left_data_group, right_data_group, metric_function):
    
    # Calculate entropy for parent and left, right nodes
    parent_group_entropy = metric_function(parent_data_group)
    left_group_entropy = metric_function(left_data_group)
    right_group_entropy = metric_function(right_data_group)
    
    # Probabilities of left and right node
    num_examples_parent = len(parent_data_group)
    left_group_probability = len(left_data_group) / num_examples_parent
    right_group_probability = len(right_data_group) / num_examples_parent
    
    # Final equation for IG computation
    gain =  left_group_probability * metric_function(left_data_group) + right_group_probability * metric_function(right_data_group)
    
    return gain

In [7]:
def determine_type_of_feature(X):
    """
    Determine, if the feature is categorical or continuous
    """
    feature_types = []
    n_unique_values_treshold = 15 # Threshold for a numeric feature to be categorical

    n_samples, n_features = np.shape(X)

    for feature_i in range(n_features):
        unique_values = np.unique(X[:, feature_i])
        example_value = unique_values[0]

        if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")

    return feature_types

In [8]:
def split_data(data, split_column_index, split_value):
    """ 
    Split data(examples) based on best split_column_index and split_value
    estimated using task specific splitting metric.
    """
    # Get values(from feature column) for the passed split_column index
    split_column_values = data[:, split_column_index]

    type_of_feature = feature_types[split_column_index]
    if type_of_feature == "continuous":
        print(f"Feature Type(Cont): {type_of_feature}")
        left_node_data = data[split_column_values <= split_value]
        right_node_data = data[split_column_values >  split_value]

    # feature is categorical   
    else:
        print(f"Feature Type(Cat): {type_of_feature}")
        left_node_data = data[split_column_values == split_value]
        right_node_data = data[split_column_values != split_value]
    return left_node_data, right_node_data

In [33]:
def determine_best_split(data, sample_weight, potential_splits, ml_task):
    """
    Iterate over each column_index (as keys) in potential_split (dict)
    Perform split(of examples) over each unique value and evaluate the split
    Identify the best split and return its feature index and value
    """
    # Stitch data with sample_weight towards the end
    data = np.concatenate((data, np.expand_dims(sample_weight, axis=1)), axis=1)

    # Best minimum gini index to be updated iteratively
    best_overall_metric = float('inf')

    for column_index in potential_splits:
        print(f"COLUMN {column_index}")
        for value in potential_splits[column_index]:
            print(f'column_index: {column_index}, value: {value}')
            left_node_data, right_node_data = split_data(data, split_column_index=column_index, split_value=value)

            if ml_task == "regression":
                current_overall_metric = calculate_weighted_overall_metric(data, left_node_data, right_node_data, ml_task,
                                                                                 metric_function=calculate_weighted_mse)
            else: # classification
                current_overall_metric = calculate_weighted_overall_metric(data, left_node_data, right_node_data, ml_task,
                                                                                 metric_function=gini_sk)
#             else: # classification
#                 #print("ran")
#                 current_overall_metric = information_gain(data, left_node_data, right_node_data, metric_function=calculate_entropy)
            if column_index == 0 and value == 4.4:
                print(left_node_data)
            # If a lower overall_metric is achieved update the index and value with the current
            if current_overall_metric != None and current_overall_metric <= best_overall_metric:
                best_overall_metric = current_overall_metric
                best_split_column_index = column_index
                best_split_value = value
            #print(f'best_overall_metric: {best_overall_metric}')
            #print('---')
        #print(f'Debug [1]: Best: {best_overall_metric}, index: {best_split_column_index}, value: {best_split_value}')
    return best_split_column_index, best_split_value, best_overall_metric

In [34]:
X = df.iloc[:, :-1].values

In [35]:
Y = df.iloc[:, -1].values

In [36]:
data = np.concatenate((X, np.expand_dims(Y, axis=1)), axis=1)

In [37]:
#global feature_types
feature_types = determine_type_of_feature(data)

In [38]:
sample_weight = np.full(len(data), np.divide(1, len(data))) 
potential_splits = get_potential_splits(data)
split_column_index, split_value, metric = determine_best_split(data, sample_weight, potential_splits, "classification")

COLUMN 0
column_index: 0, value: 4.3
Feature Type(Cont): continuous
cw: 4.444444444444445e-05, wn: 4.444444444444445e-05 ---> 1.0 - (cw/wn) == 1.0 - 4.444444444444445e-05/4.444444444444445e-05
cw: 0.32893333333333263, wn: 0.9867111111111161 ---> 1.0 - (cw/wn) == 1.0 - 0.32893333333333263/0.9867111111111161
weighted_prob_node_left * w_i(left_node_data)): 0.006666666666666667 * 0.0 = 0.0
weighted_prob_node_right * w_i(right_node_data)): 0.9933333333333323 * 0.6666366379892821 = 0.6621923937360195
column_index: 0, value: 4.4
Feature Type(Cont): continuous
cw: 0.0007111111111111113, wn: 0.0007111111111111113 ---> 1.0 - (cw/wn) == 1.0 - 0.0007111111111111113/0.0007111111111111113
cw: 0.31626666666666603, wn: 0.9473777777777824 ---> 1.0 - (cw/wn) == 1.0 - 0.31626666666666603/0.9473777777777824
weighted_prob_node_left * w_i(left_node_data)): 0.02666666666666667 * 0.0 = 0.0
weighted_prob_node_right * w_i(right_node_data)): 0.9733333333333323 * 0.6661662600863225 = 0.6484018264840199
[[4.4 2.9 

In [39]:
split_column_index

3

In [40]:
split_value

0.6

['continuous', 'continuous', 'continuous', 'continuous', 'categorical']

In [None]:
    label_column = data[:, -2]
    data_sample_weight =  data[:, -1]
    _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
    # Get summed weights for each class
    class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])

    weighted_classes = counts * class_weights
    normalized_weighted_classes = weighted_classes / sum(weighted_classes)
    #return 1.0 - (np.sum(counts**2) / np.sum(class_weights)**2)
    return 1.0 - sum(normalized_weighted_classes**2)

In [28]:
x = np.array([2, 2, 0, 0])
x1 = np.array([2, 2, 2, 0])
x2 = np.array([2, 2, 2, 2])

In [29]:
wt = np.full(len(x), np.divide(0.5, len(x)))
wt1 = np.array([0.2, 0.2, 0.05, 0.05])

In [30]:
def gineee(x, w):
    label_column = x
    data_sample_weight =  w
    _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
    class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])
    cw = np.sum(class_weights**2)
    wn = np.sum(data_sample_weight)**2
    gini = 1.0 - (cw/wn)
    print(counts)
    print(class_weights)
    return gini

In [32]:
gineee(x2, wt)

[4]
[0.5]


0.0

In [27]:
wt

array([0.125, 0.125, 0.125, 0.125])