In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
class Leaf():
    def __init__(self, label):
        self.label = label

class Node():
    def __init__(self, column, split_value, left_subtree, right_subtree):
        """
        column: int
        split_value: float
        left_subtree, right_subtree: Leaf/ Node
        """
        self.column = column
        self.split_value = split_value
        self.left_subtree = left_subtree
        self.right_subtree = right_subtree

In [3]:
def get_tree_entropy(train, labels = [0,1,2]):
    if train.empty:
        return 0
    entropy = 0
    for label in labels:
        class_data = train[train['y'] == label]
        p = len(class_data)/ len(train)
        entropy -= p * np.log(p + 1e-20)
    return entropy

def get_entropy(train, column, value):
    train_left, train_right = split_data(train, column, value)
    return get_tree_entropy(train_left) + get_tree_entropy(train_right)

    
def information_gain(train, column, tree_entropy):
    min_entropy = float('inf')
    split_value = 0
    for value in train[train.columns[column]]:
        entropy = get_entropy(train, column, value)
        if entropy < min_entropy:
            min_entropy, split_value = entropy, value
    
    ig = tree_entropy - min_entropy
    return ig, split_value

In [4]:
def split_data(train, column, split_value):
    return train[train[column_dict[column]] <= split_value], train[train[column_dict[column]] > split_value]
    
def find_column(train, columns):
    max_ig, ig = 0, 0
    best_column, best_split_value = 0, 0
    tree_entropy = get_tree_entropy(train)
    for column in columns:
        ig, split_value = information_gain(train, column, tree_entropy)
        if ig > max_ig:
            ig = max_ig
            best_column = column
            best_split_value = split_value
            
    return best_column, best_split_value

def create_tree(train, depth, max_depth, columns):
    print('Tree Entropy', get_tree_entropy(train))
    if train.empty:
        return None
    if depth >= max_depth or get_tree_entropy(train) < 0.1:
        counts = np.bincount(train['y'].values)
        return Leaf(np.argmax(counts))

    column, split_value = find_column(train, columns)
    train_left, train_right = split_data(train, column, split_value)
    node_left  = create_tree(train_left, depth+1, max_depth, columns)
    node_right = create_tree(train_right, depth+1, max_depth, columns)
    node = Node(column, split_value , node_left, node_right)
    
    return node

In [5]:
column_dict = {0:'x0', 1:'x1', 2:'x2', 3:'x3'}
df = pd.read_csv('trees.csv')
train, test = df[:500], df[500:]
columns =[0,1,2,3]

node = create_tree(train, 0, 3, columns)
node.column    

Tree Entropy 1.0967198394695579
Tree Entropy 0.10719288797064877
Tree Entropy 0.0
Tree Entropy 0.0
Tree Entropy 0.8884362530837849
Tree Entropy 0.0
Tree Entropy 0.7096755216035957
Tree Entropy 0.0
Tree Entropy 0.7091203186533157


3

In [6]:
train.head()

Unnamed: 0,x0,x1,x2,x3,y
0,5.742,2.943,5.128,2.479,2
1,6.032,2.27,4.039,0.996,1
2,5.53,4.157,1.573,0.113,0
3,7.218,2.79,6.309,1.846,2
4,5.043,3.607,1.446,0.062,0


In [7]:
a = [0,0,1,1,1,1,1,2]
counts = np.bincount(a)
np.argmax(counts)

1

In [10]:
node.column

3

In [8]:
root = node

while root:
    print(root.column)
    root = root.left_subtree

3
2


AttributeError: 'Leaf' object has no attribute 'column'

In [None]:
def predict(root, input_x):
    """
    check if x(node.column) < node.split_value
    if condition is true, continue using left otherwise right subtree.
    If leaf reached, return leaf stored prediction.
    """
    