In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
infile = 'small_train.tsv'

In [4]:
df = pd.read_csv(infile, sep='\t')
dataset = df.to_numpy()
x = dataset[:,:-1]
y = dataset[:,-1]

In [6]:
def get_entropy(y):
    numEntries = len(y)
    labelCounts = Counter(y)
    entropy = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        entropy -= prob * np.log2(prob)
    return entropy

In [7]:
get_entropy(y)

0.996316519558962

In [11]:
def info_gain(X, Y):
        M, N = X.shape
        
        y_entropy = calc_entropy(Y)
        IG = [y_entropy for _ in range(N)]
        
        for i in range(N):
            col = X[:, i]
            binary_values = np.unique(col)
            for bv in binary_values:
                Y_hat = Y[col == bv]
                IG[i] -= get_entropy(Y_hat) * len(Y_hat) / len(Y)
                
        return np.array(IG)

In [28]:
def get_split_index(X, Y):
    gain = info_gain(X,Y)
    split_index = gain.argmax()
    return split_index if gain[split_index]>0 else None
    

In [80]:
class DecisionTree:
    def __init__(self, max_depth):
        # stem node
        self.max_depth = max_depth
        self.children = {} # key is the attribute value, value is the child node
        self.split_index = None
        # leaf node
        self.label = None
        
    def Train(self, X, Y):
        # convert this node to a leaf node
        if self.max_depth <= 0 or self.split_index is None:
            self.split_index = None
            self.label = Counter(Y).most_common(1)[0][0]
            return
            
        for val in np.unique(X):
            self.children[val] = DecisionTree(self.max_depth - 1)
            self.children[val] = Train([X[X[:,self.split_index]==val]], Y[X[:,self.split_index]==val])
        
    def predict_recursively(self, X):
        if self.split_index is None:
            return self.label
        return self.children[X[self.split_index]].predict_recursively(x)
    
    def predict(self, X):
        label = []
        for row in X:
            label.append(self.predict_recursively(row))
        return np.array(label)
    
def get_error(Y, Y_hat):
    return np.sum(Y != Y_hat) / len(Y)

In [81]:
dtree = DecisionTree(3)

In [82]:
dtree.Train(x,y)

In [83]:
dtree.predict(x)

array(['democrat', 'democrat', 'democrat', 'democrat', 'democrat',
       'democrat', 'democrat', 'democrat', 'democrat', 'democrat',
       'democrat', 'democrat', 'democrat', 'democrat', 'democrat',
       'democrat', 'democrat', 'democrat', 'democrat', 'democrat',
       'democrat', 'democrat', 'democrat', 'democrat', 'democrat',
       'democrat', 'democrat', 'democrat'], dtype='<U8')

In [85]:
y_hat = dtree.predict(x)

In [86]:
get_error(y,y_hat)

0.4642857142857143