# Decision Tree Classifier from scratch

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import math

In [None]:
# Load the Iris dataset and unpack its feature matrix and label vector.
iris = load_iris()
data, target = iris.data, iris.target

In [None]:
class decision_tree_classifier:
    """Skeleton of a decision-tree classifier based on information gain.

    Only the search for the single best split is implemented so far:
    ``fit`` returns the ``(feature, value)`` pair with the highest
    information gain. No tree is built yet and ``max_depth`` is stored
    but not used.
    """

    def __init__(self, max_depth=0):
        """Store the requested maximum depth (currently unused by ``fit``)."""
        self.max_depth = max_depth

    def count_target_values(self, y):
        """Return a dict mapping each distinct value in ``y`` to its count."""
        labels, counts = np.unique(y, return_counts=True)
        return dict(zip(labels, counts))

    def entropy(self, y):
        """Return the entropy of the labels ``y`` using the natural log.

        ``y`` must expose ``shape[0]`` as the number of samples. An empty
        ``y`` yields 0.0 because there are no classes to sum over.
        """
        class_counts = self.count_target_values(y)
        size = y.shape[0]
        entropy = 0.0
        for count in class_counts.values():
            p = float(count) / size
            entropy -= p * np.log(p)
        return entropy

    def split(self, X, y, feature, value):
        """Partition ``X`` and ``y`` on column ``feature`` against ``value``.

        Numerical values (Python or NumPy numbers) split on
        ``X[:, feature] >= value``; anything else splits on equality.

        Returns:
            ``(X1, y1, X2, y2)`` where ``X1``/``y1`` hold the rows that
            satisfy the condition and ``X2``/``y2`` hold the remaining rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        column = X[:, feature]

        # np.number is included because values taken from np.unique are
        # NumPy scalars, and e.g. np.int64 is not an instance of int.
        if isinstance(value, (int, float, np.number)):
            mask = column >= value
        else:
            mask = column == value

        return X[mask], y[mask], X[~mask], y[~mask]

    def fit(self, X, y, depth=0):
        """Find the split of ``X`` with the highest information gain.

        Every distinct value of every feature column is tried as a split
        point.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: labels, one per row of ``X``.
            depth: accepted for interface compatibility; currently unused.

        Returns:
            ``(feature_index, value)`` for the best split, or ``None`` when
            no candidate has a strictly positive information gain.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        best_gain = 0.0
        best_criteria = None

        n_samples = y.shape[0]
        parent_entropy = self.entropy(y)

        for feat in range(X.shape[1]):
            for val in np.unique(X[:, feat]):
                X1, y1, X2, y2 = self.split(X, y, feat, val)
                p1 = float(X1.shape[0]) / n_samples

                info_gain = (parent_entropy
                             - p1 * self.entropy(y1)
                             - (1 - p1) * self.entropy(y2))

                # Evaluated for every candidate value, not only the last
                # value of each feature.
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_criteria = (feat, val)

        return best_criteria

## To Do
* Add predict method
* Add the ability to save/print the tree
* Add one-vs-rest functionality for multi-class problems
* Add Gini Index criterion

In [None]:
# Toy binary dataset: each row holds three feature columns followed by
# the class label in the last column.
votes = np.array([
    [1, 1, 1, 1],
    [1, 0, 0, 0],
    [1, 0, 1, 0],
    [0, 1, 1, 1],
    [0, 0, 0, 0],
    [1, 1, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 1, 1],
])

# Features are every column but the last; labels are kept as a column vector.
X = votes[:, :-1]
y = votes[:, -1:]