# Decision Tree Classifier

Classification And Regression Tree 

# References

https://www.youtube.com/watch?v=LDRbO9a6XPU

I shamelessly borrowed code from the notebook in the example referenced in the video. https://github.com/random-forests/tutorials/blob/master/decision_tree.ipynb

In [None]:
from math import log
from collections import Counter
from numbers import Number

In [None]:
# Column names, in the same order as the entries of each row in `data`.
header = ["color", "diameter", "label"]
# Position of the label (the value to predict) within each row.
label_index = 2
# Toy training set: one row per example, laid out as [color, diameter, label].
data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Green', 1, 'Grape'],
    # Same features as the ['Yellow', 3, 'Apple'] row but a different label,
    # so no rule built from these columns can separate the two rows.
    ['Yellow', 3, 'Lemon'],
]

# Gini Impurity

https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity

"The Gini impurity can be computed by summing the probability p_i of an item with label_i being chosen times the probability of a mistake in categorizing that item. It reaches its minimum (zero) when all cases in the node fall into a single target category."

- The value lies between 0 and 1, where lower values mean less mixing at a node.
- It is the chance of being incorrect if you randomly assign a label (drawn from the labels in the set) to a randomly chosen item in the set.

- My intuition: each label contributes p(1 - p) to the impurity, which is largest when that label is well mixed (p = 0.5). As a label becomes more or less prevalent it contributes less and less to the output. With more labels there are more contributors to the output.

In [None]:
def gini(labels):
    """Return the Gini impurity of a collection of labels.

    Every distinct label contributes ``p * (1 - p)``, where ``p`` is the
    fraction of entries carrying that label.  The contributions are summed.

    Args:
        labels: Sized collection of hashable label values.

    Returns:
        The impurity: 0 when the collection is empty or holds a single
        distinct label, larger values when the labels are more mixed.
    """
    # The total is the same for every label, so compute it once up front.
    total = float(len(labels))
    # Only the counts matter here; the label values themselves are not needed.
    probabilities = (count / total for count in Counter(labels).values())
    return sum(p * (1 - p) for p in probabilities)

In [None]:
# A single distinct label: nothing is mixed, so the impurity is 0.
gini(['a','a','a'])

In [None]:
# Two labels with proportions 2/3 and 1/3: 2/3 * 1/3 + 1/3 * 2/3, about 0.444.
gini(['a','a','c'])

In [None]:
# Ten distinct labels, each with proportion 0.1: 10 * 0.1 * 0.9, about 0.9.
gini(['a','b','c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])

# Information gain

Used to find the question that reduces uncertainty the most. Describes how much a question helps unmix labels at a node.

It is calculated by finding the diff of the impurity before the split and the weighted avg of the impurity in each of the outputs after the split. Weighted avg is used because the size of the split matters (e.g. splitting one item and leaving a group with a lot of impurity).

In [None]:
def info_gain(labels_in, labels_out_left, labels_out_right):
    """Return the reduction in Gini impurity achieved by a two-way split.

    The gain is the impurity of the input labels minus the size-weighted
    average of the impurities of the two output groups.

    Args:
        labels_in: Labels at the node before the split.
        labels_out_left: Labels that ended up in the left group.
        labels_out_right: Labels that ended up in the right group.

    Returns:
        The information gain as a float.  An empty ``labels_in`` yields 0.0,
        since there is nothing to split (and no size to weight by).
    """
    total = len(labels_in)
    if total == 0:
        # Guard against dividing by zero when weighting the groups.
        return 0.0

    gini_in = gini(labels_in)
    # Weight of the left group; the right group receives the remainder.
    p = len(labels_out_left) / float(total)

    return gini_in - p * gini(labels_out_left) - (1 - p) * gini(labels_out_right)

In [None]:
# Perfect split: the input impurity is 0.5 and both outputs are pure, so the gain is 0.5.
info_gain(labels_in=['a','b'], labels_out_left=['a'], labels_out_right=['b'])

In [None]:
# Partial split: the left group is still mixed, so the gain is small
# (4/9 - 2/3 * 0.5 - 1/3 * 0, about 0.111).
info_gain(labels_in=['a','b', 'b'], labels_out_left=['a', 'b'], labels_out_right=['b'])

In [None]:
# No split at all: everything lands in the left group, so the gain is 0.
info_gain(labels_in=['a','b', 'b'], labels_out_left=['a', 'b', 'b'], labels_out_right=[])

# Define all possible node rules
The set of rules is defined from the input data. Try every value for every feature.

To build every possible rule we split the data for every unique feature value.

In [None]:
class Rule:
    """A yes/no question about a single column of a row.

    Numeric rule values (instances of ``numbers.Number``) are tested with
    ``>=``; every other value is tested with ``==``.

    Attributes set by the constructor:
        name: Column name, used only when rendering the rule as text.
        index: Position of the column inside a row.
        value: Value the column is compared against.
        is_numeric: Whether ``value`` is a ``numbers.Number``.
    """

    def __init__(self, column_name, column_index, column_value):
        self.name = column_name
        self.index = column_index
        self.value = column_value
        self.is_numeric = isinstance(column_value, Number)

    def match(self, row):
        """Return the result of testing ``row`` against this rule."""
        candidate = row[self.index]
        if not self.is_numeric:
            return candidate == self.value
        return candidate >= self.value

    def __repr__(self):
        """Render the rule as a readable question, e.g. ``Is diameter >= 2?``."""
        operator = ">=" if self.is_numeric else "=="
        return "Is %s %s %s?" % (self.name, operator, str(self.value))

In [None]:
Rule(column_name='color', column_index=0, column_value='green')

In [None]:
Rule(column_name='diameter', column_index=1, column_value=2)

In [None]:
def all_rules(header, data, label_index):
    """Build one Rule per distinct value of every non-label column.

    Args:
        header: Column names, aligned with the entries of each row.
        data: Rows to collect the distinct column values from.
        label_index: Position of the label column, which gets no rules.

    Returns:
        A set holding a Rule for each (column, distinct value) pair.
    """
    candidates = set()

    for column, column_name in enumerate(header):
        if column == label_index:
            # The label is what we predict, never what we ask about.
            continue

        # Each distinct value seen in this column becomes one rule.
        for distinct_value in {row[column] for row in data}:
            candidates.add(Rule(column_name, column, distinct_value))

    return candidates

In [None]:
rules = all_rules(header, data, label_index)
rules

# Find best split
Next define a method that finds the rule that maximizes the information gain. 

To find the best split, try every rule and pick the one with the highest info gain.

In [None]:
def get_labels(data, label_index):
    """Return the list of values found at ``label_index`` in each row of ``data``."""
    return [row[label_index] for row in data]

def best_split(data, rules, label_index=None):
    """Find the rule whose split of ``data`` yields the highest info gain.

    Every rule partitions the rows into those that do not match ("left")
    and those that do ("right"); the partition is scored with ``info_gain``.

    Args:
        data: Rows to split.
        rules: Iterable of rule objects exposing ``match(row)``.
        label_index: Position of the label inside each row.  When omitted,
            the module-level ``label_index`` is used, which is what this
            function always read before the parameter existed.

    Returns:
        A tuple ``(gain, rule, left_rows, right_rows)`` for the best rule.
        When no rule achieves a gain above zero the result is
        ``(0.0, None, None, None)``.  On ties the first rule seen wins.
    """
    if label_index is None:
        # Backward compatibility for callers that pass only (data, rules).
        label_index = globals()["label_index"]

    # The labels before the split do not depend on the rule; compute once.
    parent_labels = [row[label_index] for row in data]

    max_info_gain = 0.
    max_rule = None
    max_left = None
    max_right = None

    for rule in rules:
        # Bucket the rows: matching rows go right, the rest go left.
        left = []
        right = []
        for row in data:
            if rule.match(row):
                right.append(row)
            else:
                left.append(row)

        new_info_gain = info_gain(
            parent_labels,
            [row[label_index] for row in left],
            [row[label_index] for row in right],
        )

        # Strict comparison: a split must actually improve on the best so far.
        if new_info_gain > max_info_gain:
            max_info_gain = new_info_gain
            max_rule = rule
            max_left = left
            max_right = right

    return max_info_gain, max_rule, max_left, max_right

In [None]:
best_split(data, rules)

# Building the full tree

In [None]:
class Node:
    """Internal (decision) node of the tree.

    Holds the rule asked at this node together with the subtree followed
    when the rule does not match (``false_node``) and the subtree followed
    when it does (``true_node``).
    """

    def __init__(self, rule, false_node, true_node):
        self.rule = rule
        self.false_node = false_node
        self.true_node = true_node

    def is_leaf(self):
        """Return False: a decision node always has two children."""
        return False

## Leaf nodes

When a leaf node is reached, the decision tree returns the label probabilities for the objects that end up at this leaf.

In [None]:
class Leaf:
    """Terminal node of the tree.

    ``predictions`` maps each label seen at this leaf to the fraction of
    the leaf's labels that carry it.
    """

    def __init__(self, labels):
        size = len(labels)
        self.predictions = {
            label: float(occurrences) / size
            for label, occurrences in Counter(labels).items()
        }

    def is_leaf(self):
        """Return True: a leaf has no children to descend into."""
        return True

## Build tree

In [None]:
def build_tree(header, training_data, label_index):
    """Build a decision tree from labelled training rows.

    Args:
        header: Column names, aligned with the entries of each row.
        training_data: Rows used both to derive the rules and to fit the tree.
        label_index: Position of the label inside each row.

    Returns:
        The root of the tree produced by ``build_tree_rec``.
    """
    # The candidate rules are derived once, from the full training set,
    # and reused at every level of the recursion.
    candidate_rules = all_rules(header, training_data, label_index)
    return build_tree_rec(training_data, candidate_rules, label_index)
    
    
def build_tree_rec(training_data, rules, label_index):
    """Recursively split ``training_data`` until no rule helps any more.

    Progress is printed as the tree is built.

    Args:
        training_data: Rows that reached this point of the tree.
        rules: Candidate rules to choose the split from.
        label_index: Position of the label inside each row.

    Returns:
        A Leaf when the labels are already pure or no rule yields any
        info gain, otherwise a Node whose two subtrees are built recursively.
    """
    labels = get_labels(training_data, label_index)

    # Base case: impurity of 0 means the labels are completely separated.
    if gini(labels) == 0:
        print("unmixed: {}\n".format(labels))
        return Leaf(labels)

    # NOTE(review): label_index is not forwarded to best_split here.
    gain, rule, false_data, true_data = best_split(training_data, rules)

    # A gain of 0 means no rule can separate these rows any further.
    if gain == 0:
        print("no more info gain: {}\n".format(labels))
        return Leaf(labels)

    print(rule)
    print("new node false: {}".format(false_data))
    print("new node true: {}\n".format(true_data))

    # Build the false branch first, then the true branch.
    false_branch = build_tree_rec(false_data, rules, label_index)
    true_branch = build_tree_rec(true_data, rules, label_index)
    return Node(rule, false_branch, true_branch)
    

In [None]:
tree = build_tree(header, data, label_index)

In [None]:
def print_tree(node, spacing=""):
    """Print the tree rooted at ``node``, indenting each level by two spaces.

    Leaves are shown as ``Predict <predictions>``; decision nodes show their
    rule followed by the true branch and then the false branch.
    """
    if node.is_leaf():
        print(spacing + "Predict", node.predictions)
        return

    # The question asked at this node.
    print(spacing + str(node.rule))

    # Descend into the true branch first, then the false branch.
    deeper = spacing + "  "
    branches = (('--> True:', node.true_node), ('--> False:', node.false_node))
    for heading, child in branches:
        print(spacing + heading)
        print_tree(child, deeper)

In [None]:
print_tree(tree)

# Using decision tree

In [None]:
def classify(row, node):
    """Return the predictions of the leaf that ``row`` ends up in.

    Starting at ``node``, the rule of each decision node is applied to the
    row: a match follows the true branch, anything else the false branch.
    The walk stops at the first leaf.
    """
    current = node
    while not current.is_leaf():
        if current.rule.match(row):
            current = current.true_node
        else:
            current = current.false_node
    return current.predictions

In [None]:
print(data[0])
classify(data[0], tree)

In [None]:
classify(['Yellow', 3], tree)

# Tree Stats

In [None]:
def tree_depth(tree):
    """Return the number of nodes on the longest root-to-leaf path.

    A lone leaf therefore has depth 1.
    """
    if not tree.is_leaf():
        children = (tree.false_node, tree.true_node)
        return 1 + max(tree_depth(child) for child in children)
    return 1

In [None]:
tree_depth(tree)

In [None]:
def leaf_count(tree):
    """Return how many leaves the tree rooted at ``tree`` contains."""
    leaves = 0
    pending = [tree]
    # Walk the tree with an explicit stack instead of recursion.
    while pending:
        node = pending.pop()
        if node.is_leaf():
            leaves += 1
        else:
            pending.extend((node.false_node, node.true_node))
    return leaves

In [None]:
leaf_count(tree)

# Feature transformations/Normalization

Let's try a feature transformation and see if it changes the tree

In [None]:
# Rebinds the module-level header, label_index and data used by the cells below.
header = ["color", "diameter", "label"]
label_index = 2
# Same rows as the first data set, with each diameter replaced by its
# natural logarithm (math.log); the ordering of the diameters is unchanged.
data = [
    ['Green', log(3), 'Apple'],
    ['Yellow', log(3), 'Apple'],
    ['Red', log(1), 'Grape'],
    ['Green', log(1), 'Grape'],
    ['Yellow', log(3), 'Lemon'],
]

In [None]:
tree_log = build_tree(header, data, label_index)

In [None]:
print_tree(tree_log)

In [None]:
print_tree(tree)

It's the same tree. The absolute value doesn't make a difference; it is the relative splits within the data that matter.