In [15]:
import dt_inputs
import math
from collections import Counter, defaultdict
from functools import partial

def entropy(class_probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        class_probs: iterable of class probabilities.

    Returns:
        The sum of -p * log2(p) over the non-zero probabilities; 0 for an
        empty iterable.

    Raises:
        ValueError: if a probability is negative (math.log rejects it).
    """
    # A zero-probability class contributes nothing to the entropy
    # (p * log p -> 0 as p -> 0), and math.log raises ValueError on 0,
    # so such classes are skipped instead of crashing the whole sum.
    return -1 * sum(p * math.log(p, 2) for p in class_probs if p != 0)

def class_probabilities(labels):
    """Return the fraction of `labels` taken up by each distinct label.

    One fraction is produced per distinct label, in the order Counter
    yields its counts. An empty `labels` gives an empty list.
    """
    n_labels = len(labels)
    fractions = []
    for occurrences in Counter(labels).values():
        fractions.append(occurrences / n_labels)
    return fractions

def subset_entropy(subset):
    """Return the entropy of the labels found in `subset`.

    `subset` is iterated as (attributes, label) pairs; only the labels
    are used.
    """
    return entropy(class_probabilities([lbl for _attrs, lbl in subset]))

def partition_entropy(subsets):
    """Return the size-weighted sum of the entropies of `subsets`.

    Each subset's entropy is weighted by its share of the total number
    of instances across all subsets.
    """
    n_total = sum(len(part) for part in subsets)
    weighted = 0
    for part in subsets:
        share = len(part) / n_total
        weighted += share * subset_entropy(part)
    return weighted

def partition_by(attribute, data):
    """Group the instances in `data` by their value for `attribute`.

    Each instance is an (attributes, label) pair. The result is a
    defaultdict(list) mapping each attribute value to the instances
    carrying it; instances without the attribute are grouped under None.
    """
    buckets = defaultdict(list)
    for instance in data:
        attrs, _label = instance  # unpack to reach the attribute dict
        buckets[attrs.get(attribute)].append(instance)
    return buckets

def entropy_partitioned_by(data, attribute):
    """Return the entropy of `data` after partitioning it on `attribute`."""
    return partition_entropy(partition_by(attribute, data).values())

def classify(tree, instance_dict):
    """Walk `tree` for `instance_dict` and return the leaf that is reached.

    A tree is either a leaf (True / False) or an (attribute, branches)
    pair, where `branches` maps attribute values to subtrees.
    """
    node = tree
    # Descend until a True/False leaf is reached.
    while node not in (True, False):
        attribute, branches = node
        value = instance_dict.get(attribute)
        # Values with no branch of their own follow the None branch.
        node = branches[value] if value in branches else branches[None]
    return node

def build_tree_id3(data, split_candidates=None):
    """Build a decision tree from labeled data with the ID3 algorithm.

    Args:
        data: sequence of (attribute_dict, label) pairs with True/False
            labels.
        split_candidates: attributes that may still be split on. Defaults
            to the keys of the first instance's attribute dict.

    Returns:
        A leaf (True or False), or an (attribute, subtree_dict) pair.
        subtree_dict maps each value of the chosen attribute seen in
        `data` to a subtree, and maps None to the majority label, which
        serves as the default branch for other values.
    """
    # `==` (not `is`) so the counting matches the original comparisons.
    ct_true = sum(1 for _, label in data if label == True)
    ct_false = sum(1 for _, label in data if label == False)

    # Pure (or empty) data needs no split. Checking this first means empty
    # data yields False instead of failing on data[0] below.
    if ct_true == 0:
        return False
    if ct_false == 0:
        return True

    majority_label = ct_true > ct_false

    if split_candidates is None:
        split_candidates = data[0][0].keys()
    # Materialise so emptiness can be tested and the candidates can be
    # iterated more than once even if an iterator was passed in.
    candidates = list(split_candidates)

    # Labels are still mixed but nothing is left to split on: fall back to
    # the majority label rather than letting min() fail on an empty sequence.
    if not candidates:
        return majority_label

    # Split on the attribute whose partition has the lowest entropy.
    best_attribute = min(candidates, key=partial(entropy_partitioned_by, data))
    other_candidates = [c for c in candidates if c != best_attribute]
    subtree_dict = {
        av: build_tree_id3(subset, other_candidates)
        for av, subset in partition_by(best_attribute, data).items()
    }
    # Default branch: the majority label at this node.
    subtree_dict[None] = majority_label
    return (best_attribute, subtree_dict)


In [18]:
# Build the tree from the inputs provided by dt_inputs.
data = dt_inputs.INPUTS
tree = build_tree_id3(data)

# Two instances that differ only in their 'phd' value.
input1 = dict(level='Junior', lang='Java', tweets='yes', phd='no')
input2 = dict(input1, phd='yes')
print(*(classify(tree, candidate) for candidate in (input1, input2)))
tree

True False


('level',
 {'Senior': ('tweets', {'no': False, 'yes': True, None: False}),
  'Mid': True,
  'Junior': ('phd', {'no': True, 'yes': False, None: True}),
  None: True})