In [65]:
from collections import Counter, defaultdict
from math import log

In [87]:
def entropy(class_probabilities):
    """Return the Shannon entropy, in bits, of a list of class probabilities.

    Each probability ``p`` contributes ``-p * log2(p)``. Zero probabilities
    are skipped, because ``log(0)`` is undefined and their contribution to
    the entropy is taken to be zero.
    """
    contributions = [-p * log(p, 2) for p in class_probabilities if p]
    return sum(contributions)

def class_probabilities(labels):
    """Return the relative frequency of each distinct label in ``labels``.

    ``labels`` must support ``len()``. The result has one entry per distinct
    label, in the order the labels are first encountered.
    """
    occurrences = Counter(labels)
    n_labels = len(labels)
    return [n / n_labels for n in occurrences.values()]

def data_entropy(labeled_data):
    """Return the entropy of the label distribution in ``labeled_data``.

    ``labeled_data`` is an iterable of two-item ``(item, label)`` pairs;
    only the label of each pair is used.
    """
    only_labels = [pair_label for _, pair_label in labeled_data]
    return entropy(class_probabilities(only_labels))

In [88]:
# Quick check on a small label list with three classes occurring
# 2, 3 and 1 times out of 6.
v = ['red', 'blue', 'red', 'green', 'blue', 'blue']
cp = class_probabilities(v)
print(cp, entropy(cp), sep='\n')

[0.3333333333333333, 0.5, 0.16666666666666666]
1.4591479170272448


In [89]:
# Labeled example data: each entry is an (attributes, label) pair, where
# attributes maps 'level', 'lang', 'tweets' and 'phd' to string values and
# label is a bool.
# NOTE(review): what the boolean label represents is not stated here - confirm.
inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
    ]

In [92]:
def partition_entropy(subsets):
    """Return the size-weighted average entropy of a partition.

    ``subsets`` is an iterable of lists of labeled data, one list per
    subset of the partition. Each subset's entropy (from ``data_entropy``)
    is weighted by the fraction of all items that fall into that subset.

    Returns 0 when the partition holds no items at all (no subsets, or only
    empty subsets), since there is nothing to be uncertain about.
    """
    # The subsets are traversed twice (once to count, once to weight), so a
    # one-shot iterable such as a generator must be materialized first;
    # otherwise the second pass would see nothing and silently yield 0.
    subsets = list(subsets)
    total_count = sum(len(subset) for subset in subsets)

    # Guard against dividing by zero when every subset is empty.
    if total_count == 0:
        return 0

    return sum(data_entropy(subset) * len(subset) / total_count
               for subset in subsets)

def partition_by(inputs, attribute):
    """Group labeled rows by the value each one has for ``attribute``.

    Every element of ``inputs`` is indexable, with a mapping of attributes
    in position 0. Returns a ``defaultdict(list)`` keyed by attribute value,
    each list holding the original rows in input order. Raises ``KeyError``
    if a row's mapping lacks ``attribute``.
    """
    buckets = defaultdict(list)
    for row in inputs:
        attributes = row[0]
        buckets[attributes[attribute]].append(row)
    return buckets

def partition_entropy_by(inputs, attribute):
    """Return the partition entropy obtained by splitting on ``attribute``.

    The rows in ``inputs`` are grouped by their value for ``attribute`` and
    the weighted entropy of the resulting groups is returned.
    """
    return partition_entropy(partition_by(inputs, attribute).values())

# Report the partition entropy of the full data set when it is split on
# each attribute in turn.
for key in ('level', 'lang', 'tweets', 'phd'):
    print(key, partition_entropy_by(inputs, key))

[False, False, False, True, True]
[True, True, True, True]
[True, True, False, True, False]
level 0.6935361388961919
[False, False, True]
[True, True, False, True, True, True, False]
[True, False, True, True]
lang 0.8601317128547441
[False, False, True, True, False, True, False]
[True, False, True, True, True, True, True]
tweets 0.7884504573082896
[False, True, True, True, False, True, True, True]
[False, False, True, True, True, False]
phd 0.8921589282623617


In [93]:
# Keep only the rows whose level is 'Senior', then report the partition
# entropy of that subset for each of the remaining attributes.
senior_inputs = [
    (attributes, label)
    for attributes, label in inputs
    if attributes['level'] == 'Senior'
]
for key in ('lang', 'tweets', 'phd'):
    print(key, partition_entropy_by(senior_inputs, key))

[False, False]
[False, True]
[True]
lang 0.4
[False, False, False]
[True, True]
tweets 0.0
[False, False, True]
[False, True]
phd 0.9509775004326938


In [95]:
def classify(tree, input):
    """Classify ``input`` by walking the given decision tree.

    A tree is either a leaf (``True`` or ``False``) or a pair
    ``(attribute, subtree_dict)`` mapping attribute values to subtrees.
    ``input`` is a mapping of attribute names to values.

    At each node the subtree matching the input's value for the node's
    attribute is followed. When the input lacks the attribute, or has a
    value with no matching subtree, the subtree stored under ``None`` is
    used instead; a ``KeyError`` is raised if that entry is absent too.
    """
    # Descend until a leaf is reached.
    while tree not in (True, False):
        attribute, subtree_dict = tree

        # .get() yields None when the input has no such attribute.
        branch = input.get(attribute)

        # Unknown values fall back to the None subtree.
        tree = subtree_dict[branch if branch in subtree_dict else None]

    return tree