In [4]:
'''
決策樹 (decision tree)
'''
# 亂度 (entropy): 能帶出多少訊息
# 混亂、不確定性
from collections import Counter, defaultdict
from functools import partial
import math, random

def entropy(class_probabilities):
    '''
    給定分類機率列表，計算出亂度
    '''
    return sum(-p * math.log(p, 2)
              for p in class_probabilities
              if p)    # 忽略機率為零的項目

def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count
           for count in Counter(labels).values()]

def data_entropy(labeled_data):
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)

# 切分亂度 (Entropy of Partition)
def partition_entropy(subsets):
    '''
    將數據資料切分成子集合後，計算相應的切分亂度
    各個子集合都是已標示分類的數據資料列表所組成的列表
    '''
    total_count = sum(len(subset) for subset in subsets)
    return sum(data_entropy(subset) * len(subset) / total_count
              for subset in subsets)

In [12]:
 inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
    ]

def partition_by(inputs, attribute):
    '''
    每個輸入都是 (attribute_dict, label) 這樣的成對資料
    結果會送出一個dict字典：attribute_value -> inputs
    '''
    groups = defaultdict(list)
    for input in inputs:
        key = input[0][attribute]      # 取得指定屬性的值
        group[key].append(input)       # 把 input 添加到正確的列表中
    return group_by(inputs, lambda x: x[0][attribute])

def partition_entropy_by(inputs, attribute):
    '''
    計算出給定切分結果相應的切分亂度
    '''
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values())

for key in ['level', 'lang', 'tweets', 'phd']:
    print(key, partition_entropy_by(inputs, key))
print()

# level 0.693536138896
# lang 0.860131712855
# tweets 0.78845045457308
# phd 0.892158928262

senior_inputs = [(input, label)
                     for input, label in inputs if input["level"] == "Senior"]

for key in ['lang', 'tweets', 'phd']:
    print(key, partition_entropy_by(senior_inputs, key))
print()

NameError: name 'group' is not defined

In [20]:
def classify(tree, input):
    '''
    用給定的決策樹，對輸入項進行分類
    '''
    # if this is a leaf node, return its value
    if tree in [True, False]:
        return tree

    # otherwise find the correct subtree
    attribute, subtree_dict = tree

    subtree_key = input.get(attribute)  # None if input is missing attribute

    if subtree_key not in subtree_dict: # if no subtree for key,
        subtree_key = None              # we'll use the None subtree

    subtree = subtree_dict[subtree_key] # choose the appropriate subtree
    return classify(subtree, input)     # and use it to classify the input


def build_tree_id3(inputs, split_candidates=None):

    # if this is our first pass,
    # all keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues

    if num_trues == 0:                  # if only Falses are left
        return False                    # return a "False" leaf

    if num_falses == 0:                 # if only Trues are left
        return True                     # return a "True" leaf

    if not split_candidates:            # if no split candidates left
        return num_trues >= num_falses  # return the majority leaf

    # otherwise, split on the best attribute
    best_attribute = min(split_candidates,
        key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates
                      if a != best_attribute]

    # recursively build the subtrees
    subtrees = { attribute : build_tree_id3(subset, new_candidates)
         for attribute, subset in partitions.items() }

    subtrees[None] = num_trues > num_falses 　　　# default case

    return (best_attribute, subtrees)

print("building the tree")
tree = build_tree_id3(inputs)
print(tree)

print("Junior / Java / tweets / no phd", classify(tree,
    { "level" : "Junior",
      "lang" : "Java",
      "tweets" : "yes",
      "phd" : "no"} ))

print("Junior / Java / tweets / phd", classify(tree,
    { "level" : "Junior",
             "lang" : "Java",
             "tweets" : "yes",
             "phd" : "yes"} ))

print("Intern", classify(tree, { "level" : "Intern" } ))
print("Senior", classify(tree, { "level" : "Senior" } ))

SyntaxError: invalid character in identifier (<ipython-input-20-f3c5c25ecb6b>, line 54)

In [25]:
'''
隨機森林 (random forest)
'''
def forest_classify(trees, input):
    votes = [classify(tree, input) for tree in trees]
    vote_counts = Counter(votes)
    return vote_counts.most_common(1)[0][0]

# 如果可選擇的選項已經夠少，就全部採用
if len(split_candidates) <= self.num_split_candidates:
    sampled_split_candidates = split_candidates
# 要不然的話，就隨機取出其中一部分
else:
    sampled_split_candidates = random.sample(split_candidates, self.num_split_candidates)
    
# 現在只根據這些可選項目，挑選出其中的最佳屬性
best_attribute = min(sampled_split_candidates,
                    key = partial(partition_entropy_by, inputs))

partitions = partition_by(inputs, best_attribute)

NameError: name 'split_candidates' is not defined