In [1]:
from math import log
import operator
import pandas as pd

In [2]:
# column names
names = ["target", "cap-shape", "cap-surface", "cap-color", "bruises",
        "odor", "gill-attachment", "gill-spacing", "gill-size",
        "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring",
        "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring",
        "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color",
        "population", "habitat"]

In [3]:
# training data
df = pd.read_csv('../Resources/agaricus-lepiotadata.txt', header=None, names=names)

In [4]:
# select all columns of type object
cat_columns = df.select_dtypes(['object']).columns

In [5]:
# convert all object columns to type category
df[cat_columns] = df[cat_columns].apply(lambda x: x.astype('category'))

In [6]:
# convert categorical data to numeric
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

In [7]:
# convert DataFrame to 2D-list
data = df.values.tolist()

In [8]:
def entropy(data):
    # calculate the entropy of a data set
    
    entries = len(data)
    labels = {}
    # import pdb; pdb.set_trace()
    for row in data:
        label = row[0]
        if label not in labels.keys():
            labels[label] = 0
        labels[label] += 1
    entropy = 0.0
    for key in labels:
        probability = float(labels[key])/entries
        entropy -= probability * log(probability, 2)
    return entropy

In [9]:
def split(data, att, val):
    # split data on a given attribute
    
    newdata = []
    for row in data:
        if row[att] == val:
            reduced = row[:att]
            reduced.extend(row[att+1:])
            newdata.append(reduced)
    return newdata

In [10]:
def chooseAtt(data):
    # choose best attribute to split on
    
    attributes = len(data[0])
    exp_ent = entropy(data)
    gain = 0.0
    bestatt = -1
    for row in range(1, attributes):
        attlist = [ex[row] for ex in data]
        uniquevals = set(attlist)
        newentropy = 0.0
        for value in uniquevals:
            newdata = split(data, row, value)
            probability = len(newdata)/float(len(data))
            newentropy += probability * entropy(newdata)
        infogain = exp_ent - newentropy
        if (infogain > gain):
            gain = infogain
            bestatt = row
    return bestatt

In [11]:
def majVotting(majclass):
    # return majority class
    
    classcount = {}
    for vote in majclass:
        if vote not in classcount.keys():
            classcount[vote] = 0
        classcount[vote] += 1
        sortedclass = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
        return sortedclass[0][0]

In [12]:
def id3Tree(data, labels):
    # main class - generate decision tree
    import pdb; pdb.set_trace()
    classifier = [row[0] for row in data]
    if (classifier.count(classifier[0]) == len(classifier)):
        return classifier[0]
    if (len(data[0]) == 1):
        return majVotting(classifier)
    
    best_att = chooseAtt(data)
    best_label = labels[best_att]
    tree = {best_label: {}}
    del(labels[best_att])
    label_values = [row[best_att] for row in data]
    uniquevals = set(label_values)
    
    for value in uniquevals:
        sublabels = labels[:]
        tree[best_label][value] = id3Tree(split(data, best_att, value), sublabels)
    
    return tree

In [None]:
id3Tree(data, names)

> <ipython-input-12-4d2c251a0b62>(4)id3Tree()
-> classifier = [row[0] for row in data]
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(5)id3Tree()
-> if (classifier.count(classifier[0]) == len(classifier)):
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(7)id3Tree()
-> if (len(data[0]) == 1):
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(10)id3Tree()
-> best_att = chooseAtt(data)
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(11)id3Tree()
-> best_label = labels[best_att]
(Pdb) 
> <ipython-input-12-4d2c251a0b62>(12)id3Tree()
-> tree = {best_label: {}}
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(13)id3Tree()
-> del(labels[best_att])
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(14)id3Tree()
-> label_values = [row[best_att] for row in data]
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(15)id3Tree()
-> uniquevals = set(label_values)
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(17)id3Tree()
-> for value in uniquevals:
(Pdb) uniquevals
{0, 1, 2, 3, 4, 5, 6}
(Pdb) n
> <ipython-input-12-4d2c251a0b62>(18)id3Tree()
-> sublabels 