# ID3 Classifier from Scratch

In [1]:
import pandas as pd
import numpy as np
eps = np.finfo(float).eps
from anytree import Node, NodeMixin, RenderTree
from anytree.importer import DictImporter
from numpy import log2 as log
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import Counter
from pprint import pprint
import json
import pydot
%matplotlib inline

In [2]:
# Load data file
bank = pd.read_csv('./bank/bank-full.csv', sep=';')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
bank.shape

(45211, 17)

In [4]:
numerical_features = list(bank.describe().columns)
bank.describe() # Describes the features that are numerical

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [5]:
# drop numerical features
bank.drop(numerical_features, axis = 1, inplace = True)

In [6]:
features = list(bank.drop(['y'], axis = 1, inplace = False).columns) # independent variables

In [7]:
# Missing values: None 
bank[bank.isnull().any(axis=1)].count()

job          0
marital      0
education    0
default      0
housing      0
loan         0
contact      0
month        0
poutcome     0
y            0
dtype: int64

## The ID3 Algorithm

In [8]:
# Takes a list of probabilities and calculates their entropy
def entropy(probs):
    return sum([-prob*math.log(prob, 2) for prob in probs])

# Takes a list of items with discrete values and returns the entropy for those items.
def entropy_of_list(a_list):
    cnt = Counter(x for x in a_list)
    num_instances = len(a_list)*1.0
    probs = [x / num_instances for x in cnt.values()]
    return entropy(probs)
    
# The initial entropy of the dependent attribute for our dataset.
total_entropy = entropy_of_list(bank['y'])
print(total_entropy)

0.5206312212538164


In [9]:
# Takes a DataFrame of attributes, and quantifies the entropy of a target 
# attribute after performing a split along the values of another attribute.

def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    df_split = df.groupby(split_attribute_name)
    nobs = len(df.index) * 1.0
    df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]
    df_agg_ent.columns = ['Entropy', 'PropObservations']
    if trace: 
        print(df_agg_ent)
        
    new_entropy = sum( df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy-new_entropy

print('\nInfo-gain for the poutcome attribute is ' + str(information_gain(bank, 'poutcome', 'y')))


Info-gain for the poutcome attribute is 0.042411254522755404


In [10]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    cnt = Counter(x for x in df[target_attribute_name])
    
    if len(cnt) == 1: # Leaf / Homogenous node
        return list(cnt.keys())[0]
    
    elif df.empty or (not attribute_names): # Data instance empty?
        return default_class 
    
    else:
        index_of_max = list(cnt.values()).index(max(cnt.values())) 
        default_class = list(cnt.keys())[index_of_max]
        
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
        index_of_max = gainz.index(max(gainz)) 
        best_attr = attribute_names[index_of_max]
        
        tree = {best_attr:{}}
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                        target_attribute_name,
                        remaining_attribute_names,
                        default_class)
            tree[best_attr][attr_val] = subtree
        return tree

In [11]:
tree = id3(bank, 'y', features)

In [12]:
# print(treeify(tree))

## Classification

In [13]:
def classify(instance, tree, default=None):
    attribute = list(tree.keys())[0]
    if instance[attribute] in tree[attribute].keys():
        result = tree[attribute][instance[attribute]]
        if isinstance(result, dict):
            return classify(instance, result)
        else:
            return result # this is a label
    else:
        return default

In [14]:
training_data = bank.iloc[0:36169, :] #  80% of the entire data goes for training
test_data = bank.iloc[36169:, :]      #  20% of the entire data goes for testing
train_tree = id3(training_data, 'y', features)

In [15]:
test_data['predicted'] = test_data.apply(classify, axis=1, args=(train_tree, 'yes'))

print('Accuracy is ' + str(sum(test_data['y']==test_data['predicted'] ) / (1.0*len(test_data.index))))

Accuracy is 0.5747622207476222


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## ID3 Tree

In [16]:
# working on a smaller sample for a comprehensible decision tree
sample = bank.sample(50)
tree = id3(sample, 'y', features)

In [17]:
def treeify(tree):
    tree_str = json.dumps(tree, indent=4)
    tree_str = tree_str.replace("\n    ", "\n")
    tree_str = tree_str.replace('"', "")
    tree_str = tree_str.replace(',', "")
    tree_str = tree_str.replace("{", "")
    tree_str = tree_str.replace("}", "")
    tree_str = tree_str.replace("    ", " | ")
    tree_str = tree_str.replace("  ", " ")
    return tree_str

In [18]:
def draw(parent_name, child_name):
    edge = pydot.Edge(parent_name, child_name)
    graph.add_edge(edge)

def visit(node, parent=None):
    for k,v in node.items():
        if isinstance(v, dict):
            # We start with the root node whose parent is None
            # we don't want to graph the None node
            if parent:
                draw(parent, k)
            visit(v, k)
        else:
            draw(parent, k)
            # drawing the label using a distinct name
            draw(k, k+'_'+v)

In [19]:
# graph = pydot.Dot(graph_type='graph')
# visit(tree)
# graph.write_png('ID3 decision tree.png')

In [20]:
# print(treeify(tree))


month: 
 | apr: no
 | aug: 
 | | job: 
 | | | management: no
 | | | retired: yes
 | | | technician: no
 | | 
 | 
 | dec: yes
 | feb: no
 | jul: no
 | jun: 
 | | job: 
 | | | admin.: no
 | | | blue-collar: no
 | | | housemaid: no
 | | | self-employed: no
 | | | technician: yes
 | | | unemployed: no
 | | 
 | 
 | mar: 
 | | job: 
 | | | management: yes
 | | | retired: no
 | | 
 | 
 | may: 
 | | job: 
 | | | admin.: no
 | | | blue-collar: no
 | | | management: no
 | | | services: no
 | | | technician: 
 | | | | marital: 
 | | | | | divorced: no
 | | | | | married: yes
 | | | | 
 | | | 
 | | 
 | 
 | nov: no
 | oct: yes
 | sep: 
 | | job: 
 | | | admin.: no
 | | | blue-collar: yes
 | | | retired: no
 | | 
 | 


