In [1]:
import numpy as np
import pandas as pd

In [13]:
# Column labels applied to the CSV in place of whatever the file itself provides.
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'type',
]

# skiprows=1 drops the first line of the file (presumably its own header row);
# header=None + names= then labels the columns with col_names.
data = pd.read_csv(
    "iris.csv",
    names=col_names,
    header=None,
    skiprows=1,
)

# Replace the 'type' strings with integer codes; class_names keeps the
# code -> original label lookup returned by pd.factorize.
data['type'], class_names = pd.factorize(data['type'])

data.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
6,5.4,3.9,1.7,0.4,0
7,4.6,3.4,1.4,0.3,0
8,5.0,3.4,1.5,0.2,0
9,4.4,2.9,1.4,0.2,0
10,4.9,3.1,1.5,0.1,0


In [6]:
# Show the unique labels returned by pd.factorize; position i is the label for integer code i.
class_names

Index(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='object')

In [14]:
# Swap the current index for a fresh 0-based RangeIndex; drop=True discards
# the old index instead of inserting it as a column.
data = data.reset_index(drop=True)

In [17]:
# Preview the first rows to confirm the index now starts at 0.
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [19]:
class Node:
    """One vertex of the decision tree.

    The same class is used for both kinds of node: decision nodes carry
    ``feature_index``, ``threshold``, ``left``, ``right`` and ``info_gain``;
    leaf nodes carry ``value``. Every field defaults to ``None``.
    """

    def __init__(
        self,
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        info_gain=None,
        value=None,
    ):
        """Store the given fields on the instance as-is (no validation)."""
        # Fields used when the node is a decision (internal) node.
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.info_gain = info_gain

        # Field used when the node is a leaf.
        self.value = value

In [20]:
class DecisionTreeClassifier():
    ''' Decision tree classifier grown by recursive binary splitting.

    NOTE(review): build_tree calls self.get_best_split and
    self.calculate_leaf_value, neither of which is defined inside this class
    body as shown here; they must be available on the instance for
    build_tree to run.
    '''

    def __init__(self, min_samples_split=2, max_depth=2):
        ''' Create an unfitted tree.

        min_samples_split: a node holding fewer samples than this becomes a leaf.
        max_depth: a node is only considered for splitting while
            curr_depth <= max_depth.
        '''

        # root of the tree, initialised empty
        self.root = None

        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0):
        ''' Recursively build the (sub)tree for `dataset`.

        dataset: 2-D array whose last column holds the labels and whose
            remaining columns hold the features.
        curr_depth: depth of the node being built (0 for the first call).

        Returns a decision Node when a split with positive information gain
        exists and the stopping conditions allow it, otherwise a leaf Node
        whose value comes from self.calculate_leaf_value.
        '''

        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # A result without an "info_gain" key means no usable split was
            # found (e.g. every candidate threshold left one side empty).
            # Treat that as zero gain so the node becomes a leaf instead of
            # raising KeyError.
            if best_split.get("info_gain", 0) > 0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                # return decision node
                return Node(
                    best_split["feature_index"],
                    best_split["threshold"],
                    left_subtree,
                    right_subtree,
                    best_split["info_gain"],
                )

        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)


        

In [None]:
def get_best_split(self,dataset,num_samples,num_features):
    ''' Find the (feature, threshold) split with the highest information gain.

    dataset: 2-D array; the last column is read as the labels and columns
        0..num_features-1 are scanned as features.
    num_samples: accepted for interface compatibility; not used here.
    num_features: number of leading columns to consider as features.

    Returns a dict with keys "feature_index", "threshold", "dataset_left",
    "dataset_right" and "info_gain" describing the best split, or an empty
    dict when no candidate threshold yields two non-empty partitions.

    NOTE(review): this is defined at module level but takes `self` and is
    called as self.get_best_split by DecisionTreeClassifier.build_tree, so it
    presumably belongs inside that class - confirm.
    '''

    # dictionary to store the best split
    best_split = {}
    max_info_gain = -float("inf")

    # loop over all the features
    for feature_index in range(num_features):
        feature_values = dataset[:, feature_index]
        # every distinct value of the feature is tried as a threshold
        possible_thresholds = np.unique(feature_values)
        for threshold in possible_thresholds:
            # get current split
            dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
            # only score splits where both children are non-empty
            if len(dataset_left)>0 and len(dataset_right)>0:
                y, left_y, right_y = dataset[:,-1], dataset_left[:,-1], dataset_right[:, -1]
                # compute information gain
                curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                # strict '>' keeps the first split found among equal gains
                if curr_info_gain > max_info_gain:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = threshold
                    best_split["dataset_left"] = dataset_left
                    best_split["dataset_right"] = dataset_right
                    best_split["info_gain"] = curr_info_gain
                    max_info_gain = curr_info_gain

    # return best split
    return best_split
        