In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
iris = datasets.load_iris() ## load the data from iris datasets
x = iris.data
y = iris.target
iris.feature_names
iris.target_names ## means that we have to classify this into three group
type(iris)

sklearn.utils.Bunch

In [3]:
data = pd.DataFrame(x)
features = iris.feature_names
data.columns = features
data['Output'] = y  ##put the value of output in the last column
Name = iris.target_names
data.isnull().sum() ## this show that our data is clean, their is no  any null value is there
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Output
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [4]:
## it takes the y(output) as input and gives the entropy
def Entropy(y):
    ent = 0.0
    N = float(y.shape[0])
    count = np.unique(y,return_counts = True)
    ## count return me to array first one consist of all the no which are present 
    ## second one consists of their frequencies
    for i in count[1]:
        p = i/N
        ent += (-1.0)*(p)*np.log2(p)
    return ent

In [5]:
## Divide function- it split into two parts one which has value less than value 
## and another part which has more value
def divide_data(x_train,f,val):
    x_left = pd.DataFrame([],columns = x_train.columns)
    x_right = pd.DataFrame([],columns = x_train.columns)
    for i in range(x_train.shape[0]):
        temp = x_train[f].loc[i] ## we find the value of that feature column and 
        ## if that value is less than val,then put it into left side otherwise put it into right side
        if temp < val:
            x_left = x_left.append(x_train.loc[i])
        else:
            x_right = x_right.append(x_train.loc[i])
    return x_left,x_right

In [6]:
## this function gives me the no of count in each classes 

def find_count(x_train):
    count = []
    count.append(x_train[x_train['Output'] == 0].shape[0])
    count.append(x_train[x_train['Output'] == 1].shape[0])
    count.append(x_train[x_train['Output'] == 2].shape[0])
    return count

In [7]:
## it takes the whole dataframe,feature and value around which we are going to split
## and calculate the value of gain
def gain(x_train,f,val):
    left,right = divide_data(x_train,f,val)
    l = float(left.shape[0])/x_train.shape[0] ## calculating the weightage
    r = float(right.shape[0])/x_train.shape[0]
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1000000 ## we return the minimum value so that we are not take this feature
    ## because if all the element comes to any side then there is no meaning of splitting the data
    ans = Entropy(x_train.Output) - (l*Entropy(left.Output) + r*Entropy(right.Output))
    return ans

In [18]:
class DecisionTree:
    ## constructor
    def __init__(self,depth = 0,max_depth = 10):
        self.left = None
        self.right = None
        self.f = None
        self.val = None
        self.count = None
        self.max_depth = max_depth
        self.depth = depth
        self.target = None
    
    ## define the train function
    def train(self,x_train,Name):
        features =  ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)']
        info_gain = []
        for i in features:
            i_gain = gain(x_train,i,x_train[i].mean())
            info_gain.append(i_gain)
        self.f = features[np.argmax(info_gain)] ## maximum info gain part
        self.val = x_train[self.f].mean() ## mean of the maximum info part
        print("Level ",self.depth)
        cnt = 0     ## for knowing that how many elements in our node or it's pure node
        self.count = find_count(x_train) ## give the value of count in each class
        for i in range(len(self.count)):
            if(self.count[i]):   ## count of envery class which is not zero at that praticular node
                print("Count of ",Name[i]," = ",self.count[i])
                cnt += 1
        print("Current Entropy = ",Entropy(x_train.Output))
        if cnt != 1:
            print("Splitting on feature ",self.f," with information gain ",np.argmax(info_gain))
    
    
        ## split the data
        x_left,x_right = divide_data(x_train,self.f,self.val)
        x_left = x_left.reset_index(drop = True)## reset the index of dataframe
        x_right = x_right.reset_index(drop = True)
    
        ## base case 
        ## reach the leaf Node
        if cnt == 1:
            if x_train.Output.mean() >= 1.5:
                self.target = Name[2]
            elif x_train.Output.mean() <= 0.5:
                self.target = Name[0]
            else:
                self.target = Name[1]
            print("Reached Leaf Node")
            print()
            print()
            return
        
        
        ## check for the depth 
        ## if we want to see only the upper part of the tree
        if (self.depth >= self.max_depth):
            if x_train.Output.mean() >= 1.5:
                self.target = Name[2]
            elif x_train.Output.mean() <= 0.5:
                self.target = Name[0]
            else:
                self.target = Name[1]
            print("Max Depth Reached")
            print()
            print()
            return
        print()
        print()
        
        ## Do the recursive call 
        self.left = DecisionTree(depth = self.depth + 1,max_depth = self.max_depth)
        self.left.train(x_left,Name)
        self.right = DecisionTree(depth = self.depth + 1,max_depth = self.max_depth)
        self.right.train(x_right,Name)
     
        
        ## set the target at every Node
        if x_train.Output.mean() >= 1.5:
            self.target = Name[2]
        elif x_train.Output.mean() <= 0.5:
            self.target = Name[0]
        else:
            self.target = Name[1]
        
        print()
        print()
        return

In [19]:
dt = DecisionTree()  ## making object

In [20]:
dt.train(data,Name)

Level  0
Count of  setosa  =  50
Count of  versicolor  =  50
Count of  virginica  =  50
Current Entropy =  1.584962500721156
Splitting on feature  petal length (cm)  with information gain  2


Level  1
Count of  setosa  =  50
Count of  versicolor  =  7
Current Entropy =  0.5373760853377336
Splitting on feature  petal length (cm)  with information gain  2


Level  2
Count of  setosa  =  48
Current Entropy =  0.0
Reached Leaf Node


Level  2
Count of  setosa  =  2
Count of  versicolor  =  7
Current Entropy =  0.7642045065086203
Splitting on feature  petal width (cm)  with information gain  3


Level  3
Count of  setosa  =  2
Current Entropy =  0.0
Reached Leaf Node


Level  3
Count of  versicolor  =  7
Current Entropy =  0.0
Reached Leaf Node






Level  1
Count of  versicolor  =  43
Count of  virginica  =  50
Current Entropy =  0.9959094138937685
Splitting on feature  petal width (cm)  with information gain  3


Level  2
Count of  versicolor  =  42
Count of  virginica  =  5
Current Ent