# Build a decision tree algorithm
We can build the decision tree using either of two split criteria:
* Gini index 
* Information gain 


In [79]:
import pandas as pd
import numpy as np
# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`;
# the conventional plotting alias is `import matplotlib.pyplot as plt`. Nothing
# in the visible cells uses `plt`, so confirm before relying on it.
import matplotlib as plt
# Load the toy fruit table from a CSV file resolved relative to the working directory.
df=pd.read_csv("spindle.csv")
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant;
# it is kept because both `df` and `fruit` may be referenced by other cells.
fruit=pd.DataFrame(df)
# Bare last expression so the notebook renders the frame.
fruit

Unnamed: 0,Colour,Diameter,Lablel
0,Green,3,apple
1,yellow,3,apple
2,red,1,grape
3,red,1,grape
4,yellow,3,lemon


In [80]:
# Convert the non-numeric columns to integer codes.
# `d` maps each fruit label to an integer code. 'Lablel' (sic) is the column's
# actual name in the loaded table, so the spelling must be kept.
# NOTE(review): this cell overwrites the columns in place. Series.map is expected
# to yield NaN for values missing from the dict, so re-running the cell on the
# already-encoded frame would blank both columns -- re-load the CSV first.
d={'apple':0,'grape':1,'lemon':2}
fruit['Lablel']=fruit['Lablel'].map(d)
# `c` maps each colour string to an integer code; keys are case-sensitive
# ('Green' is capitalised, the other two are not).
c={'Green':0,'yellow':1,'red':2}
fruit['Colour']=fruit['Colour'].map(c)

In [81]:
# Inspect the column dtypes after the Colour and Lablel columns were encoded.
fruit.dtypes

Colour      int64
Diameter    int64
Lablel      int64
dtype: object

In [82]:
# Display the frame with the encoded Colour and Lablel columns.
fruit

Unnamed: 0,Colour,Diameter,Lablel
0,0,3,0
1,1,3,0
2,2,1,1
3,2,1,1
4,1,3,2


In [83]:
# This will be the training data: one list per sample, laid out as
# [colour, diameter, label] with the label in the last position.
# NOTE(review): this hand-written list (not the encoded `fruit` frame) is what
# the tree is built from, and it spells the colour "green" in lower case
# whereas the CSV shows "Green".
training_data=[["green",3,"apple"],
               ["yellow",3,"apple"],
               ["red",1,"grape"],
               ["red",1,"grape"],
               ["yellow",3,"lemon"]]

In [84]:
# Column labels for the table, in the same positional order as each row of
# training_data. Question.__repr__ indexes this list to print a readable
# feature name.
column=["colour","diameter","label"]


In [85]:
def count(rows):
    """Tally how many rows carry each label.

    The label of a row is taken to be its last element.

    Args:
        rows: iterable of indexable rows, e.g. ``["green", 3, "apple"]``.

    Returns:
        dict mapping each label to its number of occurrences, keyed in
        first-seen order; an empty dict when ``rows`` is empty.
    """
    tally = {}
    for record in rows:
        target = record[-1]
        # dict.get supplies the starting count of 0 for a label not seen yet.
        tally[target] = tally.get(target, 0) + 1
    return tally
def is_numeric(value):
    """Return True when ``value`` is an ``int`` or ``float`` instance.

    ``bool`` is a subclass of ``int``, so ``True`` and ``False`` also count
    as numeric here.
    """
    return isinstance(value, (int, float))

In [86]:
class Question:
    """A yes/no test on one feature of a row, used at a decision node.

    ``column`` is the positional index of the feature inside a row and
    ``value`` is what the feature is compared against.
    """

    def __init__(self, column, value):
        self.column, self.value = column, value

    def match(self, example):
        """Return whether ``example`` satisfies this question.

        A numeric feature value passes when it is at least ``self.value``;
        any other feature value passes only when it equals ``self.value``.
        """
        feature = example[self.column]
        if not is_numeric(feature):
            return feature == self.value
        return feature >= self.value

    def __repr__(self):
        """Render the question as readable text, e.g. ``Is diameter >= 3?``.

        The feature name is looked up in the notebook-level ``column`` list.
        """
        operator = ">=" if is_numeric(self.value) else "=="
        return "Is %s %s %s?" % (
            column[self.column], operator, str(self.value))

In [87]:
def partition(rows, question):
    """Split ``rows`` into those that satisfy ``question`` and those that do not.

    Args:
        rows: iterable of rows.
        question: object exposing ``match(row)``; a truthy result sends the
            row to the first list.

    Returns:
        ``(true_rows, false_rows)`` -- two lists that keep the original row
        order; either may be empty.
    """
    matched, unmatched = [], []
    for record in rows:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [88]:
def gini(rows):
    """Compute the Gini impurity of ``rows``.

    Formula: ``1 - sum(p_label ** 2)`` over the distinct labels, where
    ``p_label`` is the number of rows with that label divided by the total
    number of rows.

    Returns:
        The impurity; ``1`` when ``rows`` is empty, because there is no label
        share to subtract.
    """
    label_counts = count(rows)
    total = float(len(rows))
    impurity = 1
    # Subtract one squared share at a time (rather than summing first) so the
    # floating-point result stays the same as a sequential computation.
    for occurrences in label_counts.values():
        share = occurrences / total
        impurity -= share ** 2
    return impurity

In [89]:
def info_gain(left, right, current_uncertainty):
    """Return how much a split reduces impurity.

    gain = current_uncertainty
           - (len(left) / n) * gini(left)
           - (len(right) / n) * gini(right),   with n = len(left) + len(right)

    Args:
        left: rows on one side of the split.
        right: rows on the other side of the split.
        current_uncertainty: impurity of the rows before the split.

    Raises:
        ZeroDivisionError: when both ``left`` and ``right`` are empty.
    """
    n_left = len(left)
    weight_left = float(n_left) / (n_left + len(right))
    weighted_left = weight_left * gini(left)
    weighted_right = (1 - weight_left) * gini(right)
    return current_uncertainty - weighted_left - weighted_right

In [90]:
def find_best_split(rows):
    """Find the question that yields the highest information gain on ``rows``.

    Every distinct value of every feature column (all columns except the
    last, which holds the label) is tried as a question. Splits that leave
    one side empty are ignored.

    Args:
        rows: sequence of rows whose last element is the label.

    Returns:
        ``(best_gain, best_question)``. ``(0, None)`` when ``rows`` is empty
        or no question separates the rows into two non-empty groups.
    """
    # Nothing to split: return the "no split" result instead of failing on rows[0].
    if len(rows) == 0:
        return 0, None
    best_gain = 0
    best_question = None
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # the last column is the label
    for col in range(n_features):
        # De-duplicate in first-seen order. Iterating a set of strings follows
        # hash order, which varies between interpreter runs, so with the `>=`
        # tie rule below the chosen question could change from run to run.
        values = dict.fromkeys(row[col] for row in rows)
        for val in values:
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)
            # A split with an empty side separates nothing; skip it.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            # `>=` means that among equally good questions the last one tried wins.
            if gain >= best_gain:
                best_gain, best_question = gain, question
    return best_gain, best_question

#This part is for the construction of the tree


In [91]:
class Leaf:
    """Terminal node of the tree.

    ``predictions`` stores the result of ``count(rows)`` for the rows that
    reached this node, i.e. a mapping from label to number of occurrences.
    """

    def __init__(self, rows):
        self.predictions = count(rows)

In [92]:
class Decision_Node:
    """Internal node of the tree: a question plus the two subtrees it leads to.

    ``true_branch`` is followed by rows that satisfy ``question`` and
    ``false_branch`` by the rest.
    """

    def __init__(self,question,true_branch,false_branch):
        self.question, self.true_branch, self.false_branch = (
            question, true_branch, false_branch)

In [93]:
def build_tree(rows):
    """Recursively build a decision tree for ``rows``.

    The best question for ``rows`` is looked up first. When it brings no gain
    the rows become a ``Leaf``; otherwise the rows are partitioned by that
    question and a subtree is built for each side.

    Returns:
        A ``Leaf`` or a ``Decision_Node`` that is the root of the (sub)tree.
    """
    best_gain, best_question = find_best_split(rows)
    if best_gain != 0:
        matched, unmatched = partition(rows, best_question)
        yes_subtree = build_tree(matched)
        no_subtree = build_tree(unmatched)
        return Decision_Node(best_question, yes_subtree, no_subtree)
    # No question improves on the current impurity: stop splitting here.
    return Leaf(rows)

In [94]:
def print_tree(node, spacing=""):
    """Print the tree rooted at ``node`` as indented text.

    Args:
        node: a ``Leaf`` or a decision node exposing ``question``,
            ``true_branch`` and ``false_branch``.
        spacing: prefix put in front of every printed line; it grows by two
            spaces for each level of depth.
    """
    if not isinstance(node, Leaf):
        print(spacing + str(node.question))
        branches = (('--> True:', node.true_branch),
                    ('--> False:', node.false_branch))
        for heading, subtree in branches:
            print(spacing + heading)
            print_tree(subtree, spacing + "  ")
    else:
        # A leaf ends the recursion: show its label counts.
        print(spacing + "Predict", node.predictions)

In [95]:
# Build the tree from the hand-written training_data list.
# NOTE(review): `friut_classifier` looks like a typo for "fruit"; the name is
# also used by the cell that prints the tree, so rename both together.
friut_classifier = build_tree(training_data)

In [96]:
# Print the fitted tree; leaves show the label counts of the rows that reached them.
print_tree(friut_classifier)

Is diameter >= 3?
--> True:
  Is colour == yellow?
  --> True:
    Predict {'apple': 1, 'lemon': 1}
  --> False:
    Predict {'apple': 1}
--> False:
  Predict {'grape': 2}
