In [2]:
import csv
row_list = [["ID", "Age", "Income", "Gender", "Marital Status", "Buys"],
             [1, '<21', 'High', 'M', 'S', 'No'],
             [2, '<21', 'High', 'M', 'M', 'No'],
             [3, '21-35', 'High', 'M', 'S', 'Yes'],
             [4, '>35', 'Medium', 'M', 'S', 'Yes'],
             [5, '>35', 'Low', 'F', 'S', 'Yes'],
             [6, '>35', 'Low', 'F', 'M', 'No'],
             [7, '21-35', 'Low', 'F', 'M', 'Yes'],
             [8, '<21', 'Medium', 'M', 'S', 'No'],
             [9, '<21', 'Low', 'F', 'M', 'Yes'],
             [10, '>35', 'Medium', 'F', 'S', 'Yes'],
             [11, '<21', 'Medium', 'F', 'M', 'Yes'],
             [12, '21-35', 'Medium', 'M', 'M', 'Yes'],
             [13, '21-35', 'High', 'F', 'S', 'Yes'],
             [14, '>35' , 'Medium', 'M', 'M', 'No'],
            
           ]
with open('DecisionTreeClassifier.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(row_list)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from numpy import log2 as log

In [2]:
df = pd.read_csv('DecisionTreeClassifier.csv')

In [3]:
df

Unnamed: 0,ID,Age,Income,Gender,Marital Status,Buys
0,1,<21,High,M,S,No
1,2,<21,High,M,M,No
2,3,21-35,High,M,S,Yes
3,4,>35,Medium,M,S,Yes
4,5,>35,Low,F,S,Yes
5,6,>35,Low,F,M,No
6,7,21-35,Low,F,M,Yes
7,8,<21,Medium,M,S,No
8,9,<21,Low,F,M,Yes
9,10,>35,Medium,F,S,Yes


In [4]:
eps = np.finfo(float).eps

In [5]:
df = df.drop('ID',axis=1)
df

Unnamed: 0,Age,Income,Gender,Marital Status,Buys
0,<21,High,M,S,No
1,<21,High,M,M,No
2,21-35,High,M,S,Yes
3,>35,Medium,M,S,Yes
4,>35,Low,F,S,Yes
5,>35,Low,F,M,No
6,21-35,Low,F,M,Yes
7,<21,Medium,M,S,No
8,<21,Low,F,M,Yes
9,>35,Medium,F,S,Yes


In [6]:
def find_entropy(df):
    Class = df.keys()[-1]   
    entropy = 0
    values = df[Class].unique()
    for value in values:
        fraction = df[Class].value_counts()[value]/len(df[Class])
        entropy += -fraction*np.log2(fraction)
    return entropy

In [7]:
ig = find_entropy(df)
ig

0.9402859586706311

In [8]:
def find_entropy_attribute(df,attribute):
    Class = df.keys()[-1]  
    target_variables = df[Class].unique()  
    variables = df[attribute].unique()    
    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df[Class] ==target_variable])
            den = len(df[attribute][df[attribute]==variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        fraction2 = den/len(df)
        entropy2 += -fraction2*entropy
    return abs(entropy2)

In [14]:
entropy_age = find_entropy_attribute(df,'Age')
entropy_age
gain_age = ig - entropy_age
gain_age

0.24674981977443977

In [16]:
entropy_gender = find_entropy_attribute(df,'Gender')
gain_gender = ig - entropy_gender
gain_gender

0.15183550136234225

In [18]:
entropy_income = find_entropy_attribute(df,'Income')
gain_income = ig - entropy_income
gain_income

0.029222565658955535

In [19]:
entropy_MS = find_entropy_attribute(df,'Marital Status')
gainMS = ig- entropy_MS
gainMS

0.01611160637019049

In [21]:
def find_root(df):
    Entropy_att = []
    IG = []
    for key in df.keys()[:-1]:
        IG.append(find_entropy(df)-find_entropy_attribute(df,key))
        #print("Entropy", key, find_entropy_attribute(df,key))
    return df.keys()[:-1][np.argmax(IG)]

In [22]:
def get_subtable(df, node,value):
    return df[df[node] == value].reset_index(drop=True)

In [23]:
def buildTree(df,tree=None): 
    Class = df.keys()[-1]   
    
    
    node = find_root(df)
    
    attValue = np.unique(df[node])
    
    if tree is None:                    
        tree={}
        tree[node] = {}
    
  
    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable['Buys'],return_counts=True)                        
        
        if len(counts)==1:
            tree[node][value] = clValue[0]                                                    
        else:        
            tree[node][value] = buildTree(subtable)            
    return tree

In [24]:
tree = buildTree(df)

In [25]:
tree

{'Age': {'21-35': 'Yes',
  '<21': {'Gender': {'F': 'Yes', 'M': 'No'}},
  '>35': {'Marital Status': {'M': 'No', 'S': 'Yes'}}}}

In [26]:
def predict(inst,tree):

    for nodes in tree.keys():        
        
        value = inst[nodes]
        tree = tree[nodes][value]
        prediction = 0
            
        if type(tree) is dict:
            prediction = predict(inst, tree)
        else:
            prediction = tree
            break;                            
        
    return prediction

In [27]:
inst = df.iloc[6]

In [28]:
inst

Age               21-35
Income              Low
Gender                F
Marital Status        M
Buys                Yes
Name: 6, dtype: object

In [29]:
Prediction = predict(inst,tree)

In [30]:
Prediction

'Yes'

In [31]:
data = {'Age':'<21','Income':'Low','Gender':'F','MaritalStatus':'M'}

In [32]:
data

{'Age': '<21', 'Income': 'Low', 'Gender': 'F', 'MaritalStatus': 'M'}

In [33]:
NewPrediction = predict(data,tree)

In [34]:
NewPrediction

'Yes'