In [60]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-for-decision-tree-classification/Purchase_new.csv
/kaggle/input/mushroom-classification/mushrooms.csv


In [38]:
from math import log2

In [39]:
data={'Alt':['T','T','F','T','T','F','F','F','F','T','F','T'],'Bar':['F','F','T','F','F','T','T','F','T','T','F','T'],'Fir': ['F','F','F','T','T','F','F','F','T','T','F','T'],'Hun':['T','T','F','T','F','T','F','T','F','T','F','T'],'Pat':['Some','Full','Some','Full','Full','Some','None','Some','Full','Full','None','Full'],'Price':['$$$','$','$','$','$$$','$$','$','$$','$','$$$','$','$'],'Rain':['F','F','F','F','F','T','T','T','T','F','F','F'],'Res':['T','F','F','F','T','T','F','T','F','T','F','F'],'Type':['French','Thai','Burger','Thai','French','Italian','Burger','Thai','Burger','Italian','Thai','Burger'],'Est':['0-10','30-60','0-10','10-30','>60','0-10','0-10','0-10','>60','10-30','0-10','30-60'],'Wait':['T','F','T','T','F','T','F','T','F','F','F','T']}


In [40]:
df=pd.DataFrame(data)
df

Unnamed: 0,Alt,Bar,Fir,Hun,Pat,Price,Rain,Res,Type,Est,Wait
0,T,F,F,T,Some,$$$,F,T,French,0-10,T
1,T,F,F,T,Full,$,F,F,Thai,30-60,F
2,F,T,F,F,Some,$,F,F,Burger,0-10,T
3,T,F,T,T,Full,$,F,F,Thai,10-30,T
4,T,F,T,F,Full,$$$,F,T,French,>60,F
5,F,T,F,T,Some,$$,T,T,Italian,0-10,T
6,F,T,F,F,,$,T,F,Burger,0-10,F
7,F,F,F,T,Some,$$,T,T,Thai,0-10,T
8,F,T,T,F,Full,$,T,F,Burger,>60,F
9,T,T,T,T,Full,$$$,F,T,Italian,10-30,F


In [41]:

#entropy calculation of an attribute
#entropy corresponds to the randomness
def entropy(attribute):
    #print(attribute)
    elements,count=np.unique(attribute,return_counts=True)
    length=len(elements)
    #print(length)
    total_count=0
    for i in range(length):
        total_count+=count[i]
    
    entropyy=0
    for i in range(length):
        p=count[i]/total_count
        entropyy+=(-1*p)*log2(p)
        
    return entropyy


In [42]:
print(entropy(df['Wait'])) #log base 2 remember

1.0


In [43]:


def InfoGain(data,split_attribute,target_name):
    total_entropy=entropy(data[target_name])
    vals,counts=np.unique(data[split_attribute],return_counts=True)
    target_vals,target_counts=np.unique(data[target_name],return_counts=True)
    num_attr=len(vals)
    target_num_attr=len(target_vals)
    weighted_entropy=0
    for i in range(num_attr):
        data_subset=data[data[split_attribute]==vals[i]] #as we right now want to deal with only one type of the particular attribute
        subset_entropy=0
        for j in range(target_num_attr):
            count_target_value=len(data_subset[data_subset[target_name]==target_vals[j]])
            p_target_value=count_target_value/counts[i] 
            if p_target_value>0:
                subset_entropy-=p_target_value*log2(p_target_value)
                
        weighted_entropy+=(counts[i]/np.sum(counts))*subset_entropy
        
    information_gain=total_entropy-weighted_entropy
    return information_gain



In [44]:

#choose the best attribute as a root for splitting further
def choose_best_attribute(data,attributes,target_attribute):
    information_gains=[]
    for attr in attributes:
        information_gains.append(InfoGain(data,attr,target_attribute))
        
    best_attr_index=np.argmax(information_gains) #the one with maximum information gain is the root attribute chosen
    return attributes[best_attr_index]


In [45]:

best_attribute=choose_best_attribute(df,df.columns[:-1],'Wait')
print(f"The best attribute to split on is: {best_attribute}")


The best attribute to split on is: Pat


In [46]:
class DecisionTree:
    def __init__(self,attribute):
        self.attribute=attribute
        self.children={}
        self.is_leaf=False
        self.classification=None
        
    def add_child(self,val,node):
        self.children[val]=node
        

In [54]:

def build_tree(data,df,attributes,target_attr):
    #base case, if leaf node is reached -- all target values now have same values
    #print(type(len(np.unique(data[target_attr]))))
    #print(type(1))
    if len(np.unique(data[target_attr])) <= 1: 
        #when in the data, all the target values have the same values, that means we have classified so stop
        leaf_node=DecisionTree(None)
        leaf_node.is_leaf=True
        leaf_node.classification=data[target_attr].iloc[0]
        return leaf_node
    
    elif len(data)==0 or len(attributes)==0:
        leaf_node=DecisionTree(None)
        leaf_node.is_leaf=True
        leaf_node.classification = np.unique(df[target_attr])[np.argmax(np.unique(df[target_attr], return_counts=True)[1])]
        return leaf_node
    
    else:
        best_attr=choose_best_attribute(data,attributes,target_attr)
        tree=DecisionTree(best_attr) #the best attribute chosen is now the root of the tree
        attributes=[attr for attr in attributes if attr!=best_attr] #all the remaining attributes
        
        for val in np.unique(data[best_attr]):
            sub_data = data[data[best_attr] == val].reset_index(drop=True) # Correctly create a sub-dataset for seperate types in the chosen best attribute
            subtree=build_tree(sub_data,df,attributes,target_attr) #build tree further for this data set created with the attributes apart from the best attr chosen
            tree.add_child(val,subtree)
            
        return tree
        

In [55]:

def print_tree(node, level=0, parent_attr=None, parent_value=None):
    indent = " " * (level * 4)  # Using 4 spaces for each level of indentation
    if parent_attr is not None and parent_value is not None:
        print(f"{indent}[{parent_attr} = {parent_value}]")
    if node.is_leaf:
        print(f"{indent}-> Classification: {node.classification}")
    else:
        print(f"{indent}Node: {node.attribute}")
        for value, child in node.children.items():
            print_tree(child, level+1, parent_attr=node.attribute, parent_value=value)
            
attributes=list(df.columns[:-1])
print(attributes)
decision_tree=build_tree(df,df,attributes,"Wait")
print_tree(decision_tree)



['Alt', 'Bar', 'Fir', 'Hun', 'Pat', 'Price', 'Rain', 'Res', 'Type', 'Est']
Node: Pat
    [Pat = Full]
    Node: Hun
        [Hun = F]
        -> Classification: F
        [Hun = T]
        Node: Type
            [Type = Burger]
            -> Classification: T
            [Type = Italian]
            -> Classification: F
            [Type = Thai]
            Node: Fir
                [Fir = F]
                -> Classification: F
                [Fir = T]
                -> Classification: T
    [Pat = None]
    -> Classification: F
    [Pat = Some]
    -> Classification: T


In [59]:
file_path="/kaggle/input/dataset-for-decision-tree-classification/Purchase_new.csv"
org_df=pd.read_csv(file_path)
print(org_df)

   Holiday Discount Free Delivery Purchase
0       No      Yes           Yes      Yes
1       No      Yes           Yes      Yes
2       No       No            No       No
3      yes      Yes           Yes      Yes
4      yes      Yes           Yes      Yes
5      yes       No            No       No
6      yes      Yes            No      Yes
7       No      Yes           Yes      Yes
8      yes      Yes           Yes      Yes
9      yes      Yes           Yes      Yes
10     yes       No           Yes      Yes
11     yes       No            No       No
12     yes      Yes           Yes      Yes
13     yes      Yes           Yes      Yes
14     yes      Yes           Yes      Yes
15      No      Yes           Yes      Yes
16     yes       No           Yes      Yes
17      No      Yes            No      Yes
18     yes       No            No      Yes
19     yes       No           Yes      Yes
20      No      Yes           Yes      Yes
21     yes      Yes           Yes       No
22     yes 

In [58]:
#Test Real world Dataset 1 -> org_df
#whether or not you should purchase an item or not

#print(org_df["Purchase"])
attributes=list(org_df.columns[:-1])
print(attributes)
#print(org_df["Purchase"])
decision_tree=build_tree(org_df,org_df,attributes,"Purchase")
print_tree(decision_tree)

['Holiday', 'Discount', 'Free Delivery']
Node: Discount
    [Discount = No]
    Node: Free Delivery
        [Free Delivery = No]
        Node: Holiday
            [Holiday = No]
            -> Classification: No
            [Holiday = yes]
            -> Classification: Yes
        [Free Delivery = Yes]
        Node: Holiday
            [Holiday = No]
            -> Classification: No
            [Holiday = yes]
            -> Classification: Yes
    [Discount = Yes]
    Node: Holiday
        [Holiday = No]
        -> Classification: Yes
        [Holiday = yes]
        Node: Free Delivery
            [Free Delivery = No]
            -> Classification: Yes
            [Free Delivery = Yes]
            -> Classification: Yes


In [61]:
file_path_2="/kaggle/input/mushroom-classification/mushrooms.csv"
mush_df=pd.read_csv(file_path_2)
print(mush_df)


     class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0        p         x           s         n       t    p               f   
1        e         x           s         y       t    a               f   
2        e         b           s         w       t    l               f   
3        p         x           y         w       t    p               f   
4        e         x           s         g       f    n               f   
...    ...       ...         ...       ...     ...  ...             ...   
8119     e         k           s         n       f    n               a   
8120     e         x           s         n       f    n               a   
8121     e         f           s         n       f    n               a   
8122     p         k           y         n       f    y               f   
8123     e         x           s         n       f    n               a   

     gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0               c         n

In [62]:
mush_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [67]:
#Test Real world Dataset 2 -> mush_df
#classifying whether a mushroom is safe to eat or deadly

attributes2=list(mush_df.columns[1:])
print(attributes2)
#print(org_df["Purchase"])
decision_tree2=build_tree(mush_df,mush_df,attributes2,"class")
print_tree(decision_tree2)

['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
Node: odor
    [odor = a]
    -> Classification: e
    [odor = c]
    -> Classification: p
    [odor = f]
    -> Classification: p
    [odor = l]
    -> Classification: e
    [odor = m]
    -> Classification: p
    [odor = n]
    Node: spore-print-color
        [spore-print-color = b]
        -> Classification: e
        [spore-print-color = h]
        -> Classification: e
        [spore-print-color = k]
        -> Classification: e
        [spore-print-color = n]
        -> Classification: e
        [spore-print-color = o]
        -> Classification: e
        [spore-print-color = r]
        -> Classification: p
        [spore-prin