In [9]:
import pandas as pd
import math
from collections import deque

In [None]:
# Path of the CSV to load (the file name suggests min-max normalized features — confirm).
data_path = "min_max_normalized_data.csv"
# Read the CSV into a DataFrame.
df = pd.read_csv(data_path)
# Distinct values found in the 'Target' column.
classes = df['Target'].unique()
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

Decision Tree From Scratch

In [1]:
def entropy(df, target):
    """Compute the Shannon entropy (in bits) of a column's value distribution.

    Args:
        df: pandas DataFrame holding the samples.
        target: Name of the column whose entropy is measured.

    Returns:
        ``sum(-p_i * log2(p_i))`` over the relative frequencies ``p_i`` of
        the distinct values in ``df[target]``, as a float. A column with a
        single distinct value (or no values at all) yields ``0.0``.
    """
    # Relative frequency of each distinct value, as reported by value_counts.
    probabilities = df[target].value_counts(normalize=True)

    # Negate each term instead of the whole sum so that a pure column gives
    # 0.0 rather than -0.0; the float start value keeps the result a float
    # even when there is nothing to sum.
    return sum((-pi * math.log2(pi) for pi in probabilities if pi > 0), 0.0)

In [3]:
def gain(df, total_entropy, attribute, target):
    """Information gain obtained by splitting ``df`` on ``attribute``.

    Args:
        df: pandas DataFrame holding the samples.
        total_entropy: Entropy of ``target`` over ``df`` before the split.
        attribute: Name of the column to split on.
        target: Name of the label column.

    Returns:
        ``total_entropy`` minus the size-weighted entropy of the subsets
        produced by grouping ``df`` on ``attribute``.
    """
    row_count = len(df)

    # Add up each subset's entropy, weighted by its share of the rows.
    remainder = 0
    for _, subset in df.groupby(attribute):
        remainder += (len(subset) / row_count) * entropy(subset, target)

    return total_entropy - remainder

In [16]:
# Toy "play tennis" dataset: one list per column, 14 rows each.
data = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
        'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'Light', 'Strong', 'Light', 'Light', 'Light', 'Strong', 'Strong',
        'Light', 'Light', 'Light', 'Strong', 'Strong', 'Light', 'Strong',
    ],
    'PlayTennis': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
target = 'PlayTennis'
df = pd.DataFrame(data)
# Every column except the label is a candidate split attribute.
attributes = [column for column in data if column != target]

# Entropy of the label column over the whole dataset.
total_entropy = entropy(df, target)
print("Total entropy:", total_entropy)

# Information gain from splitting on 'Outlook'.
info_gain = gain(df, total_entropy, 'Outlook', target)
print("Information Gain for Outlook:", info_gain)

Total entropy: 0.9402859586706311
Information Gain for Outlook: 0.24674981977443933


In [7]:
def select_attribute(df, total_entropy, attributes, target):
    """Return the attribute whose split yields the largest information gain.

    Args:
        df: pandas DataFrame holding the samples.
        total_entropy: Entropy of ``target`` over ``df``.
        attributes: Candidate column names to evaluate.
        target: Name of the label column.

    Returns:
        The element of ``attributes`` with the highest ``gain``; on a tie
        the earliest candidate wins.
    """
    def score(candidate):
        # Information gain of splitting df on this candidate column.
        return gain(df, total_entropy, candidate, target)

    return max(attributes, key=score)

In [8]:
# Pick the most informative attribute for the first split of the toy dataset.
best_attribute = select_attribute(
    df=df,
    total_entropy=total_entropy,
    attributes=attributes,
    target='PlayTennis',
)
print("Best attribute:", best_attribute)

Best attribute: Outlook


In [11]:
class Node:
    """A tree node carrying a payload and an ordered list of child nodes."""

    def __init__(self, data):
        # Every instance starts with its own empty child list.
        self.children = list()
        # Payload stored at this node (a label, a split description, ...).
        self.data = data

In [12]:
def add_child(parentNode, node):
    """Attach ``node`` as the last child of ``parentNode``."""
    siblings = parentNode.children
    siblings.append(node)

In [18]:
def decision_tree_algorithm(df, attributes, target, root=None, parentNode=None):
    """Recursively build a decision tree by greedy information-gain splits.

    At each level the attribute returned by ``select_attribute`` partitions
    the rows; one ``"<attribute>=<value>"`` node is created per partition and
    the function recurses into it with that attribute removed. Only the
    outermost call creates a node labelled with the bare attribute name;
    deeper splits hang their value nodes directly under the parent value node.

    Args:
        df: pandas DataFrame with the samples for this subtree.
        attributes: Column names still available for splitting.
        target: Name of the label column.
        root: Root of the tree under construction; leave as ``None`` for the
            outermost call.
        parentNode: Node that new children are attached to; leave as ``None``
            for the outermost call.

    Returns:
        The tree's root node. If the outermost call already meets the stopping
        condition, the single leaf node is returned instead.
    """
    # Stopping condition: no attributes left, or every row has the same label.
    if not attributes or len(df[target].unique()) == 1:
        leaf_node = Node(df[target].mode()[0])  # most common label
        # Explicit None checks: a Node's truthiness must not decide attachment.
        if parentNode is not None:
            add_child(parentNode, leaf_node)
        return root if root is not None else leaf_node

    # Choose the attribute with the highest information gain for this subset.
    total_entropy = entropy(df, target)
    best_attribute = select_attribute(df, total_entropy, attributes, target)

    # The outermost call creates the root and attaches its partitions to it.
    if root is None:
        root = Node(best_attribute)
        parentNode = root

    # Identical for every partition, so computed once outside the loop.
    remaining_attributes = [attr for attr in attributes if attr != best_attribute]

    # One child per observed value of the chosen attribute, then recurse.
    for value, partition in df.groupby(best_attribute):
        node = Node(f"{best_attribute}={value}")
        add_child(parentNode, node)
        decision_tree_algorithm(partition, remaining_attributes, target, root, node)

    return root

In [14]:
def print_tree(node, depth=0):
    """Print the tree rooted at ``node``, one node per line.

    Each line is indented by two spaces per level, starting at ``depth``,
    and children are printed in order right after their parent.
    """
    pending = [(node, depth)]
    while pending:
        current, level = pending.pop()
        print("  " * level + str(current.data))
        # Push children in reverse so they pop off in their original order.
        pending.extend(
            (child, level + 1) for child in reversed(list(current.children))
        )

In [19]:
# Build the tree for the toy dataset and show it as an indented outline.
root = decision_tree_algorithm(df=df, attributes=attributes, target=target)
print_tree(node=root)

Outlook
  Outlook=Overcast
    Yes
  Outlook=Rain
    Wind=Light
      Yes
    Wind=Strong
      No
  Outlook=Sunny
    Humidity=High
      No
    Humidity=Normal
      Yes
