<a href="https://colab.research.google.com/github/sathvik-ujwal/Lab-Codes/blob/main/MACHINE%20LEARNING/Codes/C4_5%26CART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Node:
    """One vertex of the decision tree.

    A node whose ``output`` is not ``None`` is treated as a leaf by
    ``predict``; otherwise ``feature``/``threshold`` describe the test and
    ``left``/``right`` hold the subtrees for "equal" and "not equal".
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, output=None):
        # Test applied at this node: sample[feature] == threshold.
        self.feature, self.threshold = feature, threshold
        # Subtrees followed when the test succeeds (left) or fails (right).
        self.left, self.right = left, right
        # Predicted label; stays None on internal nodes.
        self.output = output

In [3]:
def calculate_entropy(y):
    """Return the Shannon entropy of the labels in ``y`` using the natural log.

    ``y`` must hold non-negative integer class labels (it is passed to
    ``np.bincount``). An empty ``y`` yields 0.
    """
    n_samples = len(y)
    if n_samples == 0:
        return 0

    class_probs = np.bincount(y) / n_samples
    # Classes that never occur contribute nothing and would hit log(0).
    present = class_probs[class_probs > 0]
    return -np.sum(present * np.log(present))

In [5]:
def calculate_gini(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

    ``y`` must hold non-negative integer class labels (it is passed to
    ``np.bincount``). An empty ``y`` yields 0.
    """
    total = len(y)
    if not total:
        return 0

    fractions = np.bincount(y) / total
    return 1 - np.square(fractions).sum()


In [9]:
def split_data(x, y, feature, threshold):
    """Partition ``x``/``y`` by whether column ``feature`` equals ``threshold``.

    Returns ``(x_left, y_left, x_right, y_right)`` where the "left" rows are
    those whose value matches ``threshold`` and the "right" rows are the rest.
    """
    matches = x[:, feature] == threshold
    others = ~matches

    left_part = (x[matches], y[matches])
    right_part = (x[others], y[others])
    return left_part + right_part

In [8]:
def choose_best_feature(x, y, criterion="entropy"):
    """Find the (feature, value) equality split with the largest impurity drop.

    Every distinct value of every column is tried as a candidate; splits that
    leave one side empty are ignored. Impurity is entropy when ``criterion``
    is ``"entropy"`` and Gini for any other value.

    Returns ``(feature_index, value)``, or ``(-1, None)`` when no candidate
    produces two non-empty sides.
    """
    impurity = calculate_entropy if criterion == "entropy" else calculate_gini
    parent_impurity = impurity(y)
    n_total = len(y)

    top_gain = -1
    winner = (-1, None)

    for col in range(x.shape[1]):
        for candidate in np.unique(x[:, col]):
            _, y_match, _, y_rest = split_data(x, y, col, candidate)

            if not (len(y_match) and len(y_rest)):
                continue

            # Size-weighted impurity of the two children.
            children_impurity = (len(y_match) / n_total * impurity(y_match) +
                                 len(y_rest) / n_total * impurity(y_rest))
            gain = parent_impurity - children_impurity

            # Strict comparison: the earliest candidate wins ties.
            if gain > top_gain:
                top_gain = gain
                winner = (col, candidate)

    return winner


In [20]:
def build_tree(x, y, criterion="entropy"):
    """Recursively grow a decision tree and return its root ``Node``.

    Parameters:
        x: 2-D array of encoded feature values, one row per sample.
        y: 1-D array of non-negative integer class labels aligned with ``x``.
        criterion: forwarded to ``choose_best_feature`` ("entropy" or Gini).

    Returns:
        A ``Node``; leaves carry the predicted label in ``output``.

    Raises:
        ValueError: if ``y`` is empty.
    """
    if len(y) == 0:
        raise ValueError("cannot build a decision tree from an empty label array")

    # All remaining samples agree: emit a pure leaf.
    if len(set(y)) == 1:
        return Node(output=y[0])

    best_feature, best_threshold = choose_best_feature(x, y, criterion)

    if best_feature == -1:
        # No split separates the remaining samples although their labels
        # differ. Predict the majority class (ties go to the smallest label)
        # instead of a random draw, so the tree is deterministic.
        return Node(output=np.bincount(y).argmax())

    x_left, y_left, x_right, y_right = split_data(x, y, best_feature, best_threshold)

    left_node = build_tree(x_left, y_left, criterion)
    right_node = build_tree(x_right, y_right, criterion)

    return Node(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)


In [29]:
def predict(tree, sample):
    """Walk ``tree`` for one encoded ``sample`` and return the leaf's output.

    At each internal node the sample goes left when its value for the node's
    feature equals the node's threshold, and right otherwise. A node is a
    leaf as soon as its ``output`` is not ``None``.
    """
    node = tree
    while node.output is None:
        goes_left = sample[node.feature] == node.threshold
        node = node.left if goes_left else node.right
    return node.output

In [13]:
def encode_features(x):
    """Replace every value in each column of ``x`` with an integer code.

    Codes are assigned per column in order of first appearance, so the
    result is the same on every run. The input is left untouched; a new
    integer array is returned.

    Parameters:
        x: 2-D array-like of hashable feature values (rows are samples).

    Returns:
        ``(encoded, encodings)`` where ``encoded`` is an integer array with
        the shape of ``x`` and ``encodings[i]`` maps each original value of
        column ``i`` to its code.
    """
    # Object dtype keeps each value's own type (no str/int coercion) and
    # lets fixed-width string arrays be encoded without writing ints back
    # into a string buffer.
    values = np.asarray(x, dtype=object)
    encoded = np.empty(values.shape, dtype=int)
    encodings = {}

    for i in range(values.shape[1]):
        column = values[:, i]
        mapping = {}
        for val in column:
            # First-seen order; a set would reorder strings between runs.
            mapping.setdefault(val, len(mapping))
        encodings[i] = mapping
        encoded[:, i] = [mapping[val] for val in column]

    return encoded, encodings

In [14]:
def classify(sample, encoders, tree):
    """Encode a raw ``sample`` with ``encoders`` and return the tree's prediction.

    ``encoders[i]`` must map the raw value at position ``i`` to its integer
    code; a value missing from the mapping raises ``KeyError``.
    """
    encoded = []
    for position in range(len(sample)):
        column_codes = encoders[position]
        encoded.append(column_codes[sample[position]])
    return predict(tree, encoded)


In [15]:
# C4.5 Decision Tree
# Small weather dataset: four feature columns plus a Yes/No 'Decision' column.
data = dict(
    Outlook=['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    Temperature=[85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    Humidity=[85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    Wind=['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    Decision=['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'],
)
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Decision
0,Sunny,85,85,Weak,No
1,Sunny,80,90,Strong,No
2,Overcast,83,78,Weak,Yes
3,Rain,70,96,Weak,Yes
4,Rain,68,80,Weak,Yes


In [30]:
# Features are every column except the last; the label is 1 for "Yes", else 0.
x = df.iloc[:, :-1].to_numpy()
y = np.where(df['Decision'] == "Yes", 1, 0)

# Turn raw feature values into integer codes and keep the per-column mappings
# so that new samples can be encoded the same way.
x, encoders = encode_features(x)

# Grow the tree using entropy as the impurity measure.
c45_tree = build_tree(x, y, criterion="entropy")

# Classify one raw (unencoded) sample.
sample = ['Overcast', 83, 78, 'Weak']
results = classify(sample, encoders, c45_tree)
print(f"C4.5: {'Yes' if results == 1 else 'No'}")

C4.5: Yes
