<a href="https://colab.research.google.com/github/ronakraj00/lab_ml/blob/main/decision_tree_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
from collections import Counter
import pandas as pd

def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Every distinct label contributes ``-p * log2(p)``, where ``p`` is that
    label's share of ``len(y)``. An empty ``y`` has no labels to sum over
    and therefore yields 0.
    """
    n_samples = len(y)
    weighted_log_sum = 0
    for occurrences in Counter(y).values():
        proportion = occurrences / n_samples
        weighted_log_sum += proportion * np.log2(proportion)
    return -weighted_log_sum


def split_data(X, y, feature_index, threshold):
    """Partition samples and labels on a single feature at ``threshold``.

    Rows whose value in column ``feature_index`` is ``<= threshold`` go to
    the left partition; rows whose value is ``> threshold`` go to the right.

    Returns:
        A 4-tuple ``(left_X, left_y, right_X, right_y)``.
    """
    column = X[:, feature_index]
    goes_left = column <= threshold
    goes_right = column > threshold
    left_part = (X[goes_left], y[goes_left])
    right_part = (X[goes_right], y[goes_right])
    return (*left_part, *right_part)


def information_gain(X, y, feature_index, threshold):
    """Return the entropy reduction from splitting on one feature/threshold.

    The gain is the entropy of ``y`` minus the size-weighted entropy of the
    two label partitions produced by ``split_data``. A split that leaves
    either side empty separates nothing and is reported as a gain of 0.
    """
    entropy_before = entropy(y)
    # Only the label halves matter here; the sample halves are discarded.
    _, labels_left, _, labels_right = split_data(X, y, feature_index, threshold)
    if not (len(labels_left) and len(labels_right)):
        return 0

    total = len(y)
    share_left = len(labels_left) / total
    share_right = len(labels_right) / total
    entropy_after = (
        share_left * entropy(labels_left)
        + share_right * entropy(labels_right)
    )
    return entropy_before - entropy_after


def best_split(X, y):
    """Search every feature and observed value for the highest-gain split.

    Each distinct value found in a feature column is tried as a threshold.
    A candidate replaces the current winner only when its information gain
    is strictly greater, so ties keep the earliest candidate.

    Returns:
        ``(feature_index, threshold)`` of the winning split, or
        ``(None, None)`` when no candidate has a gain above 0.
    """
    winner = (None, None)
    top_gain = 0
    n_features = X.shape[1]
    for column_index in range(n_features):
        for candidate in np.unique(X[:, column_index]):
            candidate_gain = information_gain(X, y, column_index, candidate)
            if candidate_gain > top_gain:
                top_gain = candidate_gain
                winner = (column_index, candidate)
    return winner


class TreeNode:
    """A node of a binary decision tree.

    A node created with ``value`` set holds a predicted label. Otherwise
    ``feature`` and ``threshold`` describe the test applied at the node,
    and ``left`` / ``right`` are the child nodes for the two outcomes.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # index of the feature tested at this node
        self.threshold = threshold  # value the feature is compared against
        self.left = left            # child node for one outcome of the test
        self.right = right          # child node for the other outcome
        self.value = value          # stored label; None when not set

    def __repr__(self):
        """Return a readable, recursive description of the subtree."""
        if self.value is not None:
            return f"TreeNode(value={self.value!r})"
        return (
            f"TreeNode(feature={self.feature!r}, threshold={self.threshold!r}, "
            f"left={self.left!r}, right={self.right!r})"
        )

def build_tree(X, y, depth=0, max_depth=3):
    """Recursively grow a decision tree from samples ``X`` and labels ``y``.

    Args:
        X: 2-D array of samples; columns are features.
        y: array of labels, one per row of ``X``.
        depth: depth of the node being built (0 for the root).
        max_depth: depth at or beyond which no further split is attempted.
            ``None`` disables the cap.

    Returns:
        The root ``TreeNode`` of the (sub)tree. A node becomes a leaf when
        all labels agree, when the depth cap is reached, or when
        ``best_split`` finds no split; in the last two cases it carries
        the most common label in ``y``.
    """
    # Pure node: every sample has the same label, nothing left to separate.
    if len(set(y)) == 1:
        return TreeNode(value=y[0])

    # Use >= rather than ==: a call that starts with depth already past
    # max_depth (or a negative max_depth) must still stop instead of
    # silently growing without any cap.
    depth_exhausted = max_depth is not None and depth >= max_depth
    if depth_exhausted:
        feature, threshold = None, None
    else:
        feature, threshold = best_split(X, y)

    if feature is None:
        majority_label = Counter(y).most_common(1)[0][0]
        return TreeNode(value=majority_label)

    left_X, left_y, right_X, right_y = split_data(X, y, feature, threshold)
    left_child = build_tree(left_X, left_y, depth + 1, max_depth)
    right_child = build_tree(right_X, right_y, depth + 1, max_depth)

    return TreeNode(feature=feature, threshold=threshold, left=left_child, right=right_child)

def predict(tree, x):
    """Return the label the tree assigns to the single sample ``x``.

    Starting at ``tree``, descend until a node with a non-None ``value`` is
    reached: go left when the sample's tested feature is ``<=`` the node's
    threshold, right otherwise.
    """
    node = tree
    while node.value is None:
        goes_left = x[node.feature] <= node.threshold
        node = node.left if goes_left else node.right
    return node.value


def predict_all(tree, X):
    """Return a list with the tree's prediction for each sample in ``X``.

    Samples are processed in iteration order, so the i-th entry of the
    result corresponds to the i-th sample.
    """
    labels = []
    for sample in X:
        labels.append(predict(tree, sample))
    return labels

# Small "play tennis" weather table: one entry per observed day.
data = {
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
        'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
    ],
    'Temperature': [
        85, 80, 83, 70, 68, 65, 64,
        72, 69, 75, 75, 72, 81, 71,
    ],
    'Humidity': [
        85, 90, 78, 96, 80, 70, 65,
        95, 70, 80, 70, 90, 75, 91,
    ],
    'Wind': [
        0, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 1, 0, 1,
    ],
    'PlayTennis': [
        0, 0, 1, 1, 1, 0, 1,
        0, 1, 1, 1, 1, 1, 0,
    ],
}
df = pd.DataFrame(data)

# Only the numeric columns are fed to the tree; 'Outlook' stays in the
# frame but is not part of the feature matrix.
X = df[['Temperature', 'Humidity', 'Wind']].to_numpy()
y = df['PlayTennis'].to_numpy()

tree = build_tree(X, y, max_depth=3)

# Predictions are made on the same rows the tree was built from.
predictions = predict_all(tree, X)
print("Predictions:", predictions)
print("Actual Labels:", list(y))


Predictions: [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0]
Actual Labels: [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0]
