<a href="https://colab.research.google.com/github/snehachavhan2004/MLfromscratch/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

In [None]:
# Toy dataset: six samples with two binary features each.
# In this data the label always equals the first feature.
X = np.array(
    [
        [0, 0],
        [1, 1],
        [1, 0],
        [0, 1],
        [1, 1],
        [0, 0],
    ]
)
y = np.array([0, 1, 1, 0, 1, 0])

In [None]:
class DecisionTree:
    """Binary decision-tree classifier that splits on information gain.

    Every instance represents one node of the tree. An internal node sends a
    sample to ``left`` when ``sample[feature_index] < threshold`` and to
    ``right`` otherwise; a leaf stores its predicted label in ``value``.

    Labels must be non-negative integers because class counts are computed
    with ``np.bincount``.
    """

    def __init__(self, depth=0, max_depth=3):
        """Create an unfitted node.

        Args:
            depth: Depth of this node within the tree (0 for the root).
            max_depth: Depth at which a node is forced to become a leaf.
        """
        self.depth = depth
        self.max_depth = max_depth
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.value = None

    def fit(self, X, y):
        """Grow the subtree rooted at this node from the given samples.

        Any state left over from a previous call is discarded first, so the
        same object can be fitted more than once.

        Args:
            X: 2-D array of shape (num_samples, num_features).
            y: 1-D array of non-negative integer labels, one per row of ``X``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        # Reset the node so a refit cannot leave a stale leaf value or stale
        # children behind (predict() checks ``value`` before anything else).
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.value = None

        if np.unique(y).size == 1 or self.depth >= self.max_depth:
            self.value = self._most_common_label(y)
            return

        # Search with locals and commit only once the best split is known.
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_mask = column < threshold
                # A split that leaves one side empty separates nothing and
                # would make the recursive fit fail on an empty child.
                if not left_mask.any() or left_mask.all():
                    continue
                gain = self._information_gain(X, y, feature_index, threshold)
                # Strict ">" keeps the first split found among equal gains.
                # Zero-gain splits are still accepted, which lets the tree
                # solve XOR-like problems one level further down.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_index
                    best_threshold = threshold

        if best_feature is None:
            # No feature can separate the samples (all columns are constant).
            self.value = self._most_common_label(y)
            return

        self.feature_index = best_feature
        self.threshold = best_threshold

        # The right side is the complement of the left so the partition is
        # exactly the routing rule used by predict().
        left_mask = X[:, best_feature] < best_threshold
        right_mask = ~left_mask
        self.left = DecisionTree(self.depth + 1, self.max_depth)
        self.right = DecisionTree(self.depth + 1, self.max_depth)
        self.left.fit(X[left_mask], y[left_mask])
        self.right.fit(X[right_mask], y[right_mask])

    def _information_gain(self, X, y, feature_index, threshold):
        """Return the entropy reduction from splitting on one threshold.

        Args:
            X: 2-D feature array.
            y: 1-D label array aligned with the rows of ``X``.
            feature_index: Column of ``X`` to split on.
            threshold: Rows with a value below this go to the left side.

        Returns:
            Parent entropy minus the size-weighted child entropy, or 0 when
            one side of the split would be empty.
        """
        left_mask = X[:, feature_index] < threshold
        num_total = len(y)
        # Count the selected rows; the mask itself always has len(y) entries.
        num_left = np.count_nonzero(left_mask)
        num_right = num_total - num_left
        if num_left == 0 or num_right == 0:
            return 0

        left_entropy = self._entropy(y[left_mask])
        right_entropy = self._entropy(y[~left_mask])
        child_entropy = (
            (num_left / num_total) * left_entropy
            + (num_right / num_total) * right_entropy
        )
        return self._entropy(y) - child_entropy

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``."""
        if len(y) == 0:
            return 0.0
        counts = np.bincount(y)
        # Drop absent classes so log2 is never evaluated at zero.
        proportions = counts[counts > 0] / len(y)
        return -np.sum(proportions * np.log2(proportions))

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        return np.bincount(y).argmax()

    def predict(self, X):
        """Predict the label for one sample or for a batch of samples.

        Args:
            X: Either a 1-D feature vector (one sample) or a 2-D array with
                one sample per row.

        Returns:
            The predicted label for a 1-D input, or a 1-D array of predicted
            labels for a 2-D input.
        """
        X = np.asarray(X)
        if X.ndim > 1:
            return np.array([self.predict(sample) for sample in X])
        if self.value is not None:
            return self.value
        if X[self.feature_index] < self.threshold:
            return self.left.predict(X)
        return self.right.predict(X)

    def print_tree(self, indent=""):
        """Print the subtree rooted at this node, one node per line.

        Args:
            indent: Prefix for this node's line; children are printed with
                two extra spaces.
        """
        if self.value is not None:
            print(f"{indent}Predict: {self.value}")
        else:
            print(f"{indent}X[{self.feature_index}] < {self.threshold}")
            self.left.print_tree(indent + "  ")
            self.right.print_tree(indent + "  ")

# Build a tree limited to depth 3, grow it on the full sample dataset, and
# print the learned splits (one node per line, children indented).
tree = DecisionTree(max_depth=3)
tree.fit(X, y)
tree.print_tree()

# Classify a single unseen sample; predict() is given one 1-D feature vector.
new_point = np.array([0, 0])
prediction = tree.predict(new_point)
print("Predicted class for new point:", prediction)

X[0] < 1
  Predict: 0
  Predict: 1
Predicted class for new point: 0


In [None]:
# Evaluate the tree on a held-out split of the sample data.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Hold out a third of the samples for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit a fresh tree on the training data only. Reusing the tree that was
# already fitted on the full dataset risks carrying over state from that
# earlier fit into this experiment.
tree = DecisionTree(max_depth=3)
tree.fit(X_train, y_train)

# Predict the class for each test sample, one feature vector at a time.
y_pred = np.array([tree.predict(x) for x in X_test])

# Calculate and print the evaluation metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
