<a href="https://colab.research.google.com/github/sagsarkar/ML_Coding/blob/main/decision_tree_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.datasets import make_blobs

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load scikit-learn's bundled breast-cancer dataset and pull out the feature
# matrix and the label vector.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# shape of X = (num_data_points, num_features); shape of y = (num_data_points,)
# Hold out 20% of the samples for testing. random_state is fixed so the same
# split is produced on every run; later cells reuse these four arrays.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)


# Initialize the DecisionTreeClassifier
# (entropy criterion and max_depth=10 mirror the settings used for the
# from-scratch DecisionTree trained further down, so the accuracies are comparable)
clf = DecisionTreeClassifier(random_state=42, max_depth=10, criterion="entropy")

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
# (fraction of test samples whose predicted label equals the true label)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9473684210526315


In [3]:
class TreeNode:
  """One node of a binary decision tree.

  An internal node carries a split (`feature_idx`, `threshold`) together with
  its `left` and `right` children; a leaf carries only a predicted `value`.
  Every field defaults to None, so callers fill in just the fields that apply.
  """

  def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
    # Split description: column index to test and the cut-off to compare against.
    self.feature_idx, self.threshold = feature_idx, threshold
    # Child subtrees reached on either side of the split.
    self.left, self.right = left, right
    # Prediction stored at a leaf (stays None when no value is supplied).
    self.value = value

In [4]:
class DecisionTree:
  """Binary decision-tree classifier that picks splits by information gain.

  Each internal node tests `x[feature_idx] <= threshold`; samples that satisfy
  the test go left, the rest go right. Leaves predict the majority class of the
  training samples that reached them.

  Parameters
  ----------
  min_samples : int
      A node holding fewer than this many samples is not split.
  max_depth : int
      Maximum depth of the tree (the root is at depth 0), so no leaf sits
      deeper than `max_depth`. `max_depth=0` produces a single leaf.
  """

  def __init__(self, min_samples=2, max_depth=10):
    self.min_samples = min_samples
    self.max_depth = max_depth
    self.root = None  # populated by fit()

  def _entropy(self, y):
    """Return the Shannon entropy (in bits) of the labels in `y`.

    An empty `y` has entropy 0.0.
    """
    y = np.asarray(y)
    if y.size == 0:
      return 0.0

    # Only observed classes are counted, so every probability is > 0 and
    # log2 never sees a zero.
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()

    return float(-np.sum(probabilities * np.log2(probabilities)))

  def _information_gain(self, feature, threshold, y, parent_entropy=None):
    """Return the entropy reduction from splitting `y` on `feature <= threshold`.

    Parameters
    ----------
    feature : array-like
        Values of one feature column, aligned element-wise with `y`.
    threshold : scalar
        Samples with `feature <= threshold` go left, the others go right.
    y : array-like
        Class labels.
    parent_entropy : float, optional
        Entropy of `y` if the caller has already computed it; it is computed
        here when omitted.

    Returns 0.0 for an empty `y` or when the split leaves one side empty.
    """
    feature = np.asarray(feature)
    y = np.asarray(y)

    n_samples = len(y)
    if n_samples == 0:
      return 0.0

    if parent_entropy is None:
      parent_entropy = self._entropy(y)

    left_indices = feature <= threshold
    left_y = y[left_indices]
    right_y = y[~left_indices]

    # A one-sided split leaves the labels unchanged, so it gains nothing.
    if len(left_y) == 0 or len(right_y) == 0:
      return 0.0

    left_weight = len(left_y) / n_samples
    right_weight = len(right_y) / n_samples

    new_entropy = (left_weight * self._entropy(left_y)) + (right_weight * self._entropy(right_y))

    return parent_entropy - new_entropy

  def _majority_class(self, y):
    """Return the most frequent label in `y`, or None when `y` is empty.

    Ties are resolved in favour of the label that appears first in `y`.
    """
    class_frequencies = Counter(y)
    if not class_frequencies:
      return None

    # max() keeps the first maximal key, and Counter preserves first-seen order.
    return max(class_frequencies, key=class_frequencies.get)

  def fit(self, X, y):
    """Build the tree from a feature matrix `X` and labels `y`.

    Parameters
    ----------
    X : array-like of shape (num_data_points, num_features)
    y : array-like of shape (num_data_points,)

    Raises
    ------
    ValueError
        If `X` is not 2-D, if `X` and `y` differ in length, or if there are
        no samples to learn from.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if X.ndim != 2:
      raise ValueError("X must be 2-dimensional: (num_data_points, num_features)")
    if len(X) != len(y):
      raise ValueError("X and y must contain the same number of samples")
    if len(y) == 0:
      raise ValueError("cannot fit a decision tree on an empty dataset")

    self.num_features = X.shape[1]
    self.root = self._grow_tree(X, y)

  def _grow_tree(self, X, y, depth=0):
    """Recursively build and return the subtree for the samples (`X`, `y`).

    `depth` is the depth of the node being created (0 for the root).
    """
    # Stop at the depth limit (>= so no node is ever split at depth max_depth)
    # or when too few samples remain to justify a split.
    if depth >= self.max_depth or len(y) < self.min_samples:
      return TreeNode(value = self._majority_class(y))

    # A pure node cannot gain anything from a split; skip the feature scan.
    parent_entropy = self._entropy(y)
    if parent_entropy == 0:
      return TreeNode(value = self._majority_class(y))

    best_gain = 0
    best_split = None

    for feature_idx in range(self.num_features):
      feature = X[:,feature_idx]
      # Evaluate each distinct value once. Candidates are kept in
      # first-occurrence order so that ties between equally good splits are
      # broken exactly as if every sample value had been tried in order.
      _, first_seen = np.unique(feature, return_index=True)
      for threshold in feature[np.sort(first_seen)]:
        gain = self._information_gain(feature, threshold, y, parent_entropy)
        if gain > best_gain:
          best_gain = gain
          best_split = (feature_idx, threshold)

    # No candidate improved on the parent: make this node a leaf.
    if best_split is None:
      return TreeNode(value = self._majority_class(y))

    left_indices = X[:,best_split[0]] <= best_split[1]
    right_indices = ~left_indices

    left = self._grow_tree(X[left_indices], y[left_indices], depth+1)
    right = self._grow_tree(X[right_indices], y[right_indices], depth+1)

    return TreeNode(feature_idx=best_split[0], threshold=best_split[1], left=left, right=right)

  def _traverse_tree(self, x):
    """Walk from the root to a leaf for one sample `x` and return the leaf's value."""
    node = self.root
    # Internal nodes are created without a value; only leaves carry one.
    while node.value is None:
      if x[node.feature_idx] <= node.threshold:
        node = node.left
      else:
        node = node.right

    return node.value

  def predict(self, X):
    """Return an array with the predicted class for every row of `X`.

    Raises
    ------
    RuntimeError
        If called before `fit`.
    """
    if getattr(self, "root", None) is None:
      raise RuntimeError("DecisionTree.predict called before fit")

    return np.array([self._traverse_tree(x) for x in np.asarray(X)])



In [5]:
# Train the from-scratch DecisionTree on the same training split, with the same
# max_depth as the scikit-learn baseline. Note that this rebinds `clf`, so the
# scikit-learn classifier fitted earlier is no longer reachable under that name.
clf = DecisionTree(max_depth=10)
clf.fit(X_train, y_train)

In [6]:
# Score the from-scratch tree on the held-out test split (this overwrites the
# `y_pred` produced by the scikit-learn baseline).
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Compare this figure with the baseline accuracy printed earlier.
print("Accuracy: ",acc)

Accuracy:  0.9298245614035088
