<a href="https://colab.research.google.com/github/sharavana07/ACE_Hack/blob/main/ex_8_CART_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

# Function to calculate Gini Impurity
def gini_impurity(labels):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``labels`` that belong to class ``k``.

    Args:
        labels: Array-like of class labels (anything ``np.unique`` accepts).

    Returns:
        float: ``0.0`` when ``labels`` is empty or holds a single class;
        larger values mean the classes are more evenly mixed.
    """
    _, counts = np.unique(labels, return_counts=True)
    total = counts.sum()
    if total == 0:
        # An empty collection has no class mixture; without this guard the
        # formula degenerates to 1 - 0 = 1, which is outside the valid range.
        return 0.0
    proportions = counts / total
    return float(1.0 - np.sum(proportions ** 2))

# Function to split dataset
def split_dataset(data, labels, feature_index, threshold):
    """Partition samples and labels on one feature against a threshold.

    Rows whose value in column ``feature_index`` is ``<= threshold`` form the
    left partition; every other row forms the right partition.

    Args:
        data: 2-D NumPy array indexed as ``data[:, feature_index]``.
        labels: NumPy array indexed with the same boolean row mask as ``data``.
        feature_index: Column of ``data`` to compare.
        threshold: Value the column is compared against.

    Returns:
        tuple: ``(left_data, left_labels, right_data, right_labels)``.
    """
    goes_left = data[:, feature_index] <= threshold
    goes_right = np.logical_not(goes_left)
    left_part = (data[goes_left], labels[goes_left])
    right_part = (data[goes_right], labels[goes_right])
    return (*left_part, *right_part)

# Function to find the best split
def best_split(data, labels):
    """Search every feature/threshold pair for the lowest weighted Gini split.

    Candidate thresholds are the distinct values present in each feature
    column. A candidate is ignored when it would leave one side empty. Only a
    strictly lower score replaces the current best, so the first candidate
    reaching the minimum wins.

    Args:
        data: 2-D NumPy array of samples (rows) by features (columns).
        labels: Array of class labels, one per row of ``data``.

    Returns:
        tuple: ``(feature, threshold)`` of the best split, or ``(None, None)``
        when no candidate separates the samples into two non-empty groups.
    """
    _, feature_count = data.shape
    # Running best as (weighted gini, feature index, threshold).
    winner = (float('inf'), None, None)

    for feature in range(feature_count):
        for threshold in np.unique(data[:, feature]):
            _, left_labels, _, right_labels = split_dataset(data, labels, feature, threshold)
            n_left, n_right = len(left_labels), len(right_labels)
            if n_left == 0 or n_right == 0:
                continue

            total = len(labels)
            # Each side's impurity is weighted by its share of the samples.
            weighted = (n_left / total) * gini_impurity(left_labels) + (n_right / total) * gini_impurity(right_labels)
            if weighted < winner[0]:
                winner = (weighted, feature, threshold)

    return winner[1], winner[2]

# Decision Tree node class
class TreeNode:
    """One node of the binary decision tree.

    An internal node stores a split rule (``feature``, ``threshold``) and its
    two children; a leaf node stores the predicted class in ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: index of the feature tested and the value it is compared to.
        self.feature, self.threshold = feature, threshold
        # Child subtrees on either side of the split.
        self.left, self.right = left, right
        # Predicted class; left as None on internal nodes.
        self.value = value

# Recursive function to build the decision tree
def build_tree(data, labels, depth=0, max_depth=5):
    """Recursively grow a decision tree and return its root ``TreeNode``.

    Growth stops at a node, which then becomes a leaf, when:
      * all labels are identical (the leaf predicts that label), or
      * ``depth`` has reached ``max_depth``, or
      * ``best_split`` cannot find a usable split.
    In the last two cases the leaf predicts the most common label.

    Args:
        data: 2-D NumPy array of samples reaching this node.
        labels: Array of class labels, one per row of ``data``.
        depth: Depth of the current node; the root is at 0.
        max_depth: Depth at which splitting stops.

    Returns:
        TreeNode: Root of the (sub)tree built from ``data`` and ``labels``.
    """
    # Pure node: nothing left to separate.
    if len(set(labels)) == 1:
        return TreeNode(value=labels[0])

    # At the depth limit no split is searched for, which falls through to the
    # same majority-vote leaf as "no usable split found".
    feature, threshold = (None, None) if depth >= max_depth else best_split(data, labels)
    if feature is None:
        return TreeNode(value=Counter(labels).most_common(1)[0][0])

    left_data, left_labels, right_data, right_labels = split_dataset(data, labels, feature, threshold)
    next_depth = depth + 1
    left_child = build_tree(left_data, left_labels, next_depth, max_depth)
    right_child = build_tree(right_data, right_labels, next_depth, max_depth)
    return TreeNode(feature, threshold, left_child, right_child)

# Function to predict using the decision tree
def predict_single(tree, sample):
    """Predict the class of one sample by walking the tree from its root.

    At each internal node the walk goes left when
    ``sample[node.feature] <= node.threshold`` and right otherwise. It stops
    at the first node whose ``value`` is not ``None``.

    Args:
        tree: Root node exposing ``feature``, ``threshold``, ``left``,
            ``right`` and ``value`` attributes.
        sample: Indexable feature vector for a single observation.

    Returns:
        The ``value`` stored on the leaf that the sample reaches.
    """
    node = tree
    while node.value is None:
        node = node.left if sample[node.feature] <= node.threshold else node.right
    return node.value

def predict(tree, data):
    """Predict a class for every sample in ``data``.

    Args:
        tree: Root node of a fitted decision tree.
        data: Iterable of samples; each one is passed to ``predict_single``.

    Returns:
        np.ndarray: Predictions in the same order as ``data``.
    """
    predictions = []
    for row in data:
        predictions.append(predict_single(tree, row))
    return np.array(predictions)


In [2]:

# Experiment settings, named so they can be tuned in one place
TEST_SIZE = 0.3      # fraction of the samples held out for evaluation
RANDOM_STATE = 42    # seed that makes the train/test split reproducible
MAX_DEPTH = 5        # depth limit passed to build_tree

# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Build the CART model
cart_tree = build_tree(X_train, y_train, max_depth=MAX_DEPTH)

# Predict and calculate accuracy
y_pred = predict(cart_tree, X_test)
# Sanity check: exactly one prediction per held-out sample
assert y_pred.shape == y_test.shape, "expected one prediction per test sample"
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy of the CART model:", accuracy)


Accuracy of the CART model: 0.9555555555555556
