# In [1]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# -----------------------
# Gini Index and Gini Gain
# -----------------------

def gini(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    labels that belong to class ``k``. It is 0.0 when every label is the
    same and grows as the classes become more evenly mixed.

    Args:
        y: Sized iterable of hashable labels (list, tuple, 1-D NumPy array).

    Returns:
        The impurity as a float. An empty collection contains no class mix
        and is reported as 0.0.
    """
    n = len(y)
    if n == 0:
        # Without this guard the sum below is empty and the result would be
        # 1, which is higher than the impurity of any real label set.
        return 0.0

    counts = Counter(y)
    return 1 - sum((count / n) ** 2 for count in counts.values())

def gini_gain(y, x_column, threshold):
    """Return the Gini gain of splitting ``y`` on ``x_column <= threshold``.

    The gain is the impurity of the parent node minus the size-weighted
    impurity of the two children::

        gain = gini(y) - (n_left / n) * gini(y_left) - (n_right / n) * gini(y_right)

    Args:
        y: 1-D NumPy array of class labels for the samples in the node.
        x_column: 1-D NumPy array holding one feature value per sample,
            aligned with ``y``.
        threshold: Samples with ``x_column <= threshold`` go to the left
            child, all others to the right child.

    Returns:
        The impurity reduction as a non-negative float. A threshold that
        leaves one side empty does not split the node and yields 0.0.
    """
    left_mask = x_column <= threshold
    right_mask = ~left_mask

    # Count each side once; the counts are reused for the weights below.
    n_left = np.sum(left_mask)
    n_right = np.sum(right_mask)
    if n_left == 0 or n_right == 0:
        return 0.0

    n = len(y)
    weighted_gini = (
        (n_left / n) * gini(y[left_mask])
        + (n_right / n) * gini(y[right_mask])
    )

    # Subtract from the parent's impurity (not from the constant 1) so the
    # value is a true gain on the same scale as the 0.0 returned above.
    # Gini impurity is concave, so the exact gain is never negative; the
    # clamp only removes floating-point rounding noise.
    return max(gini(y) - weighted_gini, 0.0)

# -----------------------
# Decision Tree (Gini-based)
# -----------------------

class DecisionTreeGini:
    """Decision-tree classifier that picks splits with ``gini_gain``.

    The fitted tree is stored in ``self.tree`` as nested tuples. An internal
    node is ``(feature_index, threshold, left_subtree, right_subtree)`` and
    a leaf is the bare class label. A sample goes to the left subtree when
    ``x[feature_index] <= threshold`` and to the right subtree otherwise.

    Args:
        max_depth: Maximum number of splits on any root-to-leaf path.
    """

    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        # Populated by fit(); None means the model has not been fitted.
        self.tree = None

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y``."""
        return Counter(y).most_common(1)[0][0]

    def best_split(self, X, y):
        """Find the feature/threshold pair with the highest split score.

        Args:
            X: 2-D NumPy array of shape ``(n_samples, n_features)``.
            y: 1-D NumPy array of labels aligned with the rows of ``X``.

        Returns:
            ``(feature_index, threshold)`` for the best split, or
            ``(None, None)`` when no threshold separates the samples
            (every feature is constant within this node).
        """
        best_gain = -np.inf
        best_feature, best_threshold = None, None
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # np.unique returns the values sorted. Splitting at the largest
            # one would send every sample left and leave the right child
            # empty, so it is not a candidate. Every smaller value leaves at
            # least one sample on each side.
            for threshold in np.unique(column)[:-1]:
                gain = gini_gain(y, column, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        Args:
            X: 2-D NumPy array with the samples that reached this node.
            y: 1-D NumPy array with their labels.
            depth: Depth of this node; the root is at depth 0.

        Returns:
            A class label for a leaf, or a
            ``(feature_index, threshold, left, right)`` tuple for a split.
        """
        # Stop on a pure node or when the depth budget is used up.
        if len(set(y)) == 1 or depth >= self.max_depth:
            return self._majority_label(y)

        feature, threshold = self.best_split(X, y)
        if feature is None:
            # No threshold separates the samples, so fall back to a leaf.
            return self._majority_label(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left = self.build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self.build_tree(X[right_mask], y[right_mask], depth + 1)

        return (feature, threshold, left, right)

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.tree``.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.
            y: Array-like of ``n_samples`` class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, if ``y`` is empty, or if ``X``
                and ``y`` hold a different number of samples.
        """
        # The tree builder relies on NumPy boolean-mask indexing, so accept
        # any array-like input and convert it once here.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty training set")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.tree = self.build_tree(X, y)

    def predict_one(self, x, node):
        """Return the label predicted for one sample.

        Args:
            x: Indexable feature vector of a single sample.
            node: Subtree to start from, normally ``self.tree``.

        Returns:
            The label stored in the leaf that ``x`` ends up in.
        """
        # Walk down until a leaf (anything that is not a tuple) is reached.
        while isinstance(node, tuple):
            feature, threshold, left, right = node
            node = left if x[feature] <= threshold else right
        return node

    def predict(self, X):
        """Return the predicted label for every sample in ``X``.

        Args:
            X: Iterable of feature vectors, e.g. a 2-D array.

        Returns:
            A list with one predicted label per sample.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            # Otherwise every prediction would silently come back as None.
            raise RuntimeError("DecisionTreeGini must be fitted before calling predict()")
        return [self.predict_one(x, self.tree) for x in X]

# -----------------------
# Run on Iris (binary)
# -----------------------

# Load Iris and drop class 2 so that only classes 0 and 1 remain.
iris = load_iris()
X, y = iris.data, iris.target
X, y = X[y != 2], y[y != 2]

# Hold out 30% of the samples for evaluation, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a depth-3 tree on the training split and predict the held-out split.
tree_gini = DecisionTreeGini(max_depth=3)
tree_gini.fit(X_train, y_train)
y_pred_gini = tree_gini.predict(X_test)

print("Gini-based Decision Tree Accuracy:", accuracy_score(y_test, y_pred_gini))


# Output: Gini-based Decision Tree Accuracy: 1.0
