In [40]:
import numpy as np
import pandas as pd
from collections import Counter

# Load the iris dataset
def load_iris():
    from sklearn.datasets import load_iris
    iris = load_iris()
    return pd.DataFrame(data=iris.data, columns=iris.feature_names), iris.target

# Calculate Gini Impurity
def gini_impurity(y):
    if len(y) == 0:
        return 0
    counts = np.bincount(y)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities ** 2)

# Split dataset based on a feature and a threshold
def split_dataset(X, y, feature_index, threshold):
    left_indices = X[:, feature_index] < threshold
    right_indices = X[:, feature_index] >= threshold
    return X[left_indices], y[left_indices], X[right_indices], y[right_indices]

# Find the best split for the dataset
def best_split(X, y):
    best_gini = float('inf')
    best_index = None
    best_threshold = None
    n_features = X.shape[1]
    
    for feature_index in range(n_features):
        thresholds = np.unique(X[:, feature_index])
        for threshold in thresholds:
            X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)
            gini_left = gini_impurity(y_left)
            gini_right = gini_impurity(y_right)
            gini = (len(y_left) * gini_left + len(y_right) * gini_right) / len(y)
            
            if gini < best_gini:
                best_gini = gini
                best_index = feature_index
                best_threshold = threshold
                
    return best_index, best_threshold

# Decision Tree Node
class DecisionTreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

# Build the Decision Tree
def build_tree(X, y, depth=0, max_depth=None):
    num_samples_per_class = [np.sum(y == i) for i in range(len(set(y)))]
    predicted_class = np.argmax(num_samples_per_class)
    
    # Stopping criteria
    if len(set(y)) == 1 or len(y) == 0 or (max_depth is not None and depth >= max_depth):
        return DecisionTreeNode(value=predicted_class)
    
    feature_index, threshold = best_split(X, y)
    
    if feature_index is None:
        return DecisionTreeNode(value=predicted_class)
    
    X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)
    
    left_node = build_tree(X_left, y_left, depth + 1, max_depth)
    right_node = build_tree(X_right, y_right, depth + 1, max_depth)
    
    return DecisionTreeNode(feature_index, threshold, left_node, right_node)

# Predict using the Decision Tree
def predict(sample, tree):
    while tree.value is None:
        if sample[tree.feature_index] < tree.threshold:
            tree = tree.left
        else:
            tree = tree.right
    return tree.value

# Main function to train and predict
def main():
    # Load the data
    X, y = load_iris()
    X = X.values
    y = y
    
    # Build the tree
    tree = build_tree(X, y, max_depth=3)
    
    # Make predictions on the training set
    predictions = [predict(sample, tree) for sample in X]
    
    # Calculate accuracy
    accuracy = np.sum(predictions == y) / len(y)
    print(f'Accuracy: {accuracy * 100:.2f}%')

if __name__ == "__main__":
    main()

Accuracy: 66.67%


In [2]:
import numpy as np

# Sample dataset
# Columns: Feature1, Feature2, Feature3, Feature4, Target
data = np.array([
    [1, 2, 1, 3, 0],
    [1, 1, 3, 2, 1],
    [2, 2, 1, 1, 0],
    [2, 1, 3, 3, 1],
    [3, 3, 1, 2, 0]
])

# Helper function: Calculate Gini Impurity
def gini_impurity(groups, classes):
    total_instances = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        size = len(group)
        if size == 0:  # Avoid division by zero
            continue
        score = 0.0
        for class_val in classes:
            proportion = [row[-1] for row in group].count(class_val) / size
            score += proportion ** 2
        gini += (1 - score) * (size / total_instances)
    return gini

# Helper function: Split dataset
def split_data(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Find the best split
def get_best_split(dataset):
    classes = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):  # Iterate through features
        for row in dataset:
            groups = split_data(index, row[index], dataset)
            gini = gini_impurity(groups, classes)
            if gini < best_score:
                best_index, best_value, best_score, best_groups = index, row[index], gini, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

# Build a decision tree
def build_tree(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:  # If no split
        node['left'] = node['right'] = to_terminal(left + right)
        return
    if depth >= max_depth:  # Max depth reached
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_best_split(left)
        build_tree(node['left'], max_depth, min_size, depth+1)
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_best_split(right)
        build_tree(node['right'], max_depth, min_size, depth+1)

# Terminal node prediction
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print(f"{' ' * depth}[X{node['index']} < {node['value']}]")
        print_tree(node['left'], depth + 1)
        print_tree(node['right'], depth + 1)
    else:
        print(f"{' ' * depth}[Leaf: {node}]")

# Predict with the tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

# Main execution
root = get_best_split(data)
build_tree(root, max_depth=3, min_size=1, depth=1)
print("Decision Tree:")
print_tree(root)

# Test prediction
new_data = [2, 2, 3, 1]
prediction = predict(root, new_data)
print(f"\nPrediction for {new_data}: {prediction}")


Decision Tree:
[X1 < 2]
 [X0 < 1]
  [Leaf: 1]
  [Leaf: 1]
 [X0 < 1]
  [Leaf: 0]
  [Leaf: 0]

Prediction for [2, 2, 3, 1]: 0


In [4]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

# Load dataset
data = load_iris()
X, y = data.data, data.target

# Train Decision Tree
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X, y)

# Visualize the tree
tree_rules = export_text(tree_clf, feature_names=data.feature_names)
print(tree_rules)


|--- petal length (cm) <= 2.45
|   |--- class: 0
|--- petal length (cm) >  2.45
|   |--- petal width (cm) <= 1.75
|   |   |--- petal length (cm) <= 4.95
|   |   |   |--- class: 1
|   |   |--- petal length (cm) >  4.95
|   |   |   |--- class: 2
|   |--- petal width (cm) >  1.75
|   |   |--- petal length (cm) <= 4.85
|   |   |   |--- class: 2
|   |   |--- petal length (cm) >  4.85
|   |   |   |--- class: 2

