In [11]:
from sklearn.datasets import load_iris
import pandas as pd
iris = load_iris()
# Put the measurements in a labelled frame and append the class ids as a
# 'target' column so features and labels travel together.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

display(iris_df.head())

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [12]:
from sklearn.model_selection import train_test_split
# Separate the label column from the feature columns.
y = iris_df['target']
X = iris_df.drop(columns=['target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (120, 4)
Testing set shape: (30, 4)


In [13]:
import numpy as np

def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Args:
        target_col: Array-like of labels (anything ``np.unique`` accepts).

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty input yields zero.
    """
    _, label_counts = np.unique(target_col, return_counts=True)
    # Relative frequency of every distinct label.
    probabilities = label_counts / np.sum(label_counts)
    return -np.sum(probabilities * np.log2(probabilities))

def info_gain(data, split_feature, target_feature):
    """Return the information gain of partitioning ``data`` on ``split_feature``.

    The data is partitioned into one group per distinct value of
    ``split_feature`` (exact equality). The gain is the entropy of
    ``target_feature`` over all rows minus the size-weighted average entropy
    of ``target_feature`` within each group.

    Args:
        data: DataFrame containing both columns.
        split_feature: Name of the column to partition on.
        target_feature: Name of the label column.

    Returns:
        The information gain as a float.
    """
    feature_column = data[split_feature]
    levels, level_sizes = np.unique(feature_column, return_counts=True)
    n_rows = np.sum(level_sizes)

    # Entropy of the labels inside each group, weighted by the group's share
    # of the rows.
    conditional_entropy = np.sum([
        (size / n_rows) * entropy(data.loc[feature_column == level, target_feature])
        for level, size in zip(levels, level_sizes)
    ])
    return entropy(data[target_feature]) - conditional_entropy

def split_data(data, split_feature, split_value):
    """Return the rows of ``data`` where ``split_feature`` equals ``split_value``.

    Args:
        data: DataFrame to filter.
        split_feature: Name of the column to compare.
        split_value: Value the column must equal for a row to be kept.

    Returns:
        A new DataFrame with the matching rows, re-indexed from 0.
    """
    matches = data[split_feature] == split_value
    return data.loc[matches].reset_index(drop=True)

def find_best_split_feature(data, target_feature):
    """Return the column of ``data`` with the highest information gain.

    Every column except ``target_feature`` is scored with ``info_gain``.
    Ties are resolved in favour of the column that appears first.

    Args:
        data: DataFrame holding the feature columns and the label column.
        target_feature: Name of the label column.

    Returns:
        The name of the best column, or ``None`` when no column has a gain
        strictly greater than -1 (for instance when there are no feature
        columns at all).
    """
    scored = [
        (info_gain(data, column, target_feature), column)
        for column in data.columns.drop(target_feature)
    ]
    # -1 is the sentinel the search starts from; a gain must beat it to count.
    eligible = [pair for pair in scored if pair[0] > -1]
    if not eligible:
        return None
    # max() keeps the first maximal element, so earlier columns win ties.
    return max(eligible, key=lambda pair: pair[0])[1]

def build_tree(data, target_feature, parent_node_class=None):
    """Recursively build a decision tree as nested dicts.

    Each internal node has the form ``{feature_name: {value: subtree, ...}}``
    with one branch per distinct value of the chosen feature (rows are
    matched by exact equality). Each leaf is a class label.

    Args:
        data: DataFrame holding the feature columns and the label column.
        target_feature: Name of the label column.
        parent_node_class: Majority label of the parent partition; returned
            when ``data`` has no rows.

    Returns:
        A nested dict describing the tree, or a single class label when the
        partition is a leaf.
    """
    # An empty partition must be handled before anything indexes into the
    # (empty) list of labels.
    if len(data) == 0:
        return parent_node_class

    classes, class_counts = np.unique(data[target_feature], return_counts=True)
    if len(classes) == 1:
        return classes[0]
    majority_class = classes[np.argmax(class_counts)]

    # Only a feature that still takes more than one value here can shrink the
    # partition. Splitting on a constant feature would reproduce the same data
    # and recurse forever.
    candidates = [
        column for column in data.columns
        if column != target_feature and data[column].nunique(dropna=False) > 1
    ]
    if not candidates:
        # Labels are mixed but no feature can separate them any further.
        return majority_class

    best_feature = find_best_split_feature(
        data[candidates + [target_feature]], target_feature
    )
    if best_feature is None:
        return majority_class

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        sub_data = split_data(data, best_feature, value)
        tree[best_feature][value] = build_tree(sub_data, target_feature, majority_class)
    return tree
# Re-attach the labels to the training features so the builder receives a
# single frame, then grow the tree on it.
c45_tree = build_tree(
    pd.concat((X_train, y_train), axis="columns"),
    target_feature='target',
)
print(c45_tree)

{'petal length (cm)': {np.float64(1.0): np.int64(0), np.float64(1.1): np.int64(0), np.float64(1.2): np.int64(0), np.float64(1.3): np.int64(0), np.float64(1.4): np.int64(0), np.float64(1.5): np.int64(0), np.float64(1.6): np.int64(0), np.float64(1.7): np.int64(0), np.float64(1.9): np.int64(0), np.float64(3.0): np.int64(1), np.float64(3.3): np.int64(1), np.float64(3.5): np.int64(1), np.float64(3.7): np.int64(1), np.float64(3.8): np.int64(1), np.float64(3.9): np.int64(1), np.float64(4.0): np.int64(1), np.float64(4.1): np.int64(1), np.float64(4.2): np.int64(1), np.float64(4.3): np.int64(1), np.float64(4.4): np.int64(1), np.float64(4.5): {'sepal length (cm)': {np.float64(4.9): np.int64(2), np.float64(5.4): np.int64(1), np.float64(5.6): np.int64(1), np.float64(6.0): np.int64(1), np.float64(6.4): np.int64(1)}}, np.float64(4.6): np.int64(1), np.float64(4.7): np.int64(1), np.float64(4.8): {'sepal length (cm)': {np.float64(5.9): np.int64(1), np.float64(6.0): np.int64(2), np.float64(6.2): np.int64

In [14]:
def predict(instance, tree):
    """Return the class label the tree assigns to ``instance``.

    Args:
        instance: Mapping-like object (e.g. a DataFrame row or a dict)
            indexed by feature name.
        tree: Nested-dict tree of the form
            ``{feature_name: {value: subtree_or_label, ...}}``, or a bare
            label for a leaf.

    Returns:
        The label stored at the leaf that is reached. When the instance's
        value for a node's feature has no matching branch, the node's first
        branch is followed instead.
    """
    if not isinstance(tree, dict):
        return tree

    root_node = list(tree)[0]
    branches = tree[root_node]
    split_value = instance[root_node]
    if split_value in branches:
        return predict(instance, branches[split_value])

    # Unseen value: fall back to the first branch. That branch may itself be
    # a subtree, so keep descending to guarantee the result is a label and
    # not a nested dict.
    return predict(instance, list(branches.values())[0])


def evaluate(tree, X_test, y_test):
    """Return the fraction of rows in ``X_test`` that the tree labels correctly.

    Args:
        tree: Nested-dict tree (or bare label) accepted by ``predict``.
        X_test: DataFrame of feature rows; rows are read positionally.
        y_test: True labels in the same order as the rows of ``X_test``.
            May be a Series, array or plain list.

    Returns:
        The mean of the element-wise ``prediction == label`` comparison.

    Raises:
        ValueError: If ``y_test`` does not have one label per row.
    """
    predictions = np.asarray(
        [predict(X_test.iloc[i], tree) for i in range(len(X_test))]
    )
    # Compare as arrays: a list compared with another list would collapse to a
    # single bool instead of one result per row.
    actual = np.asarray(y_test)
    if len(actual) != len(predictions):
        raise ValueError(
            f"y_test has {len(actual)} labels but X_test has {len(predictions)} rows"
        )
    accuracy = np.mean(predictions == actual)
    return accuracy
# Score the tree on the held-out rows.
accuracy = evaluate(tree=c45_tree, X_test=X_test, y_test=y_test)
print(f"Accuracy of the C4.5 tree: {accuracy:.2f}")

Accuracy of the C4.5 tree: 0.80


In [18]:
def print_tree(node, depth=0, feature_names=None):
    """Print a nested-dict tree as indented text.

    Args:
        node: Either a dict of the form ``{feature: {value: subtree, ...}}``
            or a leaf label.
        depth: Current nesting level; each level adds two spaces of indent.
        feature_names: Passed through to recursive calls unchanged; the
            printed text does not depend on it.
    """
    pad = "  " * depth

    # Leaf: print the label and stop.
    if not isinstance(node, dict):
        print(f"{pad}  Predict: {node}")
        return

    split_on = list(node)[0]
    print(f"{pad}Split on feature: {split_on}")
    for branch_value, child in node[split_on].items():
        print(f"{pad}  Value == {branch_value}:")
        print_tree(child, depth + 1, feature_names)
# Use the dataset's feature names when `iris` exists in this session.
try:
    feature_names = iris.feature_names
except NameError:
    # `iris` has not been defined (the loading cell was not run); go without.
    feature_names = None

print_tree(node=c45_tree, feature_names=feature_names)

Split on feature: petal length (cm)
  Value == 1.0:
    Predict: 0
  Value == 1.1:
    Predict: 0
  Value == 1.2:
    Predict: 0
  Value == 1.3:
    Predict: 0
  Value == 1.4:
    Predict: 0
  Value == 1.5:
    Predict: 0
  Value == 1.6:
    Predict: 0
  Value == 1.7:
    Predict: 0
  Value == 1.9:
    Predict: 0
  Value == 3.0:
    Predict: 1
  Value == 3.3:
    Predict: 1
  Value == 3.5:
    Predict: 1
  Value == 3.7:
    Predict: 1
  Value == 3.8:
    Predict: 1
  Value == 3.9:
    Predict: 1
  Value == 4.0:
    Predict: 1
  Value == 4.1:
    Predict: 1
  Value == 4.2:
    Predict: 1
  Value == 4.3:
    Predict: 1
  Value == 4.4:
    Predict: 1
  Value == 4.5:
  Split on feature: sepal length (cm)
    Value == 4.9:
      Predict: 2
    Value == 5.4:
      Predict: 1
    Value == 5.6:
      Predict: 1
    Value == 6.0:
      Predict: 1
    Value == 6.4:
      Predict: 1
  Value == 4.6:
    Predict: 1
  Value == 4.7:
    Predict: 1
  Value == 4.8:
  Split on feature: sepal length (cm)