In [31]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import math
from google.colab import drive
# Mount Google Drive so the CSV files stored under MyDrive/data are readable.
drive.mount('/content/drive')

# Iris: read the CSV, then swap the three species names for integer codes
# wherever they occur in the frame.
iris_species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
df = pd.read_csv('/content/drive/MyDrive/data/iris.csv').replace(iris_species_codes)

# Spambase: read as-is, no recoding applied here.
df2 = pd.read_csv('/content/drive/MyDrive/data/spambase.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
#ALL FUNCTIONS USED

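# Check whether any feature column still has more than one distinct value (i.e. a split is possible).
# Without this check, the tree kept trying to split over and over on data with no usable features.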
def can_split(X):
    """Tell whether at least one column of ``X`` holds two or more distinct values.

    Parameters:
        X: 2-D NumPy array, indexed as ``X[:, column]``.

    Returns:
        True as soon as a column with more than one unique value is found,
        False otherwise (including when ``X`` has no columns).
    """
    for column_index in range(X.shape[1]):
        distinct_values = np.unique(X[:, column_index])
        if distinct_values.size > 1:
            return True
    return False

def most_common_label(y, default_label):
    """Return the label that occurs most often in ``y``.

    Parameters:
        y: sequence or array of labels.
        default_label: value handed back unchanged when ``y`` is empty.

    Returns:
        The most frequent label. When several labels share the top count, the
        smallest one wins, because ``np.unique`` returns labels sorted and
        ``argmax`` picks the first maximum.
    """
    if not len(y):
        return default_label

    labels, frequencies = np.unique(y, return_counts=True)
    return labels[frequencies.argmax()]

def entropy(y):
    """Compute the Shannon entropy (base 2) of the label distribution in ``y``.

    Parameters:
        y: sequence or array of labels.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. An empty ``y`` contributes no terms, so the result is 0.
    """
    _, counts = np.unique(y, return_counts=True)
    shares = counts / counts.sum()

    terms = []
    for share in shares:
        # Guard keeps log2(0) out of the sum; a zero share adds nothing.
        terms.append(share * np.log2(share) if share > 0 else 0)

    return -np.sum(terms)

def info_gain(X, y, feature_index, threshold):
    """Information gain from splitting on ``X[:, feature_index] <= threshold``.

    Parameters:
        X: 2-D NumPy feature array.
        y: NumPy label array aligned with the rows of ``X``.
        feature_index: column of ``X`` to test.
        threshold: rows with a value <= threshold go left, the rest go right.

    Returns:
        Parent entropy minus the size-weighted entropy of the two children,
        or 0 when the split leaves one side empty.
    """
    goes_left = X[:, feature_index] <= threshold
    left_labels = y[goes_left]
    right_labels = y[~goes_left]

    n_left, n_right = len(left_labels), len(right_labels)
    if not (n_left and n_right):
        # A one-sided split separates nothing.
        return 0

    n_total = len(y)
    children_entropy = (n_left / n_total) * entropy(left_labels) + (n_right / n_total) * entropy(right_labels)
    return entropy(y) - children_entropy

def find_best_split(X, y):
    """Search every feature for the threshold with the highest information gain.

    Candidate thresholds for a feature are the midpoints between consecutive
    distinct values of that column.

    Parameters:
        X: 2-D NumPy feature array.
        y: NumPy label array aligned with the rows of ``X``.

    Returns:
        ``(feature_index, threshold)`` of the best candidate. The first
        candidate reaching the top gain is kept (strict ``>`` comparison).
        ``(None, None)`` when no column has two distinct values, i.e. there is
        no candidate at all.
    """
    # Starting sentinel sits below any gain info_gain returns, so the first
    # candidate is always accepted -- even one whose gain is 0.
    top_gain = -3
    chosen_feature, chosen_threshold = None, None

    for feature_index in range(X.shape[1]):
        distinct = np.unique(X[:, feature_index])
        # Midpoints between neighbouring sorted unique values.
        midpoints = (distinct[:-1] + distinct[1:]) / 2

        for threshold in midpoints:
            gain = info_gain(X, y, feature_index, threshold)
            if gain > top_gain:
                top_gain = gain
                chosen_feature, chosen_threshold = feature_index, threshold

    return chosen_feature, chosen_threshold

def is_leaf(y, n_min):
    """Decide whether a node holding labels ``y`` should stop splitting.

    Parameters:
        y: labels of the samples at the node.
        n_min: sample-count limit; a node with ``len(y) <= n_min`` is a leaf.

    Returns:
        True when ``y`` contains exactly one distinct label or has at most
        ``n_min`` entries, False otherwise.
    """
    return bool(len(np.unique(y)) == 1 or len(y) <= n_min)

def build_tree(X, y, n_min):
    """Recursively grow a binary decision tree.

    Parameters:
        X: 2-D NumPy feature array.
        y: labels aligned with the rows of ``X``. Any dtype ``np.unique`` can
           sort is accepted (integers, floats, strings, negative codes).
        n_min: a node with ``len(y) <= n_min`` becomes a leaf.

    Returns:
        Either a leaf (the majority label of ``y``; ties go to the smallest
        label) or a decision node as the tuple
        ``(feature_index, threshold, left_subtree, right_subtree)``, where the
        left subtree receives rows with ``X[:, feature_index] <= threshold``.

    Raises:
        ValueError: if ``y`` is empty, since no majority label exists.
    """
    y = np.asarray(y)
    if y.size == 0:
        raise ValueError("build_tree() needs at least one sample; got an empty label array")

    # One np.unique call serves both the purity check and the majority vote.
    # Unlike np.bincount it is not limited to non-negative integer labels.
    labels, counts = np.unique(y, return_counts=True)
    majority_label = labels[np.argmax(counts)]

    if len(labels) == 1 or len(y) <= n_min:
        # Pure node, or too few samples to keep splitting: emit a leaf.
        return majority_label

    feature, threshold = find_best_split(X, y)
    if feature is None:
        # find_best_split found no candidate threshold (every feature column
        # is constant here), so the node cannot be divided further.
        return majority_label

    left_mask = X[:, feature] <= threshold
    right_mask = ~left_mask
    left_child = build_tree(X[left_mask], y[left_mask], n_min)
    right_child = build_tree(X[right_mask], y[right_mask], n_min)

    # Decision node: (feature index, threshold, left subtree, right subtree)
    return (feature, threshold, left_child, right_child)

In [None]:
# Helper called by predict_samples to classify a single sample:
def predict(tree, sample):
    """Classify one sample by walking the tree from the root to a leaf.

    Parameters:
        tree: a leaf value, or a decision node of the form
              ``(feature_index, threshold, left_subtree, right_subtree)``.
        sample: indexable feature vector.

    Returns:
        The leaf value reached; anything that is not a tuple counts as a leaf.
    """
    node = tree
    while isinstance(node, tuple):
        feature, threshold, left_child, right_child = node
        # Values <= threshold follow the left branch, larger ones the right.
        node = left_child if sample[feature] <= threshold else right_child
    return node

def predict_samples(tree, X):
    """Classify every row of ``X`` with ``predict``.

    Parameters:
        tree: decision tree in the format ``predict`` accepts.
        X: iterable of samples (e.g. the rows of a 2-D array).

    Returns:
        NumPy array holding one predicted label per sample, in input order.
    """
    predicted = []
    for row in X:
        predicted.append(predict(tree, row))
    return np.array(predicted)

def calculate_accuracy(X, y, tree):
    """Score ``tree`` on the samples ``X`` against the true labels ``y``.

    Parameters:
        X: iterable of samples passed to ``predict_samples``.
        y: true labels aligned with ``X``.
        tree: decision tree in the format ``predict`` accepts.

    Returns:
        Whatever ``accuracy_score(y, predictions)`` returns for the tree's
        predictions on ``X``.
    """
    return accuracy_score(y, predict_samples(tree, X))


In [None]:
def cross_validate_decision_tree(X, y, nmin_values, n_splits=10, random_state=42):
    """Estimate decision-tree accuracy by k-fold cross-validation for each nmin.

    Parameters:
        X: 2-D NumPy feature array.
        y: NumPy label array aligned with the rows of ``X``.
        nmin_values: iterable of percentages. For each fold, a node stops
            splitting once it holds at most ``ceil(nmin% of the training
            fold size)`` samples.
        n_splits: number of folds handed to ``KFold`` (default 10).
        random_state: seed for the shuffled ``KFold`` (default 42).

    Returns:
        List of ``(nmin, mean_accuracy)`` tuples, one per entry of
        ``nmin_values`` and in the same order.
    """
    # Shuffled folds with a fixed seed. NOTE(review): per the scikit-learn docs an
    # integer seed should reproduce the same folds on each split() call -- confirm.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    results = []

    for nmin in nmin_values:
        accuracies = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Turn the percentage into an absolute sample count for this fold.
            n_min_count = int(np.ceil(nmin / 100 * len(y_train)))

            tree = build_tree(X_train, y_train, n_min_count)
            accuracies.append(calculate_accuracy(X_test, y_test, tree))

        results.append((nmin, np.mean(accuracies)))
    return results

In [33]:
# Iris dataset experiment

# Every column but the last is a feature; the last column is the target label.
X_iris = df.iloc[:, :-1].values
y_iris_raw = df.iloc[:, -1]

# Number the distinct target values 0, 1, 2, ... in order of first appearance.
unique_labels = y_iris_raw.unique()
label_to_int = dict(zip(unique_labels, range(len(unique_labels))))
y_iris = np.array([label_to_int[raw_label] for raw_label in y_iris_raw])

# Stopping thresholds to compare, as percentages of the training-fold size.
nmin_values = [5, 10, 15, 20]

# Cross-validate one tree configuration per nmin value.
results_iris = cross_validate_decision_tree(X_iris, y_iris, nmin_values)

# Report the mean accuracy for each nmin.
print("Iris Dataset:")
for nmin, mean_accuracy in results_iris:
    print(f"nmin = {nmin}%: Average Accuracy = {mean_accuracy:.4f}")

Iris Dataset:
nmin = 5%: Average Accuracy = 0.9395
nmin = 10%: Average Accuracy = 0.9467
nmin = 15%: Average Accuracy = 0.9467
nmin = 20%: Average Accuracy = 0.9467


In [34]:
# Spambase dataset experiment

# Every column but the last is a feature; the last column is the target label.
X_spam = df2.iloc[:, :-1].values
y_spam_raw = df2.iloc[:, -1]

# Number the distinct target values 0, 1, ... in order of first appearance.
unique_labels = y_spam_raw.unique()
label_to_int = dict(zip(unique_labels, range(len(unique_labels))))
y_spam = np.array([label_to_int[raw_label] for raw_label in y_spam_raw])

# Stopping thresholds to compare, as percentages of the training-fold size.
nmin_values = [5, 10, 15, 20, 25]

# Cross-validate one tree configuration per nmin value.
results_spam = cross_validate_decision_tree(X_spam, y_spam, nmin_values)

# Report the mean accuracy for each nmin.
print("Spambase Dataset:")
for nmin, mean_accuracy in results_spam:
    print(f"nmin = {nmin}%: Average Accuracy = {mean_accuracy:.4f}")

Spambase Dataset:
nmin = 5%: Average Accuracy = 0.9026
nmin = 10%: Average Accuracy = 0.8900
nmin = 15%: Average Accuracy = 0.8685
nmin = 20%: Average Accuracy = 0.8589
nmin = 25%: Average Accuracy = 0.8276
