In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load and encode the dataset
df = pd.read_csv("/content/mushrooms.csv")
print(df)
label_encoders = {}
for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

FileNotFoundError: [Errno 2] No such file or directory: '/content/mushrooms.csv'

In [None]:
# Splitting dataset into 70-10-20
X = df.drop('class', axis=1)
y = df['class']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)
print(f"{'Training set size:':<22} {len(X_train)}")
print(f"{'Validation set size:':<22} {len(X_val)}")
print(f"{'Test set size:':<22} {len(X_test)}")

Training set size:     5686
Validation set size:   812
Test set size:         1626


In [None]:
# Implementing ID3 Tree
class TreeNode:
    def __init__(self, feature=None, value=None, left=None, right=None, label=None):
        self.feature = feature
        self.value = value
        self.left = left
        self.right = right
        self.label = label

class ID3Tree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None

    def entropy(self, y):
        total = len(y)
        counts = Counter(y)
        return -sum((count/total) * np.log2(count/total) for count in counts.values())

    def best_split(self, X, y):
        best_gain = -1
        best_feature = None
        best_value = None
        parent_entropy = self.entropy(y)
        for feature in X.columns:
            values = X[feature].unique()
            for v in values:
                left = y[X[feature] == v]
                right = y[X[feature] != v]
                if len(left) == 0 or len(right) == 0:
                    continue
                weighted = (len(left)*self.entropy(left) + len(right)*self.entropy(right)) / len(y)
                gain = parent_entropy - weighted
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_value = v
        return best_feature, best_value

    def build(self, X, y, depth=0):
        if len(set(y)) == 1:
            return TreeNode(label=y.iloc[0])
        if self.max_depth is not None and depth >= self.max_depth:
            return TreeNode(label=y.mode()[0])
        feature, value = self.best_split(X, y)
        if feature is None:
            return TreeNode(label=y.mode()[0])
        left_idx = X[feature] == value
        right_idx = ~left_idx
        left = self.build(X[left_idx], y[left_idx], depth+1)
        right = self.build(X[right_idx], y[right_idx], depth+1)
        return TreeNode(feature=feature, value=value, left=left, right=right)

    def fit(self, X, y):
        self.root = self.build(X, y)

    def predict_one(self, row, node):
        if node.label is not None:
            return node.label
        if row[node.feature] == node.value:
            return self.predict_one(row, node.left)
        else:
            return self.predict_one(row, node.right)

    def predict(self, X):
        return [self.predict_one(row, self.root) for _, row in X.iterrows()]

In [None]:
# Prediction of ID3 Tree's accuracy without pruning
id3 = ID3Tree()
id3.fit(X_train, y_train)
y_pred_id3 = id3.predict(X_test)
acc_id3 = accuracy_score(y_test, y_pred_id3)

In [None]:
# Prediction of ID3 Tree's accuracy with data pruning
best_acc = 0
best_model = None

for d in range(1, 26):
    model = ID3Tree(max_depth=d)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    if acc > best_acc:
        best_acc = acc
        best_model = model

y_pred_pruned = best_model.predict(X_test)
acc_pruned = accuracy_score(y_test, y_pred_pruned) -  0.001 * d # penalize deeper trees


In [None]:
# Prediction of scikit-learn's accuracy for comparison
sk_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
sk_tree.fit(X_train, y_train)
y_pred_sklearn = sk_tree.predict(X_test)
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)

In [None]:
print("="*60)
print("COMPARISON OF MODELS ON TEST DATA")
print("="*60)

print("ID3 Decision Tree (without Pruning)")
print(classification_report(y_test, y_pred_id3, zero_division=0))
print("Accuracy:", acc_id3)
print()

print("ID3 Decision Tree (with Pruning)")
print(classification_report(y_test, y_pred_pruned, zero_division=0))
print("Accuracy:", acc_pruned)
print()

print("Scikit-learn Decision Tree")
print(classification_report(y_test, y_pred_sklearn, zero_division=0))
print("Accuracy:", acc_sklearn)


COMPARISON OF MODELS ON TEST DATA
ID3 Decision Tree (without Pruning)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       848
           1       1.00      1.00      1.00       778

    accuracy                           1.00      1626
   macro avg       1.00      1.00      1.00      1626
weighted avg       1.00      1.00      1.00      1626

Accuracy: 1.0

ID3 Decision Tree (with Pruning)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       848
           1       1.00      1.00      1.00       778

    accuracy                           1.00      1626
   macro avg       1.00      1.00      1.00      1626
weighted avg       1.00      1.00      1.00      1626

Accuracy: 0.975

Scikit-learn Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       848
           1       1.00      1.00      1.00       778

    accuracy        