## Question 1

In [3]:
import pandas as pd
import numpy as np

In [4]:
class Node:
    """One vertex of a binary decision tree.

    A node is treated as a leaf by ``predict`` when ``output`` is not None;
    otherwise ``feature``/``threshold`` describe the test and ``left``/``right``
    hold the two child nodes.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, output=None):
        # Split description: which column is tested and against which value.
        self.feature, self.threshold = feature, threshold
        # Children: `left` is followed when the test matches, `right` otherwise.
        self.left, self.right = left, right
        # Prediction stored at a leaf (stays None for internal nodes).
        self.output = output

In [5]:
def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    Parameters
    ----------
    y : sequence or 1-D array of labels
        Any labels that ``np.unique`` can sort (ints, including negative
        ones, or strings), not only the non-negative integers that
        ``np.bincount`` accepts.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes; ``0.0`` for an
        empty or single-class input.
    """
    if len(y) == 0:
        return 0.0

    # np.unique only reports labels that occur, so every count is > 0 and
    # log2 never sees a zero probability.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)

    # Adding 0.0 turns the IEEE "-0.0" produced for a pure label set into 0.0.
    return float(-np.sum(probs * np.log2(probs))) + 0.0

In [6]:
def split_data(x, y, feature, threshold):
    """Partition the rows of ``x``/``y`` by an equality test on one column.

    Rows whose value in column ``feature`` equals ``threshold`` form the
    "left" part; all remaining rows form the "right" part.

    Returns
    -------
    tuple
        ``(x_left, y_left, x_right, y_right)``.
    """
    matches = x[:, feature] == threshold
    others = ~matches

    left_part = (x[matches], y[matches])
    right_part = (x[others], y[others])
    return (*left_part, *right_part)

In [7]:
def choose_best_feature(x, y, criterion='entropy'):
    """Search every (column, value) equality split and return the best one.

    Parameters
    ----------
    x : 2-D array of feature values (indexed as ``x[:, column]``).
    y : labels aligned with the rows of ``x``.
    criterion : ``'entropy'`` selects ``calculate_entropy``; any other value
        selects ``calculate_gini``.

    Returns
    -------
    tuple
        ``(feature, threshold)`` of the split with the largest impurity
        reduction, or ``(-1, None)`` when no split leaves both sides
        non-empty. Ties keep the first candidate encountered.
    """
    # Conditional expression: only the selected helper name is looked up.
    impurity = calculate_entropy if criterion == 'entropy' else calculate_gini

    top_gain, top_feature, top_threshold = -1, -1, None
    parent_impurity = impurity(y)
    n_total = len(y)

    for column in range(x.shape[1]):
        for candidate in np.unique(x[:, column]):
            _, labels_in, _, labels_out = split_data(x, y, column, candidate)
            n_in, n_out = len(labels_in), len(labels_out)

            # A split that sends every row to one side is useless.
            if not (n_in and n_out):
                continue

            # Size-weighted impurity of the two children.
            children = (n_in / n_total) * impurity(labels_in) + (n_out / n_total) * impurity(labels_out)
            gain = parent_impurity - children

            # Strict comparison so the earliest best candidate wins ties.
            if gain > top_gain:
                top_gain, top_feature, top_threshold = gain, column, candidate

    return top_feature, top_threshold

In [17]:
def build_tree(x, y, criterion='entropy'):
    """Recursively grow a binary decision tree from ``x``/``y``.

    Parameters
    ----------
    x : 2-D array of feature values.
    y : 1-D array of labels aligned with the rows of ``x``.
    criterion : forwarded unchanged to ``choose_best_feature``.

    Returns
    -------
    Node
        A leaf (``output`` set) when the labels are pure or no split is
        possible, otherwise an internal node with two sub-trees.

    Raises
    ------
    ValueError
        If ``y`` is empty; there is nothing to predict from.
    """
    if len(y) == 0:
        raise ValueError("cannot build a decision tree from an empty label set")

    labels, counts = np.unique(y, return_counts=True)

    # Pure node: every sample carries the same label.
    if len(labels) == 1:
        return Node(output=y[0])

    best_feature, best_threshold = choose_best_feature(x, y, criterion)

    # No usable split (e.g. identical feature rows with conflicting labels):
    # fall back to the majority class instead of a random sample so that the
    # tree is deterministic. Ties resolve to the smallest label.
    if best_feature == -1:
        return Node(output=labels[np.argmax(counts)])

    x_left, y_left, x_right, y_right = split_data(x, y, best_feature, best_threshold)

    left_node = build_tree(x_left, y_left, criterion)
    right_node = build_tree(x_right, y_right, criterion)

    return Node(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)

In [9]:
def predict(tree, sample):
    """Walk ``tree`` from the root and return the label of the leaf reached.

    At each internal node the value ``sample[node.feature]`` is compared for
    equality with ``node.threshold``: a match descends to ``left``, anything
    else descends to ``right``. A node counts as a leaf as soon as its
    ``output`` is not None.
    """
    node = tree
    while node.output is None:
        matched = sample[node.feature] == node.threshold
        node = node.left if matched else node.right
    return node.output

In [10]:
def encode_features(x):
    """Integer-encode every column of a 2-D feature array.

    Each distinct value in a column gets the code ``0, 1, 2, ...`` in order
    of first appearance, so the mapping is reproducible across runs (unlike
    iteration over a ``set`` of strings).

    Parameters
    ----------
    x : 2-D array-like of hashable feature values. It is not modified.

    Returns
    -------
    tuple
        ``(encoded, encodings)`` where ``encoded`` is a new integer array of
        the same shape as ``x`` and ``encodings[i]`` maps the raw values of
        column ``i`` to their integer codes.
    """
    # Object view keeps the raw Python values (no string coercion of numbers).
    raw = np.asarray(x, dtype=object)
    # Separate integer output: writing codes back into a fixed-width string
    # array would silently store them as '0', '1', ... instead of ints.
    encoded = np.empty(raw.shape, dtype=int)

    encodings = {}
    for i in range(raw.shape[1]):
        column = raw[:, i]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        mapping = {val: idx for idx, val in enumerate(dict.fromkeys(column))}
        encodings[i] = mapping
        encoded[:, i] = [mapping[val] for val in column]

    return encoded, encodings

In [11]:
def classify(sample, encoders, tree):
    """Encode a raw sample with ``encoders`` and predict its label with ``tree``.

    ``encoders[i]`` must map the raw value at position ``i`` of ``sample`` to
    its code; a value missing from the mapping raises ``KeyError``.
    """
    encoded = []
    for position, raw_value in enumerate(sample):
        encoded.append(encoders[position][raw_value])
    return predict(tree, encoded)

In [16]:
# Play-tennis training set: 14 observations, four features and a Yes/No outcome.
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Decision': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}
df = pd.DataFrame(data)

# Everything except the target column is a feature; the target becomes 1 for
# 'Yes' and 0 for anything else.
x = df.drop(columns=['Decision']).values
y = np.array([int(label == 'Yes') for label in df['Decision']])

# Swap raw feature values for per-column integer codes; `encoders` is kept so
# that new samples can be translated the same way.
x, encoders = encode_features(x)

# Tree grown with the entropy criterion.
c45_tree = build_tree(x, y, 'entropy')

# Every value in this sample also occurs in the training columns, so the
# encoder lookup inside classify() succeeds.
sample = ['Sunny', 75, 70, 'Weak']
result_c45 = classify(sample, encoders, c45_tree)

print("C4.5:", "Yes" if result_c45 == 1 else "No")

C4.5: Yes


## Question 2

In [18]:
def calculate_gini(y):
    """Return the Gini impurity of the labels in ``y``.

    Parameters
    ----------
    y : sequence or 1-D array of labels
        Any labels that ``np.unique`` can sort (ints, including negative
        ones, or strings), not only the non-negative integers that
        ``np.bincount`` accepts.

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the observed classes; ``0.0`` for an empty
        or single-class input.
    """
    if len(y) == 0:
        return 0.0

    # Class frequencies; labels that never occur contribute nothing.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return float(1 - np.sum(probs ** 2))

In [19]:
# Re-grow the tree on the same encoded x / y, this time scoring splits with
# Gini impurity, then classify the same sample as before.
cart_tree = build_tree(x, y, 'gini')
result_cart = classify(sample, encoders, cart_tree)

print("CART:", "Yes" if result_cart == 1 else "No")

CART: Yes


## Question 3

In [21]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder

# Loan-approval training set: six applicants, two categorical features.
data = {
    'Income': ['Low', 'Low', 'Medium', 'Medium', 'High', 'High'],
    'Credit': ['Good', 'Bad', 'Good', 'Bad', 'Good', 'Bad'],
    'Loan Approved': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']
}
df = pd.DataFrame(data)

# Features are every column except the target; the target becomes 1 for
# 'Yes' and 0 otherwise.
x = df.drop(columns=['Loan Approved']).values
y = np.array([int(label == 'Yes') for label in df['Loan Approved']])

# Integer-encode the categories and keep the mappings for new samples.
x, encoders = encode_features(x)

# One tree per split criterion, both grown from the same encoded data.
c45_tree = build_tree(x, y, 'entropy')
cart_tree = build_tree(x, y, 'gini')

sample = ['Medium', 'Good']

result_c45 = classify(sample, encoders, c45_tree)
result_cart = classify(sample, encoders, cart_tree)

print("C4.5:", "Yes" if result_c45 == 1 else "No")
print("CART:", "Yes" if result_cart == 1 else "No")

C4.5: Yes
CART: Yes


In [22]:
def sklearn_decision_tree(x, y, criterion='entropy', random_state=0):
    """Fit and return a scikit-learn ``DecisionTreeClassifier``.

    Parameters
    ----------
    x : feature matrix passed to ``fit``.
    y : target vector passed to ``fit``.
    criterion : split-quality measure forwarded to the classifier.
    random_state : seed forwarded to the classifier. A fixed integer makes
        the fitted tree reproducible between runs; pass ``None`` to get
        scikit-learn's unseeded behaviour.

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator.
    """
    clf = DecisionTreeClassifier(criterion=criterion, random_state=random_state)
    clf.fit(x, y)
    return clf

# Label-encode each feature column and the target with scikit-learn; every
# column is fitted independently by fit_transform.
x = df[['Income', 'Credit']].apply(lambda column: LabelEncoder().fit_transform(column)).values
y = LabelEncoder().fit_transform(df['Loan Approved'])

# Same data, two split criteria.
clf_c45 = sklearn_decision_tree(x, y, criterion='entropy')
clf_cart = sklearn_decision_tree(x, y, criterion='gini')

# Row 2 of the training data is the ('Medium', 'Good') applicant; the slice
# keeps the 2-D shape that predict() expects.
result_c45 = clf_c45.predict(x[2:3])
result_cart = clf_cart.predict(x[2:3])

print("scikit-learn C4.5:", "Yes" if result_c45[0] == 1 else "No")
print("scikit-learn CART:", "Yes" if result_cart[0] == 1 else "No")

scikit-learn C4.5: Yes
scikit-learn CART: Yes
