In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

# Load the play-tennis dataset and separate the predictor columns from the
# class label column.
dataset = pd.read_csv("../dataset/play_tennis.csv")

attributes = ['outlook', 'temp', 'humidity', 'wind']
target_attribute = 'play'

X = dataset[attributes]
y = dataset[target_attribute]

# Shuffle the row positions with a fixed seed so that Restart & Run All
# produces the same folds (and therefore the same reported metrics) each time.
n_splits = 5
random_seed = 42
rng = np.random.default_rng(random_seed)
indices = np.arange(len(X))
rng.shuffle(indices)

# Split the shuffled positions into n_splits nearly equal chunks.
fold_indices = np.array_split(indices, n_splits)

# Keep every (X_train, X_test, y_train, y_test) split instead of discarding all
# but the last one, so each fold remains available for evaluation.
folds = []
for i in range(n_splits):
    test_indices = fold_indices[i]
    # All chunks except the i-th form the training positions for this fold.
    train_indices = np.concatenate(fold_indices[:i] + fold_indices[i+1:])

    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    folds.append((X_train, X_test, y_train, y_test))

# After the loop, X_train / X_test / y_train / y_test refer to the last fold.

In [2]:
def calculate_entropy(labels):
    """Return the base-2 Shannon entropy of a collection of class labels.

    Args:
        labels: A sized iterable (list, pandas Series, ...) holding one
            hashable label per sample.

    Returns:
        The entropy in bits. An empty collection yields ``0``.
    """
    # Tally how many times each distinct label occurs.
    frequency = {}
    for label in labels:
        frequency[label] = frequency.get(label, 0) + 1

    sample_count = len(labels)
    entropy = 0
    for occurrences in frequency.values():
        proportion = occurrences / sample_count
        # Each class contributes -p * log2(p) to the total.
        entropy -= proportion * math.log2(proportion)

    return entropy

In [3]:
def calculate_information_gain(attribute_values, labels):
    """Compute the information gain obtained by splitting on one attribute.

    Values and labels are paired by position, so the inputs may be lists or
    pandas Series with any index (for example a shuffled training fold).

    Args:
        attribute_values: Sized iterable with the attribute value of each sample.
        labels: Sized iterable with the class label of each sample, in the
            same order as ``attribute_values``.

    Returns:
        A tuple ``(information_gain, attribute_value_counts)`` where
        ``attribute_value_counts`` maps each distinct attribute value to the
        number of samples that carry it.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    if len(attribute_values) != len(labels):
        raise ValueError("attribute_values and labels must have the same length")

    # Group the labels by attribute value in a single pass. Using zip avoids
    # integer subscripting, which pandas resolves by index label rather than
    # by position.
    labels_by_value = {}
    for value, label in zip(attribute_values, labels):
        labels_by_value.setdefault(value, []).append(label)

    total_count = len(labels)
    attribute_value_counts = {}
    weighted_entropy = 0
    for value, subset_labels in labels_by_value.items():
        attribute_value_counts[value] = len(subset_labels)
        # Weight each subset's entropy by the share of samples it contains.
        weight = len(subset_labels) / total_count
        weighted_entropy += weight * calculate_entropy(subset_labels)

    information_gain = calculate_entropy(labels) - weighted_entropy

    return information_gain, attribute_value_counts

In [4]:
def build_decision_tree(data, target_attribute, attributes):
    """Recursively build a decision tree by greedy information-gain splitting.

    Args:
        data: pandas DataFrame containing the attribute columns and the
            target column. Any row index is accepted.
        target_attribute: Name of the column holding the class label.
        attributes: List of column names still available for splitting.

    Returns:
        Either a single class label (leaf) or a nested dict of the form
        ``{attribute: {value: subtree_or_label, ...}}``.
    """
    # All samples share one class: this node is a leaf.
    if len(set(data[target_attribute])) == 1:
        return data[target_attribute].iloc[0]

    # Nothing left to split on: fall back to the most frequent class.
    if len(attributes) == 0:
        return data[target_attribute].mode().iloc[0]

    # Hand plain lists to the gain helper so the choice of split does not
    # depend on the DataFrame's row index being 0..n-1.
    target_labels = data[target_attribute].tolist()

    def gain_of(attr):
        return calculate_information_gain(data[attr].tolist(), target_labels)[0]

    best_attribute = max(attributes, key=gain_of)
    # The same reduced attribute list is passed to every child.
    remaining_attributes = [attr for attr in attributes if attr != best_attribute]

    decision_tree = {best_attribute: {}}
    for value in set(data[best_attribute]):
        subset_data = data[data[best_attribute] == value].reset_index(drop=True)
        decision_tree[best_attribute][value] = build_decision_tree(
            subset_data, target_attribute, remaining_attributes
        )

    return decision_tree
# Fit on the training split only. Training on the full dataset would let the
# tree see the rows in X_test and make the reported accuracy meaningless.
# The index is reset because the fold rows carry their original, shuffled labels.
training_data = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
decision_tree = build_decision_tree(training_data, target_attribute, attributes)

In [5]:
def predict(data, decision_tree):
    """Classify one sample by walking the decision tree.

    Args:
        data: Mapping-like sample (dict, pandas Series, ...) that can be
            subscripted by attribute name.
        decision_tree: Either a nested dict of the form
            ``{attribute: {value: subtree_or_label}}`` or a bare class label
            (a tree consisting of a single leaf).

    Returns:
        The predicted class label, or ``None`` when the sample carries an
        attribute value that has no branch in the tree.
    """
    node = decision_tree
    # Descend until a leaf is reached; anything that is not a dict is a label.
    while isinstance(node, dict):
        attribute = next(iter(node))
        branches = node[attribute]
        value = data[attribute]

        if value not in branches:
            # No branch was built for this value, so no prediction is possible.
            return None

        node = branches[value]

    return node

In [6]:
def batch_predict(data, decision_tree):
    """Predict a label for every row of a DataFrame.

    Args:
        data: pandas DataFrame whose rows are the samples to classify.
        decision_tree: Tree passed through unchanged to ``predict``.

    Returns:
        A list with one prediction per row, in row order.
    """
    predictions = []
    for _, row in data.iterrows():
        predictions.append(predict(row, decision_tree))
    return predictions

# Classify every row of the held-out split.
test_predictions = batch_predict(X_test, decision_tree)

# Put the true and predicted labels side by side for inspection.
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': test_predictions})
print(comparison)

# Accuracy is the share of rows whose prediction equals the true label.
is_correct = comparison['Actual'] == comparison['Predicted']
correct_count = sum(is_correct)
accuracy = correct_count / len(comparison)
print(f"Accuracy: {accuracy * 100:.2f}%")

  Actual Predicted
8    Yes       Yes
1     No        No
Accuracy: 100.00%


In [7]:
from sklearn.metrics import confusion_matrix

# Rows are true labels and columns are predicted labels, both in the order
# given by `labels`. Fixing the order keeps the matrix 2x2 even when the test
# fold contains a single class, and makes 'Yes' the positive class.
cm = confusion_matrix(y_test, test_predictions, labels=['No', 'Yes'])

# With labels ordered [negative, positive] the flattened matrix reads
# tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
accuracy = (tp+tn)/(tp+fp+fn+tn)
print(accuracy)

1.0
