In [52]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [53]:
def load_and_preprocess(path, test_size=0.2, random_state=42):
    """Load the passenger CSV and return a train/test split ready for the tree.

    Steps:
      1. Keep only Pclass, Gender, Age, Fare, Embarked and Survived.
      2. Drop every row that has a missing value in those columns.
      3. Replace Age with AgeGroup ('Child' if Age < 16, else 'Adult').
      4. One-hot encode Gender, Embarked and AgeGroup.
      5. Split features / target ('Survived') into train and test parts.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        test_size: Fraction of rows held out for testing (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 42).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    wanted_columns = ['Pclass', 'Gender', 'Age', 'Fare', 'Embarked', 'Survived']
    categorical_columns = ['Gender', 'Embarked', 'AgeGroup']

    raw = pd.read_csv(path)

    # Every step returns a new frame: no in-place mutation of a selection,
    # so there is no SettingWithCopy ambiguity and re-running is idempotent.
    complete_rows = raw[wanted_columns].dropna()
    age_group = np.where(complete_rows['Age'] < 16, 'Child', 'Adult')
    passengers = complete_rows.assign(AgeGroup=age_group).drop(columns='Age')

    encoded = pd.get_dummies(passengers, columns=categorical_columns)

    X = encoded.drop(columns='Survived')
    y = encoded['Survived']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test





In [54]:
def entropy(y):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Labels may be of any sortable type (ints, negative ints, strings, bools);
    they are counted with ``np.unique`` rather than ``np.bincount``, which
    only accepts non-negative integers.

    Args:
        y: Array-like of labels (list, numpy array or pandas Series).

    Returns:
        Entropy as a non-negative float. An empty input or a single-class
        input yields 0.0.
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    if counts.size == 0:
        return 0.0

    probabilities = counts / counts.sum()
    # Every count from np.unique is >= 1, so log2 never sees a zero.
    # Adding 0.0 turns the "-0.0" produced for pure nodes into plain 0.0.
    return float(-np.sum(probabilities * np.log2(probabilities))) + 0.0




In [55]:
def information_gain(X_column, y, threshold):
    """Return the entropy reduction obtained by splitting ``y`` on a threshold.

    Samples with ``X_column < threshold`` go to the left child; all remaining
    samples (including those whose feature value is NaN) go to the right
    child, which matches how ``build_tree`` and ``predict_one`` route rows.

    Args:
        X_column: pandas Series holding one feature's values.
        y: pandas Series of labels, index-aligned with ``X_column``.
        threshold: Split point compared against the feature values.

    Returns:
        Parent entropy minus the size-weighted entropy of the two children.
        Returns 0.0 when either child would be empty.
    """
    goes_left = X_column < threshold
    y_left = y[goes_left]
    y_right = y[~goes_left]

    n_left, n_right = len(y_left), len(y_right)
    # A split that leaves one side empty separates nothing; bail out before
    # doing any entropy work.
    if n_left == 0 or n_right == 0:
        return 0.0

    n_total = len(y)
    weighted_child_entropy = (
        (n_left / n_total) * entropy(y_left)
        + (n_right / n_total) * entropy(y_right)
    )
    return entropy(y) - weighted_child_entropy




In [56]:
def best_split(X, y):
    """Find the (feature, threshold) pair with the highest information gain.

    Rows are split with ``value < threshold``. Only thresholds that leave at
    least one row on each side are considered, so the returned split is never
    degenerate. For each feature that means every distinct non-NaN value
    except the smallest one.

    Args:
        X: pandas DataFrame of features.
        y: pandas Series of labels, index-aligned with ``X``.

    Returns:
        Dict with keys 'feature', 'threshold' and 'info_gain' describing the
        best split, or an empty dict when no feature can separate the rows
        (no columns, or every column holds a single distinct value).
    """
    chosen = {}
    highest_gain = -1  # sentinel: any real gain (>= 0) beats it

    for feature in X.columns:
        column = X[feature]
        # The smallest value used as threshold would leave the left side
        # empty (nothing is strictly below it), so it is skipped.
        candidate_thresholds = np.sort(column.dropna().unique())[1:]

        for threshold in candidate_thresholds:
            gain = information_gain(column, y, threshold)
            # Strict '>' keeps the first split found among equal gains.
            if gain > highest_gain:
                highest_gain = gain
                chosen = {
                    'feature': feature,
                    'threshold': threshold,
                    'info_gain': gain
                }

    return chosen




In [57]:
class Node:
    """A single node of the decision tree.

    Internal nodes carry a split (``feature``, ``threshold``) and two
    children (``left``, ``right``). Leaf nodes carry a predicted label in
    ``value``; a node whose ``value`` is not None is treated as a leaf by
    ``predict_one``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: rows with row[feature] < threshold go left.
        self.feature, self.threshold = feature, threshold
        # Child subtrees for the two outcomes of the split.
        self.left, self.right = left, right
        # Predicted label (leaves only).
        self.value = value

In [58]:
def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a decision tree using information-gain splits.

    Growth of a branch stops, and a leaf holding the majority label is
    returned, when any of these holds:
      * the depth limit has been reached,
      * all labels in the node are identical (pure node),
      * ``best_split`` finds no split,
      * the chosen split would put every row on the same side.

    Args:
        X: pandas DataFrame of features.
        y: pandas Series of labels, index-aligned with ``X``.
        depth: Depth of the node being built (0 for the root).
        max_depth: Maximum depth at which splitting is still allowed.

    Returns:
        The root ``Node`` of the (sub)tree.
    """
    if len(y) == 0:
        # No samples to vote with; keep the historical default label.
        return Node(value=0)

    label_counts = Counter(y)
    majority_label = label_counts.most_common(1)[0][0]

    # '>=' also terminates when called with depth already past the limit.
    if depth >= max_depth or len(label_counts) == 1:
        return Node(value=majority_label)

    split = best_split(X, y)
    if not split:
        return Node(value=majority_label)

    goes_left = X[split['feature']] < split['threshold']
    # Never recurse into an empty partition: it would become a leaf with an
    # arbitrary label instead of this node's majority label.
    if not goes_left.any() or goes_left.all():
        return Node(value=majority_label)

    goes_right = ~goes_left
    left_subtree = build_tree(X[goes_left], y[goes_left], depth + 1, max_depth)
    right_subtree = build_tree(X[goes_right], y[goes_right], depth + 1, max_depth)
    return Node(split['feature'], split['threshold'], left_subtree, right_subtree)


In [59]:
def predict_one(x, node):
    """Return the label the tree rooted at ``node`` assigns to one sample.

    Args:
        x: pandas Series for one row, indexed by feature name.
        node: Root of the (sub)tree to walk.

    Returns:
        The ``value`` stored in the leaf that the sample reaches.
    """
    current = node
    # Walk down until a node carrying a label is reached.
    while current.value is None:
        if x.loc[current.feature] < current.threshold:
            current = current.left
        else:
            current = current.right
    return current.value



In [60]:
def predict(X, tree):
    """Predict a label for every row of ``X`` using the fitted tree.

    Args:
        X: pandas DataFrame of samples, one per row.
        tree: Root node of the decision tree.

    Returns:
        List of predicted labels, in the row order of ``X``.
    """
    predictions = []
    for _row_label, row in X.iterrows():
        predictions.append(predict_one(row, tree))
    return predictions



In [61]:
if __name__ == "__main__":
    # Load the data and obtain the train/test partitions.
    (X_train, X_test,
     y_train, y_test) = load_and_preprocess("/titanic.csv")

    # Fit the tree on the training part, then label the held-out rows.
    tree = build_tree(X=X_train, y=y_train, max_depth=5)
    y_pred = predict(X=X_test, tree=tree)

    # Report overall accuracy followed by per-class metrics.
    print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
    print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7972027972027972
              precision    recall  f1-score   support

           0       0.79      0.88      0.83        80
           1       0.81      0.70      0.75        63

    accuracy                           0.80       143
   macro avg       0.80      0.79      0.79       143
weighted avg       0.80      0.80      0.79       143

