In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [97]:
# fetching data
df = pd.read_csv("/content/drive/MyDrive/ML/Dataset/train.csv")

df["Gender"] = df["Gender"].map({"Male": 1, "Female": 2, np.nan: 1})
df["Married"] = df["Married"].map({"Yes": 1, "No": 0, np.nan: 0})
df["Self_Employed"] = df["Self_Employed"].map({"Yes": 1, "No": 0, np.nan: 0})
df["LoanAmount"] = df["LoanAmount"].fillna(0)
df["Education"] = df["Education"].map({"Graduate": 1, "Not Graduate": 0})
df["Property_Area"] = df["Property_Area"].map({"Rular": 1, "Semiurban": 2, "Urban": 3})
df["Property_Area"] = df["Property_Area"].fillna(1)
df["Dependents"] = df["Dependents"].fillna(0)
df["Loan_Status"] = df["Loan_Status"].map({"Y": 1, "N": 0})
df.drop(columns=["Loan_ID"], inplace=True)
df["Dependents"] = df["Dependents"].replace("3+", 3)
df["Dependents"] = df["Dependents"].astype(int)

df.head()
print(df["Dependents"].unique())
# df.isnull().sum()
(df == "").sum()
print(df.dtypes)

[0 1 2 3]
Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area        float64
Loan_Status            int64
dtype: object


In [98]:
#splitting data
Y = df["Loan_Status"].to_numpy()
X = df.drop(columns=["Loan_Status"]).to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [99]:
# ---- Step 1: Compute Entropy ----
def entropy(y):
    """Computes entropy of labels."""
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs + 1e-9))  # Small epsilon to avoid log(0)

In [100]:
# ---- Step 2: Split Function ----
def split_data(X, y, feature, threshold):
    """Splits dataset into left and right subsets based on feature and threshold."""
    left_mask = X[:, feature] <= threshold
    right_mask = ~left_mask
    return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

In [101]:
# ---- Step 3: Find Best Split ----
def best_split(X, y):
    """Finds the best feature and threshold for splitting."""
    best_gain = 0
    best_feature = None
    best_threshold = None
    parent_entropy = entropy(y)

    for feature in range(X.shape[1]):  # Iterate over each feature
        thresholds = np.unique(X[:, feature])  # Unique values as possible thresholds
        for threshold in thresholds:
            X_left, X_right, y_left, y_right = split_data(X, y, feature, threshold)
            if len(y_left) == 0 or len(y_right) == 0:
                continue  # Skip empty splits

            # Compute weighted entropy after split
            p_left = len(y_left) / len(y)
            p_right = len(y_right) / len(y)
            new_entropy = p_left * entropy(y_left) + p_right * entropy(y_right)

            # Information Gain
            gain = parent_entropy - new_entropy

            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold

In [102]:
# ---- Step 4: Build Decision Tree ----
class Node:
    """Represents a node in the decision tree."""
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value


def build_tree(X, y, depth=0, max_depth=3):
    """Recursively builds the decision tree."""
    if len(np.unique(y)) == 1:  # Pure node (all labels are the same)
        return Node(value=y[0])

    if depth >= max_depth:  # Max depth reached
        return Node(value=np.bincount(y).argmax())  # Majority class

    feature, threshold = best_split(X, y)

    if feature is None:
        return Node(value=np.bincount(y).argmax())

    X_left, X_right, y_left, y_right = split_data(X, y, feature, threshold)
    left_child = build_tree(X_left, y_left, depth + 1, max_depth)
    right_child = build_tree(X_right, y_right, depth + 1, max_depth)

    return Node(feature, threshold, left_child, right_child)

In [103]:
# ---- Step 5: Prediction Function ----
def predict(tree, X):
    """Predicts classes for multiple samples."""
    if X.ndim == 1:  # Single sample case
        if tree.value is not None:
            return tree.value
        return predict(tree.left, X) if X[tree.feature] <= tree.threshold else predict(tree.right, X)

    # Multiple samples case
    return np.array([predict(tree, sample) for sample in X])


In [104]:
# ---- Step 6: Train the Decision Tree ----
tree = build_tree(X_train, Y_train)

In [105]:
# ---- Step 7: Make Predictions ----
prediction = predict(tree, X_test)
print("Accuracy: ", np.mean(prediction == Y_test))

Accuracy:  0.7580645161290323
