In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the pre-cleaned dataset (path is relative to the notebook's working
# directory) and display it.
# NOTE(review): the rendered frame includes an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV — confirm how cleaned.csv was saved.
df = pd.read_csv('cleaned.csv')
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi,cardio
0,50,1,168,62.0,110,80,1,1,0,0,1,21.967120,0
1,55,0,156,85.0,140,90,3,1,0,0,1,34.927679,1
2,51,0,165,64.0,130,70,3,1,0,0,0,23.507805,1
3,48,1,169,82.0,150,100,1,1,0,0,1,28.710479,1
4,47,0,156,56.0,100,60,1,1,0,0,0,23.011177,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66787,57,0,165,80.0,150,80,1,1,0,0,1,29.384757,1
66788,61,0,158,126.0,140,90,2,2,0,0,1,50.472681,1
66789,52,1,183,105.0,180,90,3,1,0,1,0,31.353579,1
66790,61,0,163,72.0,135,80,1,2,0,0,0,27.099251,1


In [8]:
# Separate the target from the predictors.
# 'Unnamed: 0' is the name pandas gives a header-less first column, i.e. a row
# index stored in the CSV.  It says nothing about the sample itself, and as a
# feature it lets the tree split on row position while adding one candidate
# threshold per row to the split search, so it is dropped along with the target.
# errors='ignore' keeps this cell working when that column is not present.
X = df.drop(columns=['cardio', 'Unnamed: 0'], errors='ignore')
y = df['cardio']

In [9]:
def gini_index(y):
    """Return the Gini impurity of the label collection ``y``.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` that belong to class ``k``.  It is 0 when all labels are
    identical.
    """
    _, class_counts = np.unique(y, return_counts=True)
    class_fractions = class_counts / class_counts.sum()
    return 1 - (class_fractions * class_fractions).sum()

In [10]:
def split_dataset(X, y, feature, threshold):
    """Partition ``X`` and ``y`` by comparing one feature with a threshold.

    Rows whose ``feature`` value is ``<= threshold`` form the left part; rows
    whose value is ``> threshold`` form the right part.

    Returns:
        The tuple ``(X_left, y_left, X_right, y_right)``.
    """
    column = X[feature]
    goes_left = column <= threshold
    goes_right = column > threshold
    return X[goes_left], y[goes_left], X[goes_right], y[goes_right]

In [11]:
def gini_split(y_left, y_right):
    """Return the size-weighted Gini impurity of a two-way split.

    Each side's impurity (computed by ``gini_index``) is weighted by the share
    of samples that fell on that side.  If either side is empty, 1 is returned
    instead of a computed impurity.
    """
    n_left, n_right = len(y_left), len(y_right)

    # One-sided split: report the fixed value 1 rather than a real impurity.
    if not (n_left and n_right):
        return 1

    n_total = n_left + n_right
    left_share = n_left / n_total
    right_share = n_right / n_total

    return left_share * gini_index(y_left) + right_share * gini_index(y_right)

In [12]:
def best_split(X, y):
    """Search every feature/threshold pair for the split with the lowest Gini.

    For each column of ``X`` every distinct value is tried as a threshold.
    Labels whose feature value is ``<= threshold`` go left and those whose
    value is ``> threshold`` go right (the same rule ``split_dataset`` uses).
    Candidates that leave one side empty are skipped because they do not
    separate anything.

    Args:
        X: DataFrame of features.
        y: Series of labels sharing ``X``'s index.

    Returns:
        ``(best_feature, best_threshold, best_gini)``.  When no candidate puts
        at least one sample on each side (for example, every column is
        constant), the result is ``(None, None, float('inf'))``.
    """
    best_feature = None
    best_threshold = None
    best_gini = float('inf')

    for feature in X.columns:
        column = X[feature]

        for threshold in np.unique(column):
            # Only the labels are needed to score a candidate, so partition y
            # directly instead of copying both halves of X for every threshold.
            y_left = y[column <= threshold]
            y_right = y[column > threshold]

            # A one-sided "split" would score gini_split's sentinel of 1 and
            # win whenever nothing better exists, which sends the caller into
            # recursion on an empty partition.  Skip it so that "no usable
            # split" is reported as feature=None.
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            gini = gini_split(y_left, y_right)

            if gini < best_gini:
                best_gini = gini
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, best_gini

In [13]:
class Node:
    """A single node of the decision tree.

    A node created with ``value`` set acts as a leaf carrying a prediction;
    otherwise it holds a ``feature``/``threshold`` test together with its
    ``left`` and ``right`` children.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule applied at this node.
        self.feature, self.threshold = feature, threshold
        # Child nodes reached when the rule is satisfied / not satisfied.
        self.left, self.right = left, right
        # Prediction stored on a leaf.
        self.value = value

In [14]:
def build_tree(X, y, depth=0, max_depth=5, min_samples=10):
    """Recursively grow a binary decision tree using Gini-based splits.

    Args:
        X: DataFrame of features.
        y: Series of labels sharing ``X``'s index.
        depth: Depth of the node being built; the root is 0.
        max_depth: A node at this depth or deeper becomes a leaf.
        min_samples: A node with fewer samples than this becomes a leaf.

    Returns:
        The root ``Node`` of the (sub)tree.  Leaves store either the single
        label present in the node or ``y.mode()[0]``, the most frequent label.
    """
    # Pure node: every sample carries the same label.
    if len(np.unique(y)) == 1:
        return Node(value=y.iloc[0])

    # Stopping rules: depth limit reached or too few samples to split further.
    if depth >= max_depth or len(y) < min_samples:
        return Node(value=y.mode()[0])

    feature, threshold, _ = best_split(X, y)

    if feature is None:
        return Node(value=y.mode()[0])

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature, threshold)

    # A split that leaves one child empty separates nothing.  Recursing into
    # the empty side would reach y.mode()[0] on an empty Series, so treat this
    # node as a majority-class leaf instead.
    if len(y_left) == 0 or len(y_right) == 0:
        return Node(value=y.mode()[0])

    left_node = build_tree(X_left, y_left, depth + 1, max_depth, min_samples)
    right_node = build_tree(X_right, y_right, depth + 1, max_depth, min_samples)

    return Node(feature, threshold, left_node, right_node)

In [15]:
def predict_one(node, x):
    """Return the prediction for a single sample ``x``.

    Starting at ``node``, walk down the tree: at each node without a stored
    value, go left when ``x[node.feature] <= node.threshold`` and right
    otherwise.  The value of the first node that has one is returned.
    """
    current = node
    while current.value is None:
        if x[current.feature] <= current.threshold:
            current = current.left
        else:
            current = current.right
    return current.value

In [16]:
def predict(node, X):
    """Predict a label for every row of ``X`` with the tree rooted at ``node``.

    Each row is passed to ``predict_one``; the results are returned by
    ``X.apply(..., axis=1)``.
    """
    def _classify(row):
        return predict_one(node, row)

    return X.apply(_classify, axis=1)

In [17]:
# Positional 80/20 hold-out: the first 80% of rows are used for training and the
# remaining rows for testing.  Rows are taken in file order (no shuffling).
split = int(0.8 * len(X))

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [18]:
# Fit the tree on the training partition, using build_tree's default
# max_depth and min_samples.
tree = build_tree(X_train, y_train)

In [19]:
# Predict a label for every row of the held-out test partition.
y_pred = predict(tree, X_test)

In [20]:
# Accuracy = share of test rows whose predicted label equals the true label.
# Compare the raw arrays so the comparison is positional, not index-aligned.
correct = (y_test.values == y_pred.values).sum()
accuracy = correct / len(y_test)
print("Accuracy:", round(accuracy, 3))

Accuracy: 0.725
