In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

class RandomForestClassifier:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            bootstrap_indices = np.random.choice(len(X), size=len(X), replace=True)
            X_sampled = X.iloc[bootstrap_indices]
            y_sampled = y.iloc[bootstrap_indices]

            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=42
            )
            tree.fit(X_sampled, y_sampled)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.round(np.mean(predictions, axis=0))

df = pd.read_csv('../dataset/diabetes.csv')
X = df.drop(columns=['Outcome'], axis=1)
y = df['Outcome']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
n_splits = 6
indices = np.arange(len(X))
np.random.shuffle(indices)

fold_indices = np.array_split(indices, n_splits)


for i in range(n_splits):
    test_indices = fold_indices[i]
    train_indices = np.concatenate(fold_indices[:i] + fold_indices[i+1:])

    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

n_trees = 100

random_forest = RandomForestClassifier(n_estimators=n_trees)

random_forest.fit(X_train, y_train)

rf_predictions = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, rf_predictions)
print("Accuracy:", accuracy)

Accuracy: 0.796875


In [2]:
cm = confusion_matrix(y_test, rf_predictions)
tp = cm[0,0]
fp = cm[0,1]
fn = cm[1,0]
tn = cm[1,1]
accuracy = (tp+tn)/(tp+fp+fn+tn)
print(accuracy)

0.796875
