# Gradient Boosting Decision Trees  Classifier

In [3]:
import numpy as np
from DecisionTrees import DecisionTrees
from scipy.special import expit  # Sigmoid function

class GBDTClassifier:

    def __init__(self, n_estimators=100, learning_rate=0.1, min_samples_split=2, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X,y):
        # step 1: convert y to{-1,1}
        y = np.where(y == 0,-1,1)

        # initialize the model with the log-odds of
        p = np.mean(y)
        self.initial_prediction = np.log(p / (1 - p))

        # initialize predictions with the Log-odds value
        y_pred = np.full(X.shape[0], self.initial_prediction)

        # fit trees seqientially to minimize log loss
        for _ in range(self.n_estimators):
            # calculate the gradient 
            gradient = y - expit(y_pred) # y - sigmoid(pred)

            # Fit a tree to the gradient (pseudo-residuals)
            tree = DecisionTrees(min_samples_split=self.min_samples_split, max_depth=self.max_depth, task='regression')
            tree.fit(X, gradient)

            # save the tree
            self.trees.append(tree)

            #update the predictions 
            y_pred += self.learning_rate * tree.predict(X)

    def _predict_proba(self, X):
        # Step 3: make predictions by summing contributions from all trees
        # start with the inital prediction
        y_pred = np.full(X.shape[0], self.initial_prediction)

        # add the contribution from each tree
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)

        # convert raw score to probabilities using the sigmoid function
        proba = expit(y_pred)
        return np.vstack([1 - proba, proba]).T  # Return probabilities for class 0 and class 1
    
    def predict(self, X):
        # predict the class based on the threshold of 0.5 
        proba = self._predict_proba(X)[:,1]
        return np.where(proba >= 0.5, 1, 0)
            

## Example Usage

In [12]:
from random import random
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

data = datasets.load_breast_cancer()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

clf = GBDTClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

acc =  accuracy(y_test, predictions)
print(acc)

0.9210526315789473
