In [5]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [6]:


class AdaBoostUnderSample:
    
    def __init__(self, n_estimators=50, learning_rate=1.0, random_state=42, balance_ratio=0.5):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.balance_ratio = balance_ratio
        self.estimators_ = []
        self.estimator_weights_ = np.zeros(self.n_estimators)
        self.errors_ = np.ones(self.n_estimators)
        
    def fit(self, X, y):
        # separate majority and minority classes
        X_majority = X[y == 0]
        X_minority = X[y == 1]
        y_minority = y[y == 1]
        
        # determine the number of samples to keep in the minority class
        n_minority = int(len(y_minority) * self.balance_ratio)
        
        # balance the minority class by randomly downsampling
        X_minority_balanced, y_minority_balanced = resample(X_minority, y_minority, replace=False, 
                                                            n_samples=n_minority, random_state=self.random_state)
        
        # balance the majority class by randomly upsampling
        n_majority = n_minority
        X_majority_balanced, y_majority_balanced = resample(X_majority, y[y == 0], replace=True, 
                                                            n_samples=n_majority, random_state=self.random_state)
        
        # concatenate the balanced minority and majority samples
        X_balanced = np.concatenate((X_minority_balanced, X_majority_balanced), axis=0)
        y_balanced = np.concatenate((y_minority_balanced, y_majority_balanced), axis=0)
        
        # initialize sample weights
        sample_weights = np.ones(len(y_balanced)) / len(y_balanced)
        
        for t in range(self.n_estimators):
            # train base estimator on weighted samples
            base_estimator = DecisionTreeClassifier(max_depth=1, random_state=self.random_state)
            base_estimator.fit(X_balanced, y_balanced, sample_weight=sample_weights)
            
            # compute error and estimator weight
            y_pred = base_estimator.predict(X_balanced)
            error = np.sum(sample_weights * (y_pred != y_balanced))
            estimator_weight = self.learning_rate * (np.log(1 - error) - np.log(error)) + np.log(1)
            
            # update sample weights
            sample_weights *= np.exp(-estimator_weight * y_balanced * y_pred)
            sample_weights /= np.sum(sample_weights)
            
            # save estimator and weight
            self.estimators_.append(base_estimator)
            self.estimator_weights_[t] = estimator_weight
            self.errors_[t] = error
        
        return self
    
    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for t in range(self.n_estimators):
            y_pred += self.estimator_weights_[t] * self.estimators_[t].predict(X)
        return np.sign(y_pred)
    
    def predict_proba(self, X):
        proba = np.zeros((X.shape[0], 2))
        for t in range(self.n_estimators):
            proba += self.estimator_weights_[t] * self.estimators_[t].predict_proba(X)
        proba /= np.sum(self.estimator_weights_)
        return proba
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))


In [14]:
# generate synthetic dataset
X, y = make_classification(n_samples=10000, n_features=20, n_informative=10, n_redundant=5, random_state=42)

# split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('shape of X_train : ', X_train.shape)
# create AdaBoost classifier with under-sampling
adaboost = AdaBoostUnderSample(n_estimators=100 , learning_rate=0.9 , random_state=40 ,balance_ratio=0.5)

# fit the classifier to the training data
adaboost.fit(X_train, y_train)

# make predictions on the testing data
y_pred = adaboost.predict(X_test)

# evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


shape of X_train :  (8000, 20)
Accuracy: 0.718
