In [23]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
# from sklearn.tree import HellingerDecisionTree  #not working for me !
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification


In [24]:
## -->> I dont know if it is correct or not from part A ! 
class HellingerDecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        
    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)
        
    def predict(self, X):
        y_pred = []
        for x in X:
            y_pred.append(self._predict_tree(x, self.tree))
        return np.array(y_pred)
        
    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        
        if depth == self.max_depth or n_samples == 1:
            return np.round(y.mean())
        
        feature_idxs = np.random.choice(n_features, size=int(np.sqrt(n_features)), replace=False)
        
        best_feature, best_threshold = self._best_criteria(X, y, feature_idxs)
        left_idxs, right_idxs = self._split(X[:, best_feature], best_threshold)
        
        left = self._build_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._build_tree(X[right_idxs, :], y[right_idxs], depth+1)
        
        return (best_feature, best_threshold, left, right)
    
    def _best_criteria(self, X, y, feature_idxs):
        best_gain = -1
        split_idx, split_threshold = None, None
        for feature_idx in feature_idxs:
            X_feature = X[:, feature_idx]
            thresholds = np.unique(X_feature)
            for threshold in thresholds:
                gain = self._information_gain(y, X_feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_idx
                    split_threshold = threshold
                    
        return split_idx, split_threshold
        
    def _information_gain(self, y, X_feature, threshold):
        parent_entropy = self._entropy(y)
        left_idxs, right_idxs = self._split(X_feature, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        entropy_l, entropy_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * entropy_l + (n_r / n) * entropy_r
        return parent_entropy - child_entropy
    
    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        entropy = sum(probabilities * -np.log2(probabilities))
        return entropy
    
    def _split(self, X_feature, threshold):
        left_idxs = np.argwhere(X_feature <= threshold).flatten()
        right_idxs = np.argwhere(X_feature > threshold).flatten()
        return left_idxs, right_idxs
    
    def _predict_tree(self, x, tree):
        if isinstance(tree, np.float64):
            return tree
        feature, threshold, left, right = tree
        if x[feature] <= threshold:
            return self._predict_tree(x, left)
        else:
            return self._predict_tree(x, right)


In [25]:
# some other implemention of HDDT : not sure if its true ! 

import numpy as np

class Node:
    """
    A class to represent a node in the decision tree.
    """
    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        self.feature_idx = feature_idx
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value


class HellingerDistanceDecisionTree:
    """
    A decision tree based on Hellinger distance.
    """
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, X, y):
        """
        Fit the decision tree on the given training data.
        """
        self.root = self._build_tree(X, y)

    def predict(self, X):
        """
        Predict the labels of the given data using the trained tree.
        """
        predictions = []
        for i in range(X.shape[0]):
            node = self.root
            while node.left:
                if X[i, node.feature_idx] < node.threshold:
                    node = node.left
                else:
                    node = node.right
            predictions.append(node.value)
        return predictions

    def _build_tree(self, X, y, depth=0):
        """
        Build the decision tree recursively.
        """
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Check for termination conditions
        if (self.max_depth is not None and depth >= self.max_depth) \
                or n_samples < self.min_samples_split \
                or len(np.unique(y)) == 1:
            leaf_value = self._leaf_value(y)
            return Node(value=leaf_value)

        # Find the best split
        best_feature_idx, best_threshold = self._best_split(X, y, n_samples, n_features, n_classes)

        # Check for termination conditions after finding the best split
        if best_feature_idx is None or best_threshold is None:
            leaf_value = self._leaf_value(y)
            return Node(value=leaf_value)

        left_idxs = X[:, best_feature_idx] < best_threshold
        right_idxs = ~left_idxs

        # Recursively build the left and right subtrees
        left = self._build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._build_tree(X[right_idxs], y[right_idxs], depth + 1)

        return Node(best_feature_idx, best_threshold, left, right)

    def _best_split(self, X, y, n_samples, n_features, n_classes):
        """
        Find the best feature and threshold to split the data.
        """
        best_gain = -np.inf
        best_feature_idx = None
        best_threshold = None

        # Calculate the Hellinger distance for the parent node
        parent_value = self._leaf_value(y)
        parent_score = self._hddt_criterion(y, parent_value, n_classes)

        for feature_idx in range(n_features):
            thresholds = np.unique(X[:, feature_idx])

            for threshold in thresholds:
                left_idxs = X[:, feature_idx] < threshold
                n_left = np.sum(left_idxs)
                n_right = n_samples - n_left

                if n_left >= self.min_samples_leaf and n_right >= self.min_samples_leaf:
                    left_value = self._leaf_value(y[left_idxs])
                    right_value = self._leaf_value(y[~left_idxs])
                    gain = self._information_gain(parent_score, left_value, right_value)

                    if gain > best_gain:
                        best_gain = gain
                        best_feature_idx = feature_idx
                        best_threshold = threshold

        return best_feature_idx, best_threshold
    
    def _hddt_criterion(self, y, value, n_classes):
        """
        Compute the Hellinger distance-based decision tree criterion.
        """
        # Compute the histogram of classes in the data
        hist = np.histogram(y, bins=n_classes, range=(0, n_classes))[0]

        # Compute the probabilities of each class in the data
        p = hist / len(y)

        # Compute the histogram of classes in the left and right subsets
        left_hist = np.histogram(value[0], bins=n_classes, range=(0, n_classes))[0]
        right_hist = np.histogram(value[1], bins=n_classes, range=(0, n_classes))[0]

        # Compute the probabilities of each class in the left and right subsets
        left_p = left_hist / len(value[0])
        right_p = right_hist / len(value[1])

        # Compute the Hellinger distance between the parent node and its two children
        dist_left = np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(left_p))**2))
        dist_right = np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(right_p))**2))

        # Compute the gain in Hellinger distance
        gain = np.abs(dist_left - dist_right)

        return gain

    def _leaf_value(self, y):
        """
        Compute the value of a leaf node (i.e., the class that appears most frequently in the data).
        """
        hist = np.histogram(y, bins=len(np.unique(y)))[0]
        return np.argmax(hist)




In [26]:
#this part i have to implement !
class BaggingClassifier:
    
    def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.random_state = random_state
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []
        self.classes_ = None
        self.n_features_ = None
        
    def fit(self, X, y):
        rng = np.random.RandomState(self.random_state)
        self.classes_ = np.unique(y)
        self.n_features_ = X.shape[1]
        for i in range(self.n_estimators):
            indices = rng.choice(len(X), int(self.max_samples * len(X)), replace=True)
            features = rng.choice(self.n_features_, int(self.max_features * self.n_features_), replace=False)
            estimator = self.base_estimator.fit(X[indices][:, features], y[indices])
            self.estimators_.append(estimator)
            self.estimators_samples_.append(indices)
            self.estimators_features_.append(features)
    
    def predict(self, X):
        predictions = np.zeros((X.shape[0], self.n_estimators), dtype=int)
        for i, estimator in enumerate(self.estimators_):
            predictions[:, i] = estimator.predict(X[:, self.estimators_features_[i]])
        return np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=predictions)
    
    def predict_proba(self, X):
        probas = np.zeros((X.shape[0], self.n_estimators, len(self.classes_)))
        for i, estimator in enumerate(self.estimators_):
            probas[:, i, :] = estimator.predict_proba(X[:, self.estimators_features_[i]])
        return np.mean(probas, axis=1)
 

In [27]:
# Load the dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, weights=[0.1, 0.9], random_state=42)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define the base learners
# dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=5)
# hddt_clf = HellingerDecisionTree(criterion='entropy', max_depth=5)   -->> ToDo : define HellingerDecisionTree!
# dt_clf = DecisionTreeClassifier()
hddt_clf = HellingerDecisionTree(criterion='entropy', max_depth=5)


# Define the Bagging classifier with the base learners
bagging_hddt_clf = BaggingClassifier(base_estimator=hddt_clf, n_estimators=100, max_samples=0.5, max_features=0.5)
# bagging_hddt_clf = BaggingClassifier(base_estimator=hddt_clf, n_estimators=100, max_samples=0.5, max_features=0.5)

bagging_dt_clf = BaggingClassifier(base_estimator=dt_clf, n_estimators=100, max_samples=0.5, max_features=0.5)

# Train the Bagging classifiers on the training set
bagging_hddt_clf.fit(X_train, y_train)
bagging_dt_clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_hddt = bagging_hddt_clf.predict(X_test)
y_pred_dt = bagging_dt_clf.predict(X_test)



# Evaluate performance metrics
precision = precision_score(y_test, y_pred_hddt)
recall = recall_score(y_test, y_pred_hddt)
f1 = f1_score(y_test, y_pred_hddt)
roc_auc = roc_auc_score(y_test, y_pred_hddt)
confusion = confusion_matrix(y_test, y_pred_hddt)
classification = classification_report(y_test, y_pred_hddt)
accuracy = accuracy_score(y_test, y_pred_hddt)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred_hddt)
g_mean = np.sqrt(recall * (1 - precision))

# Print the performance metrics
print('Performance metrics for Bagging with Hellinger decision trees:')
print('Precision: {:.4f}'.format(precision))
print('Recall: {:.4f}'.format(recall))
print('F1-score: {:.4f}'.format(f1))
print('ROC-AUC score: {:.4f}'.format(roc_auc))
print('Confusion matrix:\n', confusion)
print('Classification report:\n', classification)

TypeError: __init__() got an unexpected keyword argument 'criterion'