# AdaBoost [classifier + regressor] from scratch

In [18]:
# Regression Implementation yet to be added

## Import statements

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

## Load and prepare data

In [2]:
# Load the Iris CSV, discard the "Id" column and expose the target
# column "species" under the generic name "label".
df = (
    pd.read_csv("Iris.csv")
    .drop("Id", axis=1)
    .rename(columns={"species": "label"})
)

In [3]:
# Preview the first rows of the prepared frame
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Distinct class labels present in the data
df['label'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [5]:
# Bar chart of how many rows carry each label
df['label'].value_counts().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x238a0b1a518>

## Helper functions

In [6]:
def train_test_split(df, test_size=0.8, random_state=None):
    """
    Randomly partition a DataFrame into two disjoint frames.

    NOTE: despite its name, `test_size` is the fraction of rows sampled
    into the FIRST returned frame (the training set); every row that was
    not sampled ends up in the second frame (the test set).

    Parameters
    ----------
    df : DataFrame to partition.
    test_size : fraction of rows forwarded to `df.sample(frac=...)`.
    random_state : seed forwarded to `df.sample`.

    Returns
    -------
    (train_df, test_df), both sorted by index.
    """
    sampled_rows = df.sample(frac=test_size, random_state=random_state)
    # Rows whose index label was not drawn form the complementary frame
    not_sampled = ~df.index.isin(sampled_rows.index)
    remaining_rows = df[not_sampled]
    return sampled_rows.sort_index(), remaining_rows.sort_index()

In [7]:
def accuracy_score_classification(y_true, y_pred):
    """
    Return the fraction of positions at which `y_pred` equals `y_true`.

    Both arguments are compared element-wise; the number of matches is
    divided by `len(y_true)`.
    """
    matches = y_true == y_pred
    n_correct = np.sum(matches, axis=0)
    return n_correct / len(y_true)

In [8]:
def accuracy_score_regression(y_true, y_pred):
    """
    Return the root mean squared error (RMSE) between targets and predictions.

    Lower is better: this is an error measure, not an accuracy in [0, 1].
    """
    residuals = y_true - y_pred
    sum_of_squares = np.sum(residuals**2)
    return np.sqrt(sum_of_squares / len(y_true))

## Algorithm Classes

- DecisionStump()
- BaseBoostingAlgorithm()
- AdaBoostClassifier()
- AdaBoostRegressor() (not yet implemented; see the note at the top of the notebook)

In [9]:
class DecisionStump():
    """
    Plain record describing one weak learner (a single-split tree).

    Every field starts as None and is filled in by the code that builds
    the stump:

    - decision_feature_index:   index of the feature/attribute to split on
    - decision_threshold_value: exact feature value used for the split
    - weight:                   stump importance / weight
    - error:                    stump error
    - left_leaf_value:          value of the left leaf
    - right_leaf_value:         value of the right leaf
    - decision_comparator:      comparator used by the stump decision
    """

    def __init__(self):
        field_names = (
            "decision_feature_index",
            "decision_threshold_value",
            "weight",
            "error",
            "left_leaf_value",
            "right_leaf_value",
            "decision_comparator",
        )
        for field_name in field_names:
            setattr(self, field_name, None)
        print("New Stump Created!")

In [23]:
class BaseBoostingAlgorithm():
    """
    Training loop and split-search utilities shared by boosting ensembles
    whose weak learners are decision stumps.

    This class neither sets `self.ml_task` nor defines `boost`; both are
    used inside `fit`, so a subclass has to provide
    - `self.ml_task`: "regression" selects the regression code paths, any
      other value is treated as classification;
    - `boost(i_boost, data, sample_weight, learner)` returning
      `(sample_weight, learner)`.
    """

    def __init__(self, n_learners):
        # Maximum number of stumps built by fit()
        self.n_learners = n_learners

    def fit(self, X, y):
        """
        Build up to `n_learners` decision stumps on (X, y).

        X is a 2-D array (one row per example), y a 1-D array of targets of
        the same length. Fitted stumps are stored in `self.learners`.
        Returns self.
        """
        # Store all weak learners (weak learner -> a decision stump)
        self.learners = []
        # Identify each feature type in input X and store as list
        self.feature_types = self._determine_type_of_feature(X)
        # Concatenate input and output: the target becomes the last column
        self.data = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)
        # Initialize weight for each example as 1/N (N -> total number of examples)
        self.sample_weight = np.full(len(self.data), np.divide(1, len(self.data)))
        print(self.feature_types)
        print(self.ml_task)

        # Iterate and build learners
        for i_boost in range(self.n_learners):
            # Instantiate a new decision stump object
            learner = DecisionStump()
            # Find and perform the split over the best feature
            potential_splits = self._get_potential_splits(self.data)

            split_column_index, split_value, metric = self._determine_best_split(
                self.data, self.sample_weight, potential_splits, self.ml_task)
            left_node_data, right_node_data = self._split_data(
                self.data, split_column_index, split_value)
            print(f'split_column_index: {split_column_index}, split_value: {split_value}')
            print(f'Change in overall_metric: {metric}')
            # Compute leaf values
            left_leaf_value = self._create_leaf(left_node_data, self.ml_task)
            right_leaf_value = self._create_leaf(right_node_data, self.ml_task)
            print(f'Left leaf: {left_leaf_value}, Right leaf: {right_leaf_value}')

            # Store the computed values on the learner
            learner.decision_feature_index = split_column_index
            learner.decision_threshold_value = split_value
            learner.left_leaf_value = left_leaf_value
            learner.right_leaf_value = right_leaf_value
            learner.decision_comparator = self.feature_types[split_column_index]

            # Boosting step (implemented by the subclass)
            self.sample_weight, learner = self.boost(i_boost,
                                                     self.data,
                                                     self.sample_weight,
                                                     learner)
            # Early termination: boost() signalled that this learner is unusable
            if self.sample_weight is None:
                break
            # Stop boosting since error is 0, or
            # stop if the sum of sample weights has become non-positive
            if learner.error == 0 or np.sum(self.sample_weight) <= 0:
                self.learners.append(learner)
                break
            print(f'{i_boost}: Sample weight(sum) {np.sum(self.sample_weight)}')
            # No need to normalize after the final learner
            if i_boost != self.n_learners - 1:
                # Normalize so the weights form a distribution again.
                # (Alternative not used here: resample the data with
                # _sample_data_by_weights and reset the weights to 1/N.)
                self.sample_weight /= np.sum(self.sample_weight)

            # Add this learner to our main list of learners
            self.learners.append(learner)
            print(f'Total stumps: {len(self.learners)}')

        return self

    def stump_predict(self, data, learner):
        """
        Compute the prediction of one learner (decision stump) for every row of `data`.

        Returns a list with `learner.left_leaf_value` for rows routed to the
        left node and `learner.right_leaf_value` otherwise. Categorical
        stumps route left on equality with the threshold, all others on `<=`.
        """
        threshold = learner.decision_threshold_value
        is_categorical = learner.decision_comparator == 'categorical'
        preds = []
        for value in data[:, learner.decision_feature_index]:
            if is_categorical:
                goes_left = value == threshold
            else:  # continuous
                goes_left = value <= threshold
            preds.append(learner.left_leaf_value if goes_left else learner.right_leaf_value)
        return preds

    @staticmethod
    def _summed_class_weights(label_column, sample_weight):
        """
        Return `(counts, class_weights)` for the unique labels of `label_column`
        (in `np.unique` order): how often each label occurs and the summed
        sample weight of the rows carrying it.
        """
        _, first_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)
        class_weights = np.array([
            sample_weight[label_column == label_column[first_index]].sum()
            for first_index in first_indexes
        ])
        return counts, class_weights

    def _fast_gini(self, data):
        """
        Weighted gini impurity of a node.

        `data` must carry the label in column -2 and the sample weight in
        column -1. Each class contributes `count * summed_weight`; these
        scores are normalized to sum to one before computing 1 - sum(p^2).
        An empty node yields 1.0.
        """
        counts, class_weights = self._summed_class_weights(data[:, -2], data[:, -1])
        weighted_classes = counts * class_weights
        normalized_weighted_classes = weighted_classes / sum(weighted_classes)
        return 1.0 - sum(normalized_weighted_classes**2)

    def _calculate_weighted_mse(self, data):
        """
        Calculate the weighted squared error of a node around its mean target.

        `data` must carry the target in column -2 and the sample weight in
        column -1. Returns 0 for an empty node.
        """
        actual_values = data[:, -2]
        data_sample_weight = data[:, -1]

        if len(actual_values) == 0:   # empty data
            return 0

        prediction = np.mean(actual_values)
        # Not normalized by the sum of weights (the weights are expected to sum to 1).
        # NOTE(review): the weight is squared together with the residual here;
        # confirm this is intended rather than w * (y - mean)**2.
        return np.mean((data_sample_weight * (actual_values - prediction))**2)

    def _calculate_weighted_gini_index(self, data):
        """
        Calculate weighted gini index (same computation as `_fast_gini`).
        """
        return self._fast_gini(data)

    def _calculate_weighted_overall_metric(self, data, left_node_data, right_node_data, metric_function):
        """
        Generalized impurity metric: combines `metric_function` of the left and
        right node, each scaled by the sample weight that fell into the node.

        All node arrays carry the label in column -2 and the sample weight in
        column -1. `data` (the parent node) is accepted for interface
        compatibility but not needed for the computation.
        """
        left_sample_weight = left_node_data[:, -1]
        right_sample_weight = right_node_data[:, -1]

        if self.ml_task == 'classification':
            # Node weight = sum of the per-class summed sample weights (not normalized)
            _, left_class_weights = self._summed_class_weights(left_node_data[:, -2], left_sample_weight)
            _, right_class_weights = self._summed_class_weights(right_node_data[:, -2], right_sample_weight)
            weighted_prob_node_left = np.sum(left_class_weights)
            weighted_prob_node_right = np.sum(right_class_weights)
        else:
            # Share of the total sample weight that ended up in each node
            total_parent_sample_weight = np.sum(left_sample_weight) + np.sum(right_sample_weight)
            weighted_prob_node_left = np.sum(left_sample_weight) / total_parent_sample_weight
            weighted_prob_node_right = np.sum(right_sample_weight) / total_parent_sample_weight

        overall_metric = (weighted_prob_node_left * metric_function(left_node_data)
                          + weighted_prob_node_right * metric_function(right_node_data))
        return overall_metric

    def _sample_data_by_weights(self, data, sample_weight):
        """
        Construct a new data set with as many rows as `data`, drawn with
        replacement according to the distribution given by `sample_weight`.

        Note:
        Learn more about this technique: https://youtu.be/LsK-xG1cLYA (Statquest)
        """
        n_samples, _ = np.shape(data)
        # Cumulative sums act as the upper edges of one bucket per example
        sample_weight_buckets = np.cumsum(sample_weight)
        last_bucket = len(sample_weight_buckets) - 1
        sampled_indices = []
        for _ in range(n_samples):
            # Generate a random number between 0 and 1
            random_num = np.random.random_sample()
            # Find the bucket. Eg: weight buckets [0.33, 0.66, 0.99] and random
            # number = 0.47 -> index 1 is selected (since its cumsum value is 0.66)
            candidates = np.where(sample_weight_buckets > random_num)[0]
            # The draw can land at/after the last edge when the weights sum to
            # slightly less than 1 (rounding); fall back to the last bucket.
            sampled_indices.append(candidates[0] if candidates.size else last_bucket)

        # Finally construct the weighted data from the sampled indices
        return data[sampled_indices]

    def _get_potential_splits(self, data):
        """
        Get all potential splits for each feature.

        Returns a dict mapping every column index except the last one (the
        label) to the sorted unique values of that column; a split can be
        made at each of them.
        """
        _, n_columns = data.shape
        return {column_index: np.unique(data[:, column_index])
                for column_index in range(n_columns - 1)}

    def _calculate_overall_metric(self, left_node_data, right_node_data, metric_function):
        """
        Unweighted counterpart of `_calculate_weighted_overall_metric`:
        combines `metric_function` of both nodes, each scaled by the
        fraction of rows in the node.
        """
        n = len(left_node_data) + len(right_node_data)
        # Probabilities of left and right node
        prob_node_left = len(left_node_data) / n
        prob_node_right = len(right_node_data) / n

        return (prob_node_left * metric_function(left_node_data)
                + prob_node_right * metric_function(right_node_data))

    def _determine_best_split(self, data, sample_weight, potential_splits, ml_task):
        """
        Iterate over each column index (key) of `potential_splits`, split the
        examples on every candidate value and evaluate the split.

        Returns `(best_column_index, best_value, best_metric)` for the split
        with the lowest metric (first one wins on ties).

        Raises ValueError if no candidate produced a finite, comparable metric
        (e.g. `potential_splits` is empty).
        """
        # Stitch data with sample_weight as the new last column
        data = np.concatenate((data, np.expand_dims(sample_weight, axis=1)), axis=1)

        if ml_task == "regression":
            metric_function = self._calculate_weighted_mse
        else:  # classification
            metric_function = self._fast_gini

        # Best (lowest) metric found so far
        best_overall_metric = float('inf')
        best_split_column_index = None
        best_split_value = None
        found_split = False

        for column_index in potential_splits:
            for value in potential_splits[column_index]:
                print(f'column_index: {column_index}, value: {value}')
                left_node_data, right_node_data = self._split_data(
                    data, split_column_index=column_index, split_value=value)
                current_overall_metric = self._calculate_weighted_overall_metric(
                    data, left_node_data, right_node_data, metric_function=metric_function)

                # If a lower overall_metric is achieved, remember this split
                if current_overall_metric < best_overall_metric:
                    best_overall_metric = current_overall_metric
                    best_split_column_index = column_index
                    best_split_value = value
                    found_split = True

        if not found_split:
            raise ValueError("No valid split found for the given data")
        return best_split_column_index, best_split_value, best_overall_metric

    def _split_data(self, data, split_column_index, split_value):
        """
        Split data (examples) on `split_column_index` at `split_value`.

        Continuous features: left gets rows `<= split_value`, right the rest.
        Other features: left gets rows equal to `split_value`, right the rest.
        The feature kind is looked up in `self.feature_types`.
        """
        split_column_values = data[:, split_column_index]

        if self.feature_types[split_column_index] == "continuous":
            goes_left = split_column_values <= split_value
            goes_right = split_column_values > split_value
        else:  # feature is categorical
            goes_left = split_column_values == split_value
            goes_right = split_column_values != split_value
        return data[goes_left], data[goes_right]

    def _create_leaf(self, data, ml_task):
        """
        Create the leaf value from the last column of `data`:
        Classification: majority vote
        Regression: mean value
        """
        label_column = data[:, -1]
        if ml_task == "regression":
            return np.mean(label_column)

        # classification: most frequent label (first in sorted order on ties)
        unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
        return unique_classes[counts_unique_classes.argmax()]

    def _determine_type_of_feature(self, X):
        """
        Determine, for every column of X, whether the feature is
        "categorical" or "continuous".

        A feature is categorical if its values are strings or if it has at
        most 15 unique values.
        """
        n_unique_values_threshold = 15  # Threshold for a numeric feature to be categorical
        _, n_features = np.shape(X)

        feature_types = []
        for feature_i in range(n_features):
            unique_values = np.unique(X[:, feature_i])
            example_value = unique_values[0]

            if isinstance(example_value, str) or len(unique_values) <= n_unique_values_threshold:
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")

        return feature_types
    

In [24]:
class AdaBoostClassifier(BaseBoostingAlgorithm):
    """
    Multi-class AdaBoost classifier built from decision stumps.

    The learner weight is log((1 - err) / err) + log(n_classes - 1), i.e. the
    SAMME formulation. Training is driven by `BaseBoostingAlgorithm.fit`,
    which calls `boost` once per stump.
    """

    def __init__(self, n_learners=20):
        # Set total number of weak learners
        super().__init__(n_learners)
        self.ml_task = "classification"
        # Sorted unique labels and their number; filled in on the first boost
        self.classes = None
        self.n_classes = None

    def boost(self, i_boost, data, sample_weight, learner):
        """
        Compute learner importance and error, along with the boosted weight
        of each example.

        `data` holds the features followed by the label in the last column;
        `sample_weight` is updated in place.

        Returns `(sample_weight, learner)`. The returned sample weight is
        None when the learner is no better than random guessing
        (error >= 1 - 1/n_classes); the training loop treats that as a
        signal to discard the learner and stop boosting.
        """
        print(f'Boost Called')

        # On the first boost, record the classes and their number
        if i_boost == 0:
            self.classes = np.unique(data[:, -1])
            self.n_classes = self.classes.size

        # Perform predictions
        preds = self.stump_predict(data, learner)

        # Incorrectly classified examples (element-wise comparison)
        incorrect = preds != data[:, -1]

        # Learner error: weighted fraction of misclassified examples
        learner_error = np.average(incorrect, weights=sample_weight, axis=0)
        # Stop if classification is perfect
        if learner_error <= 0:
            learner.weight = 1
            learner.error = 0
            return sample_weight, learner
        print(f'Learner error: {learner_error}')

        # A learner that is not better than random guessing would get a
        # non-positive weight (-inf for an error of 1) and corrupt the vote,
        # so reject it and signal early termination instead.
        if learner_error >= 1.0 - (1.0 / self.n_classes):
            learner.error = learner_error
            return None, learner

        # Learner weight
        learner_weight = (np.log((1 - learner_error) / learner_error) +
                          np.log(self.n_classes - 1))
        print(f'Learner weight: {learner_weight}')

        # Boost sample_weight of each misclassified sample;
        # skipped for the final learner since the weights are not used again
        if i_boost != self.n_learners - 1:
            # Boost only positive weights
            sample_weight *= np.exp(learner_weight * incorrect *
                                    ((sample_weight > 0) | (learner_weight < 0)))

        # Store the computed weight and error on the learner
        learner.weight = learner_weight
        learner.error = learner_error

        # Finally return sample weights and boosted learner
        return sample_weight, learner

    def predict(self, X):
        """
        Predict classes for X.

        Returns an array of shape (n_samples, 1) holding the predicted label
        of each row.

        Raises ValueError if the model holds no fitted learners.
        """
        print(f'Predict called')

        if not getattr(self, "learners", None):
            raise ValueError("AdaBoostClassifier has no fitted learners; call fit() first")

        # Each learner votes for one class per example (one-hot matrix of
        # shape (n_samples, n_classes)); votes are scaled by the learner
        # weight and summed over all learners.
        classes = self.classes[:, np.newaxis]
        pred = sum((self.stump_predict(X, learner) == classes).T * learner.weight
                   for learner in self.learners)
        # Normalize by the total learner weight
        learner_weights = sum(learner.weight for learner in self.learners)
        pred /= learner_weights

        # Binary classification: bring the scores into the form [-, +], so the
        # sign of the row sum selects the class via take().
        # Eg: classes = [[c1], [c2]] and (pred > 0) = [True, False, True]
        # gives [[c2], [c1], [c2]]
        if self.n_classes == 2:
            pred[:, 0] *= -1
            pred = pred.sum(axis=1)
            return classes.take(pred > 0, axis=0)

        # Otherwise pick, per row, the class whose column has the highest score
        return classes.take(np.argmax(pred, axis=1), axis=0)
        

In [25]:
# Classifier limited to at most 4 weak learners (n_learners=4)
clf = AdaBoostClassifier(4)

In [26]:
# Train on the full frame: every column but the last is a feature, the last one is the label
clf = clf.fit(df.iloc[:, :-1].values, df.iloc[:, -1].values)

['continuous', 'continuous', 'continuous', 'continuous']
classification
New Stump Created!
column_index: 0, value: 4.3
weighted_prob_node_left * w_i(left_node_data)): 0.006666666666666667 * 0.0 = 0.0
weighted_prob_node_right * w_i(right_node_data)): 0.9933333333333323 * 0.6665473782369379 = 0.662103729048691
column_index: 0, value: 4.4
weighted_prob_node_left * w_i(left_node_data)): 0.02666666666666667 * 0.0 = 0.0
weighted_prob_node_right * w_i(right_node_data)): 0.9733333333333323 * 0.66472533691266 = 0.6469993279283217
column_index: 0, value: 4.5
weighted_prob_node_left * w_i(left_node_data)): 0.03333333333333333 * 0.0 = 0.0
weighted_prob_node_right * w_i(right_node_data)): 0.9666666666666657 * 0.6636187484960929 = 0.6414981235462225
column_index: 0, value: 4.6
weighted_prob_node_left * w_i(left_node_data)): 0.060000000000000005 * 0.0 = 0.0
weighted_prob_node_right * w_i(right_node_data)): 0.9399999999999991 * 0.6566483766233984 = 0.6172494740259938
column_index: 0, value: 4.7
weight

In [27]:
# Predict on the same rows that were used for training
predictions = clf.predict(df.iloc[:, :-1].values)

Predict called


In [17]:
# Accuracy on the training data (no held-out set is used here);
# predictions has one column, hence the column selection
accuracy_score_classification(df.iloc[:, -1].values, predictions[:, -1])

0.88