# Multilabel Adaboost [Regressor] from scratch

## Import statements

In [1]:
import numpy as np
import pandas as pd

## Load and prepare data [Regression]

https://www.kaggle.com/marklvl/bike-sharing-dataset

In [2]:
reg_df = pd.read_csv("Bike.csv", parse_dates=["dteday"])
reg_df = reg_df.drop(["instant", "casual", "registered"], axis=1)
reg_df = reg_df.rename({"dteday": "date"}, axis=1)

In [3]:
reg_df.head(3)

Unnamed: 0,date,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349


In [4]:
reg_X_df = reg_df.iloc[:, :-1] # Input raw df
reg_y_df = reg_df.iloc[:, -1] # Output raw df

In [5]:
# Derive new features from date column
date_column = reg_X_df.date

reg_X_df["day_of_year"] = date_column.dt.dayofyear
reg_X_df["day_of_month"] = date_column.dt.day
reg_X_df["quarter"] = date_column.dt.quarter
reg_X_df["week"] = date_column.dt.week
reg_X_df["is_month_end"] = date_column.dt.is_month_end
reg_X_df["is_month_start"] = date_column.dt.is_month_start
reg_X_df["is_quarter_end"] = date_column.dt.is_quarter_end
reg_X_df["is_quarter_start"] = date_column.dt.is_quarter_start
reg_X_df["is_year_end"] = date_column.dt.is_year_end
reg_X_df["is_year_start"] = date_column.dt.is_year_start

In [6]:
reg_X_df.head(3)

Unnamed: 0,date,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,...,day_of_year,day_of_month,quarter,week,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,...,1,1,1,52,False,True,False,True,False,True
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,...,2,2,1,52,False,False,False,False,False,False
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,...,3,3,1,1,False,False,False,False,False,False


In [7]:
reg_y_df.head(3)

0     985
1     801
2    1349
Name: cnt, dtype: int64

## Helper functions

In [8]:
def train_test_split(df, test_size=0.8, random_state=None):
    train_df = df.sample(frac=test_size, random_state=random_state)
    test_df = df[~df.index.isin(train_df.index)]
    return train_df.sort_index(), test_df.sort_index()

In [9]:
def accuracy_score_classification(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true)
    return accuracy

In [10]:
def accuracy_score_regression(y_true, y_pred):
    rmse = np.sqrt(np.sum((y_true - y_pred)**2) / len(y_true)) # RMSE
    return rmse

In [11]:
def r2_score(y, y_pred):
    """
    R2 Score
    How much(%) of the total variation in y is explained by variation in x(fitted line)
    """
    mean_y = np.mean(y)
    SE_total_variation = np.sum((y - mean_y)**2) # Unexplained max possible variation in y wrt->Mean
    SE_line_variation = np.sum((y - y_pred)**2) # Unexplained variation in y wrt -> fitted line
    r2_score = 1 - (SE_line_variation / SE_total_variation) # Expalined = 1 - Unexplained
    return r2_score

## Algorithm Classes

- DecisionStump()
- BaseBoostingAlgorithm()
- AdaBoostClassifier()
- AdaBoostRegressor()

In [12]:
class DecisionStump():
    def __init__(self):
        # Feature/Attribute Index to consider for splitting
        self.decision_feature_index = None
        # Exact value from Feature/Attribute to split on
        self.decision_threshold_value = None
        # Stump importance / weight
        self.weight = None
        # Stump error
        self.error = None
        # Left leaf value
        self.left_leaf_value = None
        # Right leaf value
        self.right_leaf_value = None
        # Stump decision compartor 
        self.decision_comparator = None
        print("New Stump Created!")

In [90]:
class BaseBoostingAlgorithm():
    def __init__(self, n_learners):
        self.n_learners = n_learners
        
    def fit(self, X, y):
        # Store all weak learners (Weak learner -> A decsion stump)
        self.learners = []
        # Identify each feature type in input X and store as list
        self.feature_types = self._determine_type_of_feature(X)
        # Concatenate input and output
        self.data = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)
        # Initialize weight for each example as 1/N (where N -> total number of examples)
        self.sample_weight = np.full(len(self.data), np.divide(1, len(self.data)))        
        print(self.feature_types)
        print(self.ml_task)
        #print(self._get_potential_splits(self.data))
        # Iterate and build learners
        for i_boost in range(self.n_learners):
            # Instantiate a new decision stump object
            learner = DecisionStump()
            # Find and Perform split over best feature 
            potential_splits = self._get_potential_splits(self.data)
            split_column_index, split_value, metric = self._determine_best_split(self.data, self.sample_weight, potential_splits, self.ml_task)
            left_node_data, right_node_data = self._split_data(self.data, self.sample_weight, split_column_index, split_value)
            print(f'split_column_index: {split_column_index}, split_value: {split_value}')
            print(f'Change in overall_metric: {metric}')
            # Compute Leaf values
            left_leaf_value = self._create_leaf(left_node_data, self.ml_task)
            right_leaf_value = self._create_leaf(right_node_data, self.ml_task)
            print(f'Left leaf: {left_leaf_value}, Right leaf: {right_leaf_value}')

            # Allocate the instantiated learner with our computed values
            learner.decision_feature_index = split_column_index
            learner.decision_threshold_value = split_value
            learner.left_leaf_value = left_leaf_value
            learner.right_leaf_value = right_leaf_value
            learner.decision_comparator = self.feature_types[split_column_index]
            
            print(self.sample_weight[:5])
            # Boosting step
            self.sample_weight, learner = self.boost(i_boost,
                                              self.data,
                                              self.sample_weight,
                                              learner)
            # Early Termination
            if self.sample_weight is None:
                break
            # Stop boosting since error is 0
            # Stop if the sum of sample weights has become non-positive
            if learner.error == 0 or np.sum(self.sample_weight) <= 0:
                self.learners.append(learner)
                break
            print(f'{i_boost}: Sample weight(sum) [Raw] {np.sum(self.sample_weight)}')
            # Dont perform operations in below conditional block if we are on final learner
            if not i_boost == self.n_learners - 1:
                # Normalize
                self.sample_weight /= np.sum(self.sample_weight)
                
                # Note(Alternative): Sample data(examples) based on sample_weight
                # Construct new data set sample based on sample_weight
#                 self.data = self._sample_data_by_weights(self.data, self.sample_weight)
                # Reinitialize equal sample weights for the new data
#                 self.sample_weight = np.full(len(self.data), (1 / len(self.data)))
            print(f'{i_boost}: Sample weight(sum) [Normalized] {np.sum(self.sample_weight)}')
            # Add this learner to our main list of learners
            self.learners.append(learner)
            print(f'Total stumps: {len(self.learners)}')
            
        return self  
            
    def stump_predict(self, data, learner):
        """
        Computes prediction for the passed data examples w.r.t to the learner(descision stump) 
        """
        preds = []
        feature_column = data[:, learner.decision_feature_index]
        for value in feature_column:
            if learner.decision_comparator == 'categorical':
                if value == learner.decision_threshold_value: # Left node
                    pred = learner.left_leaf_value
                else: # right node
                    pred = learner.right_leaf_value
            else: # continuous
                if value <= learner.decision_threshold_value: # Left node
                    pred = learner.left_leaf_value
                else: # right node
                    pred = learner.right_leaf_value
            preds.append(pred)
        return preds
    

    def _gini_sk(self, data):
        if len(data) <= 0:
            return None
        label_column = data[:, -2]
        data_sample_weight =  data[:, -1]
        _, value_indexes, counts = np.unique(label_column, return_counts=True, return_index=True)

        class_weights = np.array([np.take(data_sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in value_indexes])
        
        cw = np.sum(class_weights**2)
        wn = np.sum(data_sample_weight)**2
        gini = 1.0 - (cw/wn)
        #print(f"cw: {cw}, wn: {wn} ---> 1.0 - (cw/wn) == {1.0} - {cw}/{wn}")
        return gini
    
    def _calculate_weighted_mse(self, data):
        """
        Calculate weighted mean squared error
        """
        if len(data) <= 0:
            return None
        actual_values = data[:, -2]
        data_sample_weight =  data[:, -1]
        
        if len(actual_values) == 0:   # empty data
            mse = 0

        else:
            #prediction = np.mean(actual_values)
            prediction = np.average(actual_values, weights=data_sample_weight)
            # ! Not normalizing using sum of weighted mean, beacuse the sum of weighted mean is 1
            #mse = np.mean((data_sample_weight * (actual_values - prediction))**2)
            #mse = np.average((actual_values - prediction)**2, weights=data_sample_weight)
            mse = np.mean((actual_values - prediction)**2)

        return mse
    
    
    def _mse_sk(self, data):
        """
        Calculate weighted mean squared error
        """
        if len(data) <= 0:
            return None
        
        actual_values = data[:, -2]
        data_sample_weight =  data[:, -1]
        
        if len(actual_values) == 0:   # empty data
            mse = 0

        else:
            weighted_y = data_sample_weight * actual_values
            sum_total = np.sum(weighted_y)
            squared_sum_total = np.sum(weighted_y * actual_values) # w * y * y (or w * y^2)
            weights_total = np.sum(data_sample_weight)
            
            impurity = (squared_sum_total / weights_total) - (sum_total / weights_total)**2
            
#             #prediction = np.mean(actual_values)
#             prediction = np.average(actual_values, weights=data_sample_weight)
#             # ! Not normalizing using sum of weighted mean, beacuse the sum of weighted mean is 1
#             #mse = np.mean((data_sample_weight * (actual_values - prediction))**2)
#             #mse = np.average((actual_values - prediction)**2, weights=data_sample_weight)
#             mse = np.mean((actual_values - prediction)**2)

        return impurity

    def _mse_var(self, data):
        if len(data) <= 0:
            return None
        actual_values = data[:, -2]
        data_sample_weight =  data[:, -1]
        
        if len(actual_values) == 0:   # empty data
            mse = 0
        else:
            mean_prediction = np.average(actual_values, weights=data_sample_weight)
            max_dissipation = np.sum((actual_values - mean_prediction)**2)
            normalizer = (len(data) - 1)
            #print(f"max_dissipation: {max_dissipation}")
            #print(f"normalizer: {normalizer}")
            if normalizer == 0:
                return None
            mse = max_dissipation / normalizer
#             weighted_y = data_sample_weight * actual_values
#             sum_total = np.sum(weighted_y)
#             squared_sum_total = np.sum(weighted_y * actual_values)
#             mse = squared_sum_total / np.sum(data_sample_weight)
#             mse -= (sum_total / np.sum(data_sample_weight))**2
        
        return mse
        
    
    def _calculate_weighted_overall_metric(self, data, left_node_data, right_node_data, metric_function):
        """
        Generalized impurity metric, computes weighted overall
        impurity/error w.r.t left and right nodes
        """
        # Labels
        left_label_column = left_node_data[:, -2]
        right_label_column = right_node_data[:, -2]
        parent_label_column = data[:, -2]
        # Sample weights
        left_sample_weight = left_node_data[:, -1]
        right_sample_weight = right_node_data[:, -1]
        parent_sample_weight = data[:, -1]
        
        if self.ml_task == 'classification':
            _, left_value_indexes, left_counts = np.unique(left_label_column, return_counts=True, return_index=True)
            _, right_value_indexes, right_counts = np.unique(right_label_column, return_counts=True, return_index=True)
            
            aggregated_left_class_weights = np.array([np.take(left_sample_weight, np.where(left_label_column == left_label_column[value_index])[0]).sum() for value_index in left_value_indexes])
            aggregated_right_class_weights = np.array([np.take(right_sample_weight, np.where(right_label_column == right_label_column[value_index])[0]).sum() for value_index in right_value_indexes])
            
            weighted_prob_node_left = np.sum(aggregated_left_class_weights)
            weighted_prob_node_right = np.sum(aggregated_right_class_weights)
            
        else:
            #total_parent_sample_weight = np.sum(np.sum(left_sample_weight), np.sum(right_sample_weight))
            # Weighted probabilities of left and right node
            weighted_prob_node_left = np.sum(left_sample_weight)# / total_parent_sample_weight
            weighted_prob_node_right = np.sum(right_sample_weight)# / total_parent_sample_weight
        
        left_impurity = metric_function(left_node_data)
        right_impurity = metric_function(right_node_data)
        
        if left_impurity != None and right_impurity != None:
            if self.ml_task == 'classification': 
                overall_metric =  weighted_prob_node_left * left_impurity + weighted_prob_node_right * right_impurity
            else:
                overall_metric = left_impurity + right_impurity
            #print(f'weighted_prob_node_left * w_i(left_node_data)): {weighted_prob_node_left} * {left_impurity} = {(weighted_prob_node_left * left_impurity)}')
            #print(f'weighted_prob_node_right * w_i(right_node_data)): {weighted_prob_node_right} * {right_impurity} = {(weighted_prob_node_right * right_impurity)}')
            return overall_metric
        else:
            return None

    # Alternative method to weighted loss: Which works by random sampling of example from a distribution based on sample_weight
    def _sample_data_by_weights(self, data, sample_weight):
        """
        Construct an new input, iteratively sampled over distribution 
        formed by passed sample_weight.

        Note: 
        Learn more about this technique: https://youtu.be/LsK-xG1cLYA (Statquest)
        """
        n_samples, _ = np.shape(data)
        # Intialize array to hold sampled index  
        sampled_indices = []
        # Perform cumulative summation over sample_weight to create buckets
        sample_weight_buckets = np.cumsum(sample_weight)
        # Keeping sampling 'n_samples' times
        for _ in range(n_samples):
            # Generate a random number between 0 and 1
            random_num = np.random.random_sample()
            # Find the bucket Eg: weight buckets [0.33, 0.66, 0.99] and random number = 0.47
            # then index 1 will be selected (since cumsum value is 0.66)
            bucket_index = np.where(sample_weight_buckets > random_num)[0][0]

            sampled_indices.append(bucket_index)
        # finally construct weighted data using sampled_indexes
        weighted_data = data[sampled_indices]

        return weighted_data


    def _get_potential_splits(self, data):
        """
        Get all potential splits for each feature
        Splits can be made on each unique value
        Can essentially make a split at each unique value
        
        """
        potential_splits = {}
        _, n_columns = data.shape
        for column_index in range(n_columns - 1): # excluding the last column which is the label
            values = data[:, column_index]
            unique_values = np.unique(values)

            potential_splits[column_index] = unique_values
            
        return potential_splits
    
    
    def _determine_best_split(self, data, sample_weight, potential_splits, ml_task):
        """
        Iterate over each column_index (as keys) in potential_split (dict)
        Perform split(of examples) over each unique value and evaluate the split
        Identify the best split and return its feature index and value
        """
        # Stitch data with sample_weight towards the end
        data = np.concatenate((data, np.expand_dims(sample_weight, axis=1)), axis=1)
        
        # Best minimum gini index to be updated iteratively
        best_overall_metric = float('inf')
        
        for column_index in potential_splits:
            #print(f"COLUMN {column_index}")
            for value in potential_splits[column_index]:
                #print(f'column_index: {column_index}, value: {value}')
                left_node_data, right_node_data = self._split_data(data, None, split_column_index=column_index, split_value=value)

                if ml_task == "regression":
                    current_overall_metric = self._calculate_weighted_overall_metric(data, left_node_data, right_node_data,
                                                                                     metric_function=self._mse_var)
                else: # classification
                    current_overall_metric = self._calculate_weighted_overall_metric(data, left_node_data, right_node_data,
                                                                                     metric_function=self._gini_sk)

                # If a lower overall_metric is achieved update the index and value with the current
                if current_overall_metric != None and current_overall_metric <= best_overall_metric:
                    best_overall_metric = current_overall_metric
                    best_split_column_index = column_index
                    best_split_value = value
                #print(f'best_overall_metric: {best_overall_metric}')
                #print('---')
            #print(f'Debug [1]: Best: {best_overall_metric}, index: {best_split_column_index}, value: {best_split_value}')
        return best_split_column_index, best_split_value, best_overall_metric
    
    
    def _split_data(self, data, sample_weight, split_column_index, split_value):
        """ 
        Split data(examples) based on best split_column_index and split_value
        estimated using task specific splitting metric.
        """
        # Stich sample_weight to data if passed as an argument or the assumption is, it has already been stiched/appended(in _determine_best_split())
        if sample_weight is not None:
            # Stitch data with sample_weight towards the end (axis=1)
            data = np.concatenate((data, np.expand_dims(sample_weight, axis=1)), axis=1)
        #else it is already appended 
        
        # Get values(from feature column) for the passed split_column index
        split_column_values = data[:, split_column_index]
        
        type_of_feature = self.feature_types[split_column_index]
        if type_of_feature == "continuous":
            left_node_data = data[split_column_values <= split_value]
            right_node_data = data[split_column_values >  split_value]

        # feature is categorical   
        else:
            left_node_data = data[split_column_values == split_value]
            right_node_data = data[split_column_values != split_value]
        return left_node_data, right_node_data
    
    
    def _create_leaf(self, data, ml_task):
        """
        Create leaf node, with leaf value based on ml_task
        for,
        Classfication: consider majority vote
        Regression: consider the mean value
        """
        label_column = data[:, -2]
        sample_weight = data[:, -1]
        
        if ml_task == "regression":
            #leaf = np.mean(label_column)
            #print(label_column)
            leaf = np.average(label_column, weights=sample_weight)

        # classfication    
        else:  
            # Decide leaf value based on sum of weights for each class in the node
            unique_classes, unique_cls_start_indices, counts_unique_classes = np.unique(label_column, return_counts=True, return_index=True)
            unique_class_aggregated_weights = np.array([np.take(sample_weight, np.where(label_column == label_column[value_index])[0]).sum() for value_index in unique_cls_start_indices])
            index = unique_class_aggregated_weights.argmax()
            leaf = unique_classes[index]

        return leaf
    
    
    def _determine_type_of_feature(self, X):
        """
        Determine, if the feature is categorical or continuous
        """
        feature_types = []
        n_unique_values_treshold = 15 # Threshold for a numeric feature to be categorical
        
        n_samples, n_features = np.shape(X)
        
        for feature_i in range(n_features):
            unique_values = np.unique(X[:, feature_i])
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")

        return feature_types
    

In [91]:
class AdaBoostClassifier(BaseBoostingAlgorithm):
    def __init__(self, n_learners=20):
        # Set total number of weak learners
        super().__init__(n_learners)
        self.ml_task = "classification"
        self.classes = None
        self.n_classes = None
        
    def boost(self, i_boost, data, sample_weight, learner):
        """
        Compute learner importance and error, along with boosted weights for each example 
        """
        print(f'Boost Called')
        
        # If its first boost initialize number of classes(n_classes)
        if i_boost == 0:
            self.classes = np.unique(data[:, -1])
            self.n_classes = self.classes.size
            
        # Perform predictions
        preds = self.stump_predict(data, learner)
        
        # Incorrectly classified examples
        incorrect = preds != data[:, -1]

        # Learner Error
        learner_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))
        # Stop if classification is perfect
        if learner_error <= 0:
            learner.weight = 1
            learner.error = 0
            return sample_weight, learner
        print(f'Learner error: {learner_error}')
        
        # Learner weight
        learner_weight = (np.log((1. - learner_error) / (learner_error)) +
                        np.log(self.n_classes - 1.))
        print(f'Learner weight: {learner_weight}')
        
        # Boost sample_weight for each each sample
        # Dont boost sample_weight if we are on final learner
        if not i_boost == self.n_learners - 1:
            # Boost only positive weights
            sample_weight *= np.exp(learner_weight * incorrect *
                                    ((sample_weight > 0) | (learner_weight < 0)))

        # Allocate learner its computed weight and error
        learner.weight = learner_weight
        learner.error = learner_error

        # Finally return sample weights and boosted learner
        return sample_weight, learner
         
    def predict(self, X):
        """
        Predict classes for X.
        """
        print(f'Predict called')
        
        # Get activated matrix for with respect to each learner [get vote of each learner]
        # Add each activated matrix (matrix addition) [get overall vote of all leaners]
        # return the overall matrix
        # Argmax is used over each row of overall matrix to figure our the class
        classes = self.classes[:, np.newaxis]
        pred = sum((self.stump_predict(X, learner) == classes).T * learner.weight
                   for learner in self.learners)
        # Normalize 
        learner_weights = sum(learner.weight for learner in self.learners)
        pred /= learner_weights
        
        # If its binary classification obatin the form [-, +], convienient to select classes with np.take() 
        # Eg(binary): classes =  [[c1], [c2]] and pred = [True, False, True], below output: [[c2], [c1], [c2]]
        if self.n_classes == 2:
            pred[:, 0] *= -1
            pred = pred.sum(axis=1)
            return classes.take(pred > 0, axis=0)
        # Finds index of column with max value, and uses this index to select class from classes

        return classes.take(np.argmax(pred, axis=1), axis=0)
        

In [92]:
class AdaBoostRegressor(BaseBoostingAlgorithm):
    def __init__(self, n_learners=20):
        # Set total number of weak learners
        super().__init__(n_learners)
        self.ml_task = "regression"
        
    def boost(self, i_boost, data, sample_weight, learner):
        """
        Compute learner importance and error, along with boosted weights for each example 
        """
        print(f'Boost Called')

        # Perform predictions
        preds = self.stump_predict(data, learner)
        
        error_vect = np.abs(preds - data[:, -1]) # Absolute residual
        error_max = error_vect.max() # Max error value for current split
        
        # Normalize error vector with max error
        if error_max != 0.:
            error_vect /= error_max
        # Calculate the average loss 
        # sum(Sample_weight * residual) # error value between 0<->1
        learner_error = (sample_weight * error_vect).sum() 
        # Return if error is 0 or less # Nothing to improve
        mse_error = accuracy_score_regression(reg_y_df.values, preds)
        print(f"mse_error: {mse_error}")
        if learner_error <= 0:
            learner.weight = 1
            learner.error = 0
            return sample_weight, learner
        print(f'Learner error: {learner_error}')
        # Eg: Learner error
        # learner_error = 0 (no error-> sum of residuals = 0)
        # learner_error = 0.5 (average error -> (sum of residuals)/(error_max*n))
        # Learner_error = 1 ((sum of residuals) == (error_max*n))
        
        # AdaBoost.R2
        # Eg: Beta
        # beta 1 or greater -> when learner error >= 0.5 and close to 1 (high error)
        # beta less than 1 -> when learner error < 0.5 and close to 0 (low error)
        beta = learner_error / (1. - learner_error)
        print(f'Beta: {beta}')
        # Eg: learner_weight
        # beta 1 or greater -> learner_weight is less/close to 0 (because of high error)
        # beta less than 1 -> learner_weight is more/away from 0 (bcause of low error) 
        learner_weight = np.log(1. / beta)
        print(f'Learner weight: {learner_weight}')
        
        if not i_boost == self.n_learners - 1:
            # raise beta using Normazlize error for each respective example(vector)
            sample_weight = sample_weight * np.power(beta, (1. - error_vect))
        
        # Allocate learner its computed weight and error
        learner.weight = learner_weight
        learner.error = learner_error

        # Finally return sample weights and boosted learner
        return sample_weight, learner
    
    def predict(self, X):
        """
        - Predictions are taken from each learner
        - These Predictions are ordered/sorted according to their magnitude to obtain their sorted indices
        - Also Learner weights are sorted according to previously sorted predictions
          for performing cummulutive sum over previously sorted learner weights for each example is performed
          for these respective rolling sum, weights >=0.5 are marked as true 
        - Index is noted when first 0.5 crossing is obtained, this index is 
          used to select a prediction from our orginal unsorted predictions 
        """
        print(f'Predict called')
        # Get predictions for each each example w.r.t each learner
        preds = np.array([self.stump_predict(X, learner) for learner in self.learners]).T
        
        # sort predictions(given by learners) for each example
        preds_sorted_idx = np.argsort(preds, axis=1)
        
        # Get weights of all the learners 
        learner_weights = np.array([learner.weight for learner in self.learners])
        # Order learner weights according to preds_sorted_idx and perform cumsum()
        weight_cdf = np.cumsum(learner_weights[preds_sorted_idx], axis=1)
        # for each example's prediction weights_cdf, when in crosses 0.5(inc) mark as true
        thresholded_weights = (weight_cdf >= 0.5) * weight_cdf[:, -1][:, np.newaxis]
        median_weight_idx = thresholded_weights.argmax(axis=1)
        
        median_leaners = preds_sorted_idx[np.arange(len(X)), median_weight_idx]
        
        return preds[np.arange(len(X)), median_leaners]
        

 var = \sum_i^n (y_i - y_bar) ** 2
     = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2

In [93]:
# clf = AdaBoostClassifier(20)
# clf = clf.fit(df.iloc[:, :-1].values, df.iloc[:, -1].values)

In [94]:
# predictions = clf.predict(df.iloc[:, :-1].values)
# accuracy_score_classification(df.iloc[:, -1].values, predictions[:, -1])

In [95]:
reg = AdaBoostRegressor(6)
reg = reg.fit(reg_X_df.iloc[:, 1:-1].values, reg_y_df.values)

['categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'categorical', 'continuous', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical']
regression
New Stump Created!
split_column_index: 8, split_value: 0.238461
Change in overall_metric: 3720239.552559671
Left leaf: 1592.833333333333, Right leaf: 4736.581979320478
[0.00136799 0.00136799 0.00136799 0.00136799 0.00136799]
Boost Called
mse_error: 1752.5706950971967
Learner error: 0.29591362726202547
Beta: 0.42028029332723593
Learner weight: 0.8668334252289138
0: Sample weight(sum) [Raw] 0.55394478461488
0: Sample weight(sum) [Normalized] 1.0
Total stumps: 1
New Stump Created!
split_column_index: 8, split_value: 0.238461
Change in overall_metric: 3720751.6701198043
Left leaf: 1611.68314633051, Right leaf: 4724.339580272245
[0.0020688083857496467 0.0021399948610897196 0.0010854887990750

In [88]:
predictions = reg.predict(reg_X_df.iloc[:, :-1].values)

Predict called


In [89]:
accuracy_score_regression(reg_y_df.values, predictions)

1748.6008254818623

In [57]:
r2_score(reg_y_df.values, predictions)

0.1737979154665379