In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

## Load and prepare data [Regression]

https://www.kaggle.com/marklvl/bike-sharing-dataset

In [2]:
reg_df = pd.read_csv("Bike.csv", parse_dates=["dteday"])
reg_df = reg_df.drop(["instant", "casual", "registered"], axis=1)
reg_df = reg_df.rename({"dteday": "date"}, axis=1)

In [3]:
reg_df.head(3)

Unnamed: 0,date,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349


In [4]:
reg_X_df = reg_df.iloc[:, :-1] # Input raw df
reg_y_df = reg_df.iloc[:, -1] # Output raw df

In [5]:
# Derive new features from date column
date_column = reg_X_df.date

reg_X_df["day_of_year"] = date_column.dt.dayofyear
reg_X_df["day_of_month"] = date_column.dt.day
reg_X_df["quarter"] = date_column.dt.quarter
reg_X_df["week"] = date_column.dt.week
reg_X_df["is_month_end"] = date_column.dt.is_month_end
reg_X_df["is_month_start"] = date_column.dt.is_month_start
reg_X_df["is_quarter_end"] = date_column.dt.is_quarter_end
reg_X_df["is_quarter_start"] = date_column.dt.is_quarter_start
reg_X_df["is_year_end"] = date_column.dt.is_year_end
reg_X_df["is_year_start"] = date_column.dt.is_year_start

In [6]:
reg_X_df.head(3)

Unnamed: 0,date,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,...,day_of_year,day_of_month,quarter,week,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,...,1,1,1,52,False,True,False,True,False,True
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,...,2,2,1,52,False,False,False,False,False,False
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,...,3,3,1,1,False,False,False,False,False,False


In [7]:
reg_y_df.head(3)

0     985
1     801
2    1349
Name: cnt, dtype: int64

In [13]:
class DecisionNode():
    def __init__(self):
        self.decision_feature_index = None
        self.decision_feature_value = None 
        self.left_node = None
        self.right_node = None
        self.comparator_type_str = None

In [14]:
class DecisionTree():
    def __init__(self):
        self.tree_depth = 0
        self.leaves_count = 0
        self.feature_types = None
        self.potential_splits = None
        
    def _tree_builder(self, data, ml_task, max_depth, max_leaves_count, min_samples, current_depth=0):
        # Make leaf or split
        if (self._check_purity(data)) or (len(data) < min_samples) or (current_depth >= max_depth) or (self.leaves_count >= max_leaves_count):
            #leaf = self._create_leaf(data, ml_task, previous_estimator) # Gradient boost adaption
            leaf = self._create_leaf(data, ml_task)
            self.leaves_count += 1
            return leaf
        else:
            # increase current_depth since we are spliting a node
            current_depth += 1
            
            # Perform greedy search and find the best split feature and value
            split_column, split_value = self._determine_best_split(data, self.potential_splits, ml_task)
            data_below, data_above = self._split_data(data, split_column, split_value)
            
            if len(data_below) == 0 or len(data_above) == 0:
                #leaf = self._create_leaf(data, ml_task, previous_estimator) # Gradient boost adaption
                leaf = self._create_leaf(data, ml_task)
                self.leaves_count += 1
                return leaf
            
            type_of_feature = self.feature_types[split_column]
            
            # Recursive call # Splits until limits or leaves reached
            yes_answer = self._tree_builder(data_below, ml_task, max_depth, max_leaves_count, min_samples, current_depth)
            no_answer = self._tree_builder(data_above, ml_task, max_depth, max_leaves_count, min_samples, current_depth)
            
            if not isinstance(yes_answer, DecisionNode) and not isinstance(no_answer, DecisionNode):
                yes_answer.sort()
                no_answer.sort()
                if yes_answer == no_answer:
                    # since both the answers are same, reduce increased increased depth by one and also over
                    current_depth -= 1 
                    return yes_answer
                
            if self.tree_depth < current_depth: # Note final tree depth
                self.tree_depth = current_depth
        
            # Instantiate a Decision node to store the output of current recursion
            decision_node = DecisionNode()
            decision_node.decision_feature_index = split_column
            decision_node.decision_feature_value = split_value
            decision_node.comparator_type_str = type_of_feature
            decision_node.left_node = yes_answer
            decision_node.right_node = no_answer
            
            return decision_node
    
    def build_tree(self, X, y, preds, residuals, ml_task, max_depth=5, max_leaves_count=8, min_samples=2):
        self.num_of_features = X.shape[1]
        data = np.c_[X, y, preds, residuals] # The tree is built of residuals , while y and pred are just kep for residual and leaf computation purpose
        self.feature_types = self._determine_type_of_features(data)
        self.potential_splits = self._get_potential_splits(data)
        tree = self._tree_builder(data, ml_task, max_depth, max_leaves_count, min_samples)
        return tree
    
    def _determine_type_of_features(self, X):
        """
        Determine, if the feature is categorical or continuous
        """
        feature_types = []
        n_unique_values_treshold = 15 # Threshold for a numeric feature to be categorical
        
        for feature_i in range(self.num_of_features):
            unique_values = np.unique(X[:, feature_i])
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")

        return feature_types
    
        
    def _get_potential_splits(self, X):
    
        potential_splits = {}
        
        for column_index in range(self.num_of_features):
            values = X[:, column_index]
            unique_values = np.unique(values)

            potential_splits[column_index] = unique_values

        return potential_splits
        
        
    def _calculate_mse(self, data):
        actual_values = data[:, -1]
        if len(actual_values) == 0:   # empty data
            mse = 0

        else:
            prediction = np.mean(actual_values)
            mse = np.mean((actual_values - prediction) **2)

        return mse
    
        
    def _calculate_entropy(self, data):
    
        label_column = data[:, -1]
        _, counts = np.unique(label_column, return_counts=True)

        probabilities = counts / counts.sum()
        entropy = sum(probabilities * -np.log2(probabilities))

        return entropy
    

    def _calculate_overall_metric(self, data_below, data_above, metric_function):
    
        n = len(data_below) + len(data_above)
        p_data_below = len(data_below) / n
        p_data_above = len(data_above) / n

        overall_metric =  (p_data_below * metric_function(data_below) 
                         + p_data_above * metric_function(data_above))

        return overall_metric
    
    
    def _determine_best_split(self, data, potential_splits, ml_task):
    
        # Best minimum gini index to be updated iteratively
        best_overall_metric = float('inf')
        
        for column_index in potential_splits:
            for value in potential_splits[column_index]:
                data_below, data_above = self._split_data(data, split_column=column_index, split_value=value)

                if ml_task == "regression":
                    current_overall_metric = self._calculate_overall_metric(data_below, data_above, metric_function=self._calculate_mse)
                else: # classification
                    current_overall_metric = self._calculate_overall_metric(data_below, data_above, metric_function=self._calculate_entropy)
                    
                # If a lower overall_metric is achieved update the index and value with the current
                if current_overall_metric != None and current_overall_metric <= best_overall_metric:

                    best_overall_metric = current_overall_metric
                    best_split_column = column_index
                    best_split_value = value

        return best_split_column, best_split_value
    
    
    def _split_data(self, data, split_column, split_value):
    
        split_column_values = data[:, split_column]

        type_of_feature = self.feature_types[split_column]
        
        if type_of_feature == "continuous":
            data_below = data[split_column_values <= split_value]
            data_above = data[split_column_values >  split_value]
  
        else: # feature is categorical 
            data_below = data[split_column_values == split_value]
            data_above = data[split_column_values != split_value]

        return data_below, data_above 


    def _check_purity(self, data):
    
        label_column = data[:, -1]
        unique_classes = np.unique(label_column)

        if len(unique_classes) == 1:
            return True
        else:
            return False
        
    
#     def _create_leaf(self, data, ml_task):
        
#         label_column = data[:, -1]
#         if ml_task == "regression":
#             leaf = np.mean(label_column)

#         # classfication    
#         else:
#             unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
#             index = counts_unique_classes.argmax()
#             leaf = unique_classes[index]

#         return leaf

#     #def _create_leaf(self, data, ml_task, previous_estimator):
#     def _create_leaf(self, data, ml_task):#, previous_estimator):
#         #leaf = self.create_leaf(data, ml_task, previous_estimator)
        
#         #return leaf
#         label_column = data[:, -1]
# #        #leaf = np.unique(label_column).tolist()
#         if ml_task == "regression":
#             leaf = np.mean(label_column)

#         # classfication    
#         else:
#             unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
#             index = counts_unique_classes.argmax()
#             leaf = unique_classes[index]

#         return [leaf]


    def _create_leaf(self, data, ml_task):
        leaf = self.create_leaf(data, ml_task)
        return [leaf]
    
    def print_tree(self, tree):
        node_dict = {}
        if not tree:
            return
        
        if isinstance(tree, list):
            return tree
        
        decision_feature_index = tree.decision_feature_index
        decision_feature_value = tree.decision_feature_value
        left_value = tree.left_node
        right_value = tree.right_node
        comparator_type_str = tree.comparator_type_str
        
        comparator_operator_str = ""
        if comparator_type_str == "continuous":
            comparator_operator_str = "<="
        else:
            comparator_operator_str = "=="
            
        yes_answer = self.print_tree(left_value)
        no_answer = self.print_tree(right_value)
    
        key = f'{decision_feature_index} {comparator_operator_str} {decision_feature_value}'
        node_dict[key] = [yes_answer, no_answer]
        
        return node_dict
    
    def calculate_r_squared(self, y_true, y_preds):
        mean = np.mean(np.array(y_true))
        ss_res = sum((y_true - y_preds) ** 2)
        ss_tot = sum((y_true - mean) ** 2)
        r_squared = 1 - ss_res / ss_tot

        return r_squared
    
    def predict_example(self, example, tree):
        question = list(tree.keys())[0]

        feature_index, comparison_operator, value = question.split(" ")
        feature_index = int(feature_index) 

        # ask question
        if comparison_operator == "<=":
            if example[feature_index] <= float(value):
                answer = tree[question][0]
            else:
                answer = tree[question][1]

        # feature is categorical
        else:
            if str(example[feature_index]) == value:
                answer = tree[question][0]
            else:
                answer = tree[question][1]

        # base case
        if not isinstance(answer, dict):
            return answer[0]

        # recursive part
        else:
            residual_tree = answer
            return self.predict_example(example, residual_tree)
        
    def predict(self, X, raw_tree):
        # Generate parseable tree [json]
        tree = self.print_tree(raw_tree)
        #data = np.c_[X, y]

        preds = np.apply_along_axis(func1d=self.predict_example, arr=data, axis=1, tree=tree)

        return preds

In [None]:
class BoostingConstantEstimator():
    def __init__(self, task_type = None):
        self.task_type = task_type
        self.predictions = None
        self.residuals = None
        
        
    def _compute_residuals(self, y, pred):
        return y - pred
    
    
    def _compute_log_odds(self, y):
        y_postives = y.sum()
        y_negatives = y.size - y_postives
        return np.log(y_postives / y_negatives)
    
    
    def _convert_log_odds_to_prob(self, log_odds):
        return np.exp(log_odds) / (1 + np.exp(log_odds))
    
    
    def _compute_constant_predictions(self, y):
        if self.task_type == 'regression':
            # Compute avg value
            pred = np.mean(y)
            self.predictions = np.repeat(pred, len(y)) ##
        else: # self.task_type == 'classification'
            log_pred = self._compute_log_odds(y) # log preds
            pred = self._convert_log_odds_to_prob(log_pred)
            self.predictions = np.repeat(log_pred, len(y)) # log predictions
            self.prob_pred = np.repeat(pred, len(y)) # probability predictions ##
        return pred
    
    
    def boost_cycle(self, y):
        preds = self._compute_constant_predictions(y)
        residuals = self._compute_residuals(y, preds)
        self.residuals = residuals ##
        

In [4]:
class BoostingTreeEstimator(DecisionTree):
    # A boosting estimator can be constant or a tree
    
    def __init__(self, task_type=None): # Note: while task_type can be either [regression, classification]
        self.learning_rate = None
        self.task_type = task_type
        self.predictions = None
        self.residuals = None
        self.tree = None # useable tree [classification]log tree
        
        
    def _compute_residuals(self, y, pred):
        return y - pred
    
    
    def _convert_log_odds_to_prob(self, log_odds):
        return np.exp(log_odds) / (1 + np.exp(log_odds))
    
    
    def create_leaf(self, data, ml_task):
        y_values = data[:, -1]
        if ml_task == "regression":
            leaf = np.mean(y_values)
        else: # Its a 'classification' task
            # Convert and select from prob leaf value to log odds leaf value
            prev_prob_predictions = data[:, -2]
            residuals_sum = np.sum(y_values)
            prob_odds_multiplicative_sum = np.sum(prev_prob_predictions * (1 - prev_prob_predictions))
            leaf = residuals_sum / prob_odds_multiplicative_sum # log odds leaf value
            
        return leaf

    def compute_predictions(self, X, generated_estimators):
        constant_estimator = generated_estimators[0]
        tree_estimators = generated_estimators[1:]
        
        predictions = constant_estimator.predictions
        for tree_estimator in tree_estimators:
            predictions += (self.learning_rate * self.predict(X, tree_estimator))
        
        self.predictions = predictions
        
        if self.task_type = "classification":
            # Convert log predictions to probability predictions
            self.prob_pred = self._convert_log_odds_to_prob(predictions)
            return self.prob_pred
        else: # if its regression
            return self.predictions
                     
        
    def boost_cycle(self, X, y, preds, residuals, ml_task, max_depth, max_leaves_count, min_samples, generated_estimators, learning_rate=0.8):
        # build tree over residuals
        self.learning_rate = learning_rate
        tree = self.tree_builder(X, y, preds, residuals, ml_task, max_depth, max_leaves_count, min_samples)
        updated_generated_estimators = generated_estimators.append(tree)
        preds = self.compute_predictions(X, updated_generated_estimators)
        residuals = self._compute_residuals(y, preds)
        self.residuals = residuals
        

In [23]:
yala = np.array([1, 1, 1])

In [22]:
yolo = np.array([[2, 2, 2], [1, 2, 3], [1, 1, 1]])

In [24]:
base_y = yala

In [25]:
for yo in yolo:
    base_y += yo

In [26]:
base_y

array([5, 6, 7])

In [None]:
#(self, X, y, preds, residuals, ml_task, max_depth=5, max_leaves_count=8, min_samples=2):

In [None]:
#data = pd.concat([reg_X_df.iloc[:, 1:], reg_y_df], axis=1)

In [15]:
pp = np.repeat(111, len(reg_y_df))
rt = np.repeat(222, len(reg_y_df))

In [16]:
dtx = DecisionTree()

In [17]:
trex = dtx.build_tree(reg_X_df.iloc[:, 1:].values, pp, rt, reg_y_df.values, 'regression', max_depth=6, max_leaves_count=32, min_samples=15)

In [19]:
trex.decision_feature_index, trex.decision_feature_value, dtx.tree_depth

(7, 0.43083299999999997, 6)

In [20]:
predictions2 = dtx.predict(reg_X_df.iloc[:, 1:].values, reg_y_df.values, trex)

In [21]:
dtx.calculate_r_squared(reg_y_df.values, predictions2)

0.8943929233165495

In [18]:
xt= np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

In [25]:
yt=np.array([11, 22, 33])

In [26]:
pt=np.array([111, 222, 333])

In [27]:
rt=np.array([1111, 2222, 3333])

In [28]:
np.c_[xt, yt, pt, rt]

array([[   1,    2,    3,   11,  111, 1111],
       [   4,    5,    6,   22,  222, 2222],
       [   7,    8,    9,   33,  333, 3333]])

In [28]:
tyx = np.array([1, 3])

In [30]:
tyx

2.0

In [22]:
ddf = np.array([[1,2], [3, 4]])

In [24]:
len(ddf)

2

In [18]:
ddt = np.array([1])

In [21]:
np.repeat(ddt, 15)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [16]:
def convert_log_odds_to_prob2(log_odds):
    return np.exp(log_odds) / (1 + np.exp(log_odds))

In [17]:
convert_log_odds_to_prob2(dee)

array([0.73105858, 0.73105858, 0.5       , 0.5       , 0.73105858,
       0.5       , 0.73105858, 0.73105858, 0.5       , 0.5       ,
       0.73105858])

In [15]:
np.exp()

2.718281828459045

In [14]:
np.log(1.7)

0.5306282510621704

In [10]:
dee = np.array([1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1])

In [12]:
dee.sum()

6

In [13]:
dee.size - dee.sum()

5

In [11]:
np.count_nonzero(dee == 1)

6

In [12]:
data = pd.concat([reg_X_df.iloc[:, 1:], reg_y_df], axis=1)

In [18]:
dt = DecisionTree()

In [19]:
tre = dt.tree_builder(data.values, 'regression', max_depth=6, max_leaves_count=32, min_samples=15)#, #BoostingTypeObject=None)

In [20]:
tre.decision_feature_index, tre.decision_feature_value, dt.tree_depth

(7, 0.43083299999999997, 6)

In [21]:
# tr = dt.print_tree(tre)

In [22]:
predictions = dt.predict(reg_X_df.iloc[:, 1:].values, reg_y_df.values, tre)

In [23]:
dt.calculate_r_squared(reg_y_df.values, predictions)

0.8943929233165495

In [145]:
class GradientBoostingRegressor():
    def __init__(self):
        self.decision_trees = []
        self.ml_task = "regression"
        self.average_prediction
        
    def _create_leaf(self, name):
        print("YOO HOO!", name)
        
    def fit(self):
        print("fit() called!")
        dt = DecisionTree()
        dt.tree_builder(self)
        
    def predict(self):
        pass
    

In [None]:
class GradientBoostingClassifier():
    def __init__(self):
        self.decision_trees = []
        self.ml_task = "classification"
        

In [33]:
gbr = GradientBoostingRegressor()

In [34]:
gbr.fit()

fit() called!
YOO HOO! pratik


In [29]:
df1 = pd.DataFrame({"A": [1, 2, 3, 4], "B": [11, 22, 33, 44]})

In [30]:
df1

Unnamed: 0,A,B
0,1,11
1,2,22
2,3,33
3,4,44


In [33]:
nd = df1.values
nd

array([[ 1, 11],
       [ 2, 22],
       [ 3, 33],
       [ 4, 44]], dtype=int64)

In [34]:
def fn(ex):
    return ex[0] + ex[1]

In [39]:
np.apply_along_axis(fn, 1, nd)

array([12, 24, 36, 48], dtype=int64)