In [1]:
import numpy as np
import pandas as pd

## Load and prepare data [Regression]

https://www.kaggle.com/marklvl/bike-sharing-dataset

In [2]:
reg_df = pd.read_csv("Bike.csv", parse_dates=["dteday"])
reg_df = reg_df.drop(["instant", "casual", "registered"], axis=1)
reg_df = reg_df.rename({"dteday": "date"}, axis=1)

In [3]:
reg_df.head(3)

Unnamed: 0,date,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349


In [4]:
reg_X_df = reg_df.iloc[:, :-1] # Input raw df
reg_y_df = reg_df.iloc[:, -1] # Output raw df

In [5]:
# Derive new features from date column
date_column = reg_X_df.date

reg_X_df["day_of_year"] = date_column.dt.dayofyear
reg_X_df["day_of_month"] = date_column.dt.day
reg_X_df["quarter"] = date_column.dt.quarter
reg_X_df["week"] = date_column.dt.week
reg_X_df["is_month_end"] = date_column.dt.is_month_end
reg_X_df["is_month_start"] = date_column.dt.is_month_start
reg_X_df["is_quarter_end"] = date_column.dt.is_quarter_end
reg_X_df["is_quarter_start"] = date_column.dt.is_quarter_start
reg_X_df["is_year_end"] = date_column.dt.is_year_end
reg_X_df["is_year_start"] = date_column.dt.is_year_start

In [6]:
reg_X_df.head(3)

Unnamed: 0,date,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,...,day_of_year,day_of_month,quarter,week,is_month_end,is_month_start,is_quarter_end,is_quarter_start,is_year_end,is_year_start
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,...,1,1,1,52,False,True,False,True,False,True
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,...,2,2,1,52,False,False,False,False,False,False
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,...,3,3,1,1,False,False,False,False,False,False


In [7]:
reg_y_df.head(3)

0     985
1     801
2    1349
Name: cnt, dtype: int64

In [8]:
data = pd.concat([reg_X_df.iloc[:, 1:], reg_y_df], axis=1)

In [9]:
class DecisionNode():
    def __init__(self):
        self.decision_feature_index = None
        self.decision_feature_value = None 
        self.left_node = None
        self.right_node = None
        self.comparator_type_str = None

In [10]:
class DecisionTree():
    def __init__(self):
        self.tree_depth = 0
        self.leaves_count = 0
        self.feature_types = None
        self.potential_splits = None
        
    def tree_builder(self, data, ml_task, max_depth=5, max_leaves_count=8, min_samples=2, BoostingTypeObject=None, current_depth=0):
        #name = "pratik"
        #self._create_leaf(alg, name)
        if current_depth == 0:
            self.feature_types = self._determine_type_of_features(data)
            self.potential_splits = self._get_potential_splits(data)
            
        # Make leaf or split
        if (self._check_purity(data)) or (len(data) < min_samples) or (current_depth >= max_depth) or (self.leaves_count >= max_leaves_count):
            leaf = self._create_leaf(data, ml_task)
            self.leaves_count += 1
            return leaf
        else:
            # increase current_depth since we are spliting a node
            current_depth += 1
            
            # Perform greedy search and find the best split feature and value
            split_column, split_value = self._determine_best_split(data, self.potential_splits, ml_task)
            data_below, data_above = self._split_data(data, split_column, split_value)
            
            if len(data_below) == 0 or len(data_above) == 0:
                leaf = self._create_leaf(data, ml_task)
                self.leaves_count += 1
                return leaf
            
            type_of_feature = self.feature_types[split_column]
            
            # Recursive call # Splits until limits or leaves reached
            yes_answer = self.tree_builder(data_below, ml_task, max_depth, max_leaves_count, min_samples, BoostingTypeObject, current_depth)
            no_answer = self.tree_builder(data_above, ml_task, max_depth, max_leaves_count, min_samples, BoostingTypeObject, current_depth)
            
            if not isinstance(yes_answer, DecisionNode) and not isinstance(no_answer, DecisionNode):
                yes_answer.sort()
                no_answer.sort()
                if yes_answer == no_answer:
                    # since both the answers are same, reduce increased increased depth by one and also over
                    current_depth -= 1 
                    return yes_answer
                
            if self.tree_depth < current_depth: # Note final tree depth
                self.tree_depth = current_depth
        
            # Instantiate a Decision node to store the output of current recursion
            decision_node = DecisionNode()
            decision_node.decision_feature_index = split_column
            decision_node.decision_feature_value = split_value
            decision_node.comparator_type_str = type_of_feature
            decision_node.left_node = yes_answer
            decision_node.right_node = no_answer
            
            return decision_node

    def _determine_type_of_features(self, X):
        """
        Determine, if the feature is categorical or continuous
        """
        feature_types = []
        n_unique_values_treshold = 15 # Threshold for a numeric feature to be categorical
        
        n_samples, n_features = np.shape(X)
        
        for feature_i in range(n_features):
            unique_values = np.unique(X[:, feature_i])
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")

        return feature_types
    
        
    def _get_potential_splits(self, X):
    
        potential_splits = {}
        _, n_columns = X.shape
        for column_index in range(n_columns - 1):
            values = X[:, column_index]
            unique_values = np.unique(values)

            potential_splits[column_index] = unique_values

        return potential_splits
        
        
    def _calculate_mse(self, data):
        actual_values = data[:, -1]
        if len(actual_values) == 0:   # empty data
            mse = 0

        else:
            prediction = np.mean(actual_values)
            mse = np.mean((actual_values - prediction) **2)

        return mse
    
        
    def _calculate_entropy(self, data):
    
        label_column = data[:, -1]
        _, counts = np.unique(label_column, return_counts=True)

        probabilities = counts / counts.sum()
        entropy = sum(probabilities * -np.log2(probabilities))

        return entropy
    

    def _calculate_overall_metric(self, data_below, data_above, metric_function):
    
        n = len(data_below) + len(data_above)
        p_data_below = len(data_below) / n
        p_data_above = len(data_above) / n

        overall_metric =  (p_data_below * metric_function(data_below) 
                         + p_data_above * metric_function(data_above))

        return overall_metric
    
    
    def _determine_best_split(self, data, potential_splits, ml_task):
    
        # Best minimum gini index to be updated iteratively
        best_overall_metric = float('inf')
        
        for column_index in potential_splits:
            for value in potential_splits[column_index]:
                data_below, data_above = self._split_data(data, split_column=column_index, split_value=value)

                if ml_task == "regression":
                    current_overall_metric = self._calculate_overall_metric(data_below, data_above, metric_function=self._calculate_mse)
                else: # classification
                    current_overall_metric = self._calculate_overall_metric(data_below, data_above, metric_function=self._calculate_entropy)
                    
                # If a lower overall_metric is achieved update the index and value with the current
                if current_overall_metric != None and current_overall_metric <= best_overall_metric:

                    best_overall_metric = current_overall_metric
                    best_split_column = column_index
                    best_split_value = value

        return best_split_column, best_split_value
    
    
    def _split_data(self, data, split_column, split_value):
    
        split_column_values = data[:, split_column]

        type_of_feature = self.feature_types[split_column]
        
        if type_of_feature == "continuous":
            data_below = data[split_column_values <= split_value]
            data_above = data[split_column_values >  split_value]
  
        else: # feature is categorical 
            data_below = data[split_column_values == split_value]
            data_above = data[split_column_values != split_value]

        return data_below, data_above 


    def _check_purity(self, data):
    
        label_column = data[:, -1]
        unique_classes = np.unique(label_column)

        if len(unique_classes) == 1:
            return True
        else:
            return False
        
    
#     def _create_leaf(self, data, ml_task):
        
#         label_column = data[:, -1]
#         if ml_task == "regression":
#             leaf = np.mean(label_column)

#         # classfication    
#         else:
#             unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
#             index = counts_unique_classes.argmax()
#             leaf = unique_classes[index]

#         return leaf

    def _create_leaf(self, data, ml_task):
        
        label_column = data[:, -1]
        leaf = np.unique(label_column).tolist()
#         if ml_task == "regression":
#             leaf = np.mean(label_column)

#         # classfication    
#         else:
#             unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
#             index = counts_unique_classes.argmax()
#             leaf = unique_classes[index]

        return leaf
    
#     def _create_leaf(self, alg, name):#, data):
#         alg._create_leaf(name)

In [11]:
dt = DecisionTree()

In [12]:
tre = dt.tree_builder(data.values, 'regression', max_depth=5, max_leaves_count=8, min_samples=2, BoostingTypeObject=None)

In [44]:
from pprint import pprint

In [13]:
tre.decision_feature_index

7

In [14]:
tre.decision_feature_value

0.43083299999999997

In [32]:
class GradientBoostingRegressor():
    def __init__(self):
        self.decision_trees = []
        
    def _create_leaf(self, name):
        print("YOO HOO!", name)
        
    def fit(self):
        print("fit() called!")
        dt = DecisionTree()
        dt.tree_builder(self)
        
    def predict(self):
        pass
    

In [33]:
gbr = GradientBoostingRegressor()

In [34]:
gbr.fit()

fit() called!
YOO HOO! pratik


In [36]:
hu = np.array([4, 2, 4, 2])

In [37]:
unique_classes, counts_unique_classes = np.unique(hu, return_counts=True)
index = counts_unique_classes.argmax()
leaf = unique_classes[index]

In [38]:
leaf

2

In [39]:
index

0

In [40]:
unique_classes

array([2, 4])

In [44]:
[1, 2] == [2, 1]

False

In [45]:
[1, 2] == [1, 2]

True

In [57]:
[1, 2].sort() == [2, 1, 3].sort()

True

In [64]:
(['a', 'b'].sort()) == (['b', 'a', 'd'].sort())

True

In [55]:
np.all([[1,2],[2,1,1]], axis=0)

[2, 1, 1]

In [65]:
hh = ['a', 'b']

In [58]:
hh.sort()

In [66]:
h1 = ['b', 'a', 'd']

In [69]:
h1.sort()

In [61]:
hh == h1

False

In [67]:
hh

['a', 'b']

In [71]:
h1

['a', 'b', 'd']