In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

from tree import DecisionTreeRegressor

import warnings
warnings.simplefilter('ignore')

In [3]:
def grad_mse(y, preds):
    m = len(y)
    return (1 / m) * (y - preds)
    
def grad_cross_entropy(y, preds):
    m = len(y)
    
    preds = np.clip(preds, 1e-15, 1 - 1e-15)
    return - (y / preds) + (1 - y) / (1 - preds)
    
    # return (1 / m) * y * np.log(preds) + (1 - y) * np.log(preds)

loss_functions = {'mse': grad_mse, 'cross_entropy': grad_cross_entropy}

In [4]:
class GradientBoostingBase:
            
    def __init__(self, loss='mse', 
                 learning_rate=0.1, 
                 n_estimators=100, 
                 min_samples_split=2, 
                 max_depth=5):
        
        self.loss_fn = loss_functions[loss]
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
        self.trees = [DecisionTreeRegressor(min_samples_split=self.min_samples_split, 
                                            max_depth=self.max_depth) for _ in range(self.n_estimators)]
    
    def fit(self, X, y):

        self.base_preds = np.full(shape=y.shape, fill_value=np.mean(y, axis=0))
        preds = np.full(shape=y.shape, fill_value=np.mean(y))
        
        for tree in self.trees:
            grads = self.loss_fn(y, preds)
            tree.fit(X, grads)
            new_preds = tree.predict(X)
            update = self.learning_rate * new_preds
            preds = preds + update
        
    def predict(self, X):

        preds = self.base_preds
        for i, tree in enumerate(self.trees):
            preds = preds + (self.learning_rate * tree.predict(X))
            
        return preds
    
    
class GradientBoostingRegressor(GradientBoostingBase):
    
    def __init__(self, loss='mse', learning_rate=0.1, 
                 n_estimators=100, min_samples_split=2, 
                 max_depth=5):
        super(GradientBoostingRegressor, self).__init__(loss=loss, learning_rate=learning_rate, 
                                                        n_estimators=n_estimators, 
                                                        min_samples_split=min_samples_split, 
                                                        max_depth=max_depth)
                                    
class GradientBoostingClassifier(GradientBoostingBase):
    
    def __init__(self, loss='cross_entropy', learning_rate=0.1, 
                 n_estimators=100, min_samples_split=2, 
                 max_depth=5):
        super(GradientBoostingClassifier, self).__init__(loss=loss, learning_rate=learning_rate, 
                                                        n_estimators=n_estimators, 
                                                        min_samples_split=min_samples_split, 
                                                        max_depth=max_depth)
    
    def predict(self, X):

        preds = super(GradientBoostingClassifier, self).predict(X)
        exp = np.exp(preds - np.max(preds))
        preds = exp / np.sum(exp)
        return preds 

In [5]:
filename = "data/ex1data1.txt"
data = np.loadtxt(filename, delimiter=',', usecols=(0, 1), unpack=True)

X = np.transpose(np.array(data[:-1]))
y = np.transpose(np.array(data[-1:]))

In [6]:
gbr = GradientBoostingRegressor(n_estimators=3, min_samples_split=5, max_depth=5)

gbr.fit(X, y)
preds = gbr.predict(X)
print("MSE: ", np.mean((preds - y.ravel()) ** 2))

MSE:  29.912485243248675


In [7]:
# Sklearn way
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=3, min_samples_split=5, max_depth=5)
gbr.fit(X, y)
preds = gbr.predict(X)
print("MSE: ", np.mean((preds - y.ravel()) ** 2))

MSE:  18.435584703264723


In [8]:
# Load iris data.
data = load_iris()
dataset = {feature: data.data[:, i] for i, feature in enumerate(data.feature_names)}
dataset.update({'target': data.target})

df = pd.DataFrame(dataset)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [9]:
# GBMClassifier doesn't work right now.

gbc = GradientBoostingClassifier(n_estimators=3, min_samples_split=5, max_depth=5)
gbc.fit(X.values, y.values)
preds = gbc.predict(X.values)
print((np.round(preds) == y).mean())

0.3333333333333333


In [10]:
# Sklearn way

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=3, min_samples_split=5, max_depth=5)
gbc.fit(X, y)
preds = gbc.predict(X)
print((preds == y).mean())

0.9866666666666667
