In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

from tqdm import trange

---

In [2]:
data = pd.read_csv('California_Houses.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Median_House_Value        20640 non-null  float64
 1   Median_Income             20640 non-null  float64
 2   Median_Age                20640 non-null  int64  
 3   Tot_Rooms                 20640 non-null  int64  
 4   Tot_Bedrooms              20640 non-null  int64  
 5   Population                20640 non-null  int64  
 6   Households                20640 non-null  int64  
 7   Latitude                  20640 non-null  float64
 8   Longitude                 20640 non-null  float64
 9   Distance_to_coast         20640 non-null  float64
 10  Distance_to_LA            20640 non-null  float64
 11  Distance_to_SanDiego      20640 non-null  float64
 12  Distance_to_SanJose       20640 non-null  float64
 13  Distance_to_SanFrancisco  20640 non-null  float64
dtypes: flo

In [3]:
data.sample(3)

Unnamed: 0,Median_House_Value,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco
13047,112500.0,2.6981,35,7088,1279,4885,1272,38.55,-121.28,58964.246479,569262.022494,747735.816857,145353.367917,132579.089667
3114,87200.0,4.8494,5,5735,932,2623,862,35.61,-117.66,193205.093923,181221.898393,325083.083178,424292.162475,488659.520693
13179,292400.0,6.7172,3,9662,1385,2497,856,33.98,-117.76,41206.686745,45297.516932,151185.871182,527754.346326,595745.520381


---

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('Median_House_Value', axis=1), data['Median_House_Value'], test_size=0.2, random_state=2601)

In [5]:
DTR = DecisionTreeRegressor(random_state=2601, max_depth=12)
DTR.fit(X_train, y_train)
mse(y_test, DTR.predict(X_test))

3762068957.3267994

---

In [67]:
class GradientBoostingRegressor:
    def __init__(self,
               n_estimators=100,
               learning_rate=0.1,
               max_depth=3,
               max_features=None,
               random_state=2601):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def _is_fitted(self):
        return len(getattr(self, "estimators_", [])) > 0

    def fit(self, X, y):
        # DTR = DecisionTreeRegressor(random_state=self.random_state, max_depth=self.max_depth, max_features=self.max_features).fit(X, y)
        # self.trees.append(DTR)
        # y_pred = y - DTR.predict(X)
        y_pred = np.full(np.shape(y), np.mean(y, axis=0))

        for _ in trange(self.n_estimators):
            resid = -(y - y_pred)
            DTR = DecisionTreeRegressor(max_depth=self.max_depth).fit(X, resid)
            y_pred -= self.learning_rate * DTR.predict(X)
            self.trees.append(DTR)


    def predict(self, X):

        pred = np.array([])
        for tree in self.trees:
            pred = -tree.predict(X) if not pred.any() else pred - self.learning_rate * tree.predict(X)

        return pred

In [68]:
GBR = GradientBoostingRegressor()
GBR.fit(X_train, y_train)
mse(y_test, GBR.predict(X_test))

100%|██████████| 100/100 [00:04<00:00, 21.60it/s]


51008333325.05446

---

In [32]:
class GradientBoostingRegressor:
    def __init__(self,
               n_estimators=100,
               learning_rate=0.1,
               max_depth=3,
               max_features=None,
               random_state=2601):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.max_features = max_features
        self.init_estimator_ = None
        self.estimators_ = []

    def _is_fitted(self):
        return len(getattr(self, "estimators_", [])) > 0

    def fit(self, X, y):
        self.init_estimator_ = DecisionTreeRegressor(max_depth=self.max_depth, max_features=self.max_features).fit(X, y)
        prev_res = 2 * (y - self.init_estimator_.predict(X))

        for _ in trange(self.n_estimators):
            DTR = DecisionTreeRegressor(max_depth=self.max_depth, max_features=self.max_features).fit(X, prev_res)
            prev_res -= self.learning_rate * 2 * DTR.predict(X)
            self.estimators_.append(DTR)
    
    def predict(self, X):
        prev_pred = self.init_estimator_.predict(X)
        for tree in self.estimators_:
            prev_pred += self.learning_rate * tree.predict(X)
        return prev_pred

In [33]:
GBR = GradientBoostingRegressor()
GBR.fit(X_train, y_train)
mse(y_test, GBR.predict(X_test))

100%|██████████| 100/100 [00:04<00:00, 21.36it/s]


2587864134.35594

---