In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import fetch_california_housing

In [8]:
class GradientBoostingMachine:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    def _negative_gradient(self, y, y_pred):
        return y - y_pred

    def fit(self, X, y):
        self.trees = []
        y_pred = np.full(y.shape, np.mean(y))  # Initial prediction is the mean of y
        
        for _ in range(self.n_estimators):
            residuals = self._negative_gradient(y, y_pred)
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.trees.append(tree)
            y_pred += self.learning_rate * tree.predict(X)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0]) + np.mean(y)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred


In [17]:
# Load dataset
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target)
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [10]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train GBM model
model = GradientBoostingMachine(n_estimators=100, learning_rate=0.1, max_depth=3)
model.fit(X_train.values, y_train.values)

In [14]:
# Predictions
y_pred = model.predict(X_test.values)
y_pred

array([0.50179884, 1.08995724, 4.24232079, ..., 4.67842418, 0.8499066 ,
       1.95936342])

In [13]:
# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.294089415153173
