In [4]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
import pandas as pd

from sklearn.metrics import mean_squared_error

In [184]:
class GBM():
    
    def __init__(self):
        
        self.X_train = None
        self.y_train = None
        
        self.alpha = None
        
        self.residuals = None
        self.mean = None
        self.trees = []
        
        self.n_trees = None
        
    def set_model(self, X_train, y_train, alpha=0.2, n_trees=100):
        
        self.X_train = X_train
        self.y_train = y_train
        
        self.mean = np.mean(y_train)
        
        self.alpha = alpha
        self.n_trees = n_trees
        
        self.residuals = np.zeros((n_trees, len(y_train)))
        
    
    
    
    def weighted_sum_of_prev_res(self, last_i):
        
        predictions = self.mean
        
        for i in range(last_i):
    
            predictions += alpha * self.trees[i].predict(self.X_train)
        
        return predictions
    
    
    
    def fit(self):
        
        self.residuals[0] = (y_train - self.mean)
        
        for i in range(1, self.n_trees):
            #build tree to predict previous residuals
            self.trees.append(DecisionTreeRegressor(max_depth=1))
            self.trees[i-1].fit(X_train, self.residuals[i-1])
            
            #identify new residuals
            self.residuals[i] = (self.y_train - self.weighted_sum_of_prev_res(i))
    
    def predict(self, X):
        
        predictions = self.mean
        for i in range(self.n_trees-1):
    
            predictions += self.alpha * self.trees[i].predict(X)
        
        return predictions
        
        
        
    
        
        
        
        

In [213]:
gbm = GBM()
gbm.set_model(X_train, y_train, alpha=0.15, n_trees = 200)
gbm.fit()

### Let's see  our GBM in action

In [206]:
#dataset

df = pd.read_csv("Student_Marks.csv")

X = df.drop("Marks", axis=1)
y = df["Marks"]


X_train = X[:70].values
y_train = y[:70].values
X_test = X[70:].values
y_test = y[70:].values




In [214]:

train_compar = np.array([y_train, 
                         gbm.predict(X_train)]).T


train_compar = pd.DataFrame(train_compar, columns=["y_train", "y_train_predict"])

In [215]:
train_compar

Unnamed: 0,y_train,y_train_predict
0,19.202,20.596330
1,7.734,12.064762
2,13.811,16.346403
3,53.018,45.599212
4,55.299,47.390718
...,...,...
65,13.562,16.653540
66,27.569,25.965716
67,6.185,11.183548
68,8.920,12.703731


In [216]:
test_compar = np.array([y_test,
                        gbm.predict(X_test)]).T

test_compar = pd.DataFrame(test_compar, columns=["Y_test", "GBM prediction"])

In [217]:
test_compar

Unnamed: 0,Y_test,GBM prediction
0,16.606,18.34027
1,13.416,14.623623
2,20.398,21.530107
3,7.014,12.164463
4,39.952,34.34497
5,6.217,11.183548
6,36.746,34.596104
7,38.278,33.689027
8,49.544,43.664269
9,6.349,11.183548


In [218]:
from sklearn.metrics import r2_score

print("R^2 score on train set is ", r2_score(y_true=y_train,
                                            y_pred=gbm.predict(X_train)))


print("R^2 score on test set is ", r2_score(y_true=y_test,
                                            y_pred=gbm.predict(X_test)))




R^2 score on train set is  0.9356578714659238
R^2 score on test set is  0.928150444107194
