In [1]:
import pandas as pd

### Loading Dataset

In [2]:
# Read the pre-processed insurance table; the path is resolved relative to
# the directory the notebook kernel is running in.
insurance_data = pd.read_csv('dataset/insurance_processed.csv')

# Preview ten randomly chosen rows (different rows on every run).
insurance_data.sample(n=10)

Unnamed: 0,age,bmi,children,region,charges,sex_female,sex_male,smoker_no,smoker_yes
952,30,28.405,1,1,4527.18295,1,0,1,0
1211,39,34.1,2,2,23563.01618,0,1,1,0
979,36,29.92,0,2,4889.0368,1,0,1,0
476,24,28.5,0,0,35147.52848,0,1,0,1
568,49,31.9,5,3,11552.904,1,0,1,0
482,18,31.35,0,2,1622.1885,1,0,1,0
718,51,36.67,2,1,10848.1343,1,0,1,0
361,35,30.5,1,3,4751.07,0,1,1,0
705,33,32.9,2,3,5375.038,1,0,1,0
153,42,23.37,0,0,19964.7463,1,0,0,1


In [3]:
# Target: the insurance charge we want to predict.
Y = insurance_data['charges']

# Features: every remaining column of the frame.
# NOTE(review): the preview above lists an 'Unnamed: 0' header; if that is a
# real column (a saved row index) rather than the frame's index, it is being
# fed to the model as a feature -- confirm against the CSV.
X = insurance_data.drop(columns='charges')

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same train/test partition; without it
# every run shuffles differently and the scores below cannot be compared
# between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

##### Fit a DecisionTreeRegressor to the training set

In [5]:
from sklearn.tree import DecisionTreeRegressor

In [6]:
# Stage 1: a shallow (depth-3) regression tree fitted on the raw targets.
# fit() returns the estimator itself, so construction and training chain.
tree_reg1 = DecisionTreeRegressor(max_depth=3).fit(x_train, y_train)

# Trailing expression keeps the fitted estimator displayed as cell output.
tree_reg1

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

##### Now train a second DecisionTreeRegressor on the residual errors made by the first predictor

In [7]:
# Residuals of stage 1 on the training set: what the first tree failed to
# explain. y_train is a pandas Series, so .sub() is the method form of `-`.
y2 = y_train.sub(tree_reg1.predict(x_train))

In [8]:
# Stage 2: same tree configuration, but the target is now the stage-1
# residuals rather than the charges themselves.
tree_reg2 = DecisionTreeRegressor(max_depth=3).fit(x_train, y2)

# Trailing expression keeps the fitted estimator displayed as cell output.
tree_reg2

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

##### Now we train a third regressor on the residual errors made by the second predictor

In [9]:
# Residuals still unexplained after stage 2 (stage-1 residuals minus the
# second tree's training predictions).
y3 = y2.sub(tree_reg2.predict(x_train))

In [10]:
# Stage 3: a third depth-3 tree trained on the stage-2 residuals.
tree_reg3 = DecisionTreeRegressor(max_depth=3).fit(x_train, y3)

# Trailing expression keeps the fitted estimator displayed as cell output.
tree_reg3

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

##### Now we have an ensemble containing three trees. It can make a prediction for a new instance simply by adding up the predictions of all three trees

In [11]:
# Ensemble prediction on the held-out rows: the first tree's estimate plus
# the two residual corrections, added element-wise.
y_pred = (
    tree_reg1.predict(x_test)
    + tree_reg2.predict(x_test)
    + tree_reg3.predict(x_test)
)

In [12]:
from sklearn.metrics import r2_score

# R^2 of the hand-built three-tree ensemble on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8185827356090973

### GradientBoostingRegressor
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

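The following code creates the same ensemble as the one we built by hand above: three depth-3 trees, each fitted on the residuals of the previous stage, with `learning_rate=1.0` so that every tree's prediction is added at full weight.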

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

In [14]:
# Three boosting stages of depth-3 trees with learning_rate=1.0 (each
# stage's correction is added at full weight), mirroring the manual
# three-tree construction above.
gbr = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=3,
    learning_rate=1.0,
).fit(x_train, y_train)

# Trailing expression keeps the fitted estimator displayed as cell output.
gbr

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
# Score the library ensemble on the same held-out rows.
# Note: this rebinds y_pred, replacing the manual ensemble's predictions.
y_pred = gbr.predict(x_test)

# R^2 on the test split, for comparison with the manual ensemble's score.
r2_score(y_true=y_test, y_pred=y_pred)

0.8185827356090974