# Gradient Tree Boosting

Gradient Tree Boosting or Gradient Boosted Decision Trees (GBDT) is a generalization of boosting to arbitrary differentiable loss functions. GBDT is an accurate and effective off-the-shelf procedure that can be used for both regression and classification problems in a variety of areas including Web search ranking and ecology.

The `sklearn.ensemble` module provides estimators for both classification and regression via gradient boosted decision trees.

https://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting


In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import graphviz 
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
import math

In [None]:
# Load the EPA fuel-economy data set straight from GitHub (needs network access).
epa = pd.read_csv('https://raw.githubusercontent.com/sqlshep/SQLShepBlog/master/data/epaMpg.csv')

# Drop the first column, which holds the row number.
epa = epa.drop(epa.columns[[0]], axis=1)

# Replace the "." in the column names with "_".
# regex=False treats "." as a literal dot. On pandas < 2.0 the default was
# regex=True, where "." matches *every* character and would turn each column
# name into a run of underscores (breaking the lookups by name below).
epa.columns = epa.columns.str.replace('.', '_', regex=False)

# Drop columns that are not wanted as model inputs. The drops are positional
# and sequential: the second call indexes the frame left over after the first.
epa = epa.drop(epa.columns[[0, 1, 2]], axis=1)
epa = epa.drop(epa.columns[[3, 9, 11]], axis=1)

# Mark the two code columns as categorical so get_dummies expands them
# into indicator columns.
epa['Tested_Transmission_Type_Code'] = epa['Tested_Transmission_Type_Code'].astype('category')
epa['Drive_System_Code'] = epa['Drive_System_Code'].astype('category')

# One-hot encode the categorical columns (get_dummies also expands any
# remaining object/string columns).
epa = pd.get_dummies(epa)

In [None]:
# Features: every column except the target.
# (To experiment with a single predictor instead, use: epa_X = epa[['Weight']])
epa_X = epa.drop(columns='FuelEcon')

# Target: select 'FuelEcon' as a 1-D Series. A one-column DataFrame makes
# scikit-learn emit a DataConversionWarning ("column-vector y was passed")
# when the regressor is fitted.
epa_y = epa['FuelEcon']

In [None]:
# Split into training (80%) and test (20%) sets.
# random_state pins the shuffle so the split, and therefore every metric
# computed from it, is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    epa_X, epa_y, test_size=0.20, random_state=0)

In [None]:
# Sanity check: show the shapes of the four pieces produced by the split.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape )

In [None]:
# Gradient boosting regressor with 500 boosting stages.
# max_leaf_nodes must be None (no limit on leaves) or an int >= 2; -1 is not a
# valid value and is rejected by scikit-learn's parameter validation at fit time.
# random_state=0 makes the fitted model reproducible.
epa_gb = GradientBoostingRegressor(n_estimators=500, max_leaf_nodes=None, random_state=0)

In [None]:
# Fit the boosted ensemble on the training split.
epa_gb.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows.
epa_y_pred = epa_gb.predict(X_test)

In [None]:
# Score the held-out predictions. The MSE is computed once and reused for
# the RMSE.

# The mean squared error
mse = mean_squared_error(y_test, epa_y_pred)
print('Mean squared error: %.2f' % mse)

# The root mean squared error
rmse = math.sqrt(mse)
print('Root Mean squared error: %.2f' % rmse)

# The coefficient of determination: 1 is perfect prediction
r2 = r2_score(y_test, epa_y_pred)
print('Coefficient of determination: %.2f' % r2)

In [None]:
# Display the tree stored at position [0, 0] of the fitted ensemble,
# i.e. the first boosting stage's regression tree.
epa_gb.estimators_[0,0]

In [None]:
# Export the first boosting stage's tree to Graphviz DOT text (out_file=None
# returns the DOT source as a string instead of writing a file).
# NOTE(review): feature_names is not passed, so split nodes are presumably
# labelled by feature index rather than column name — confirm if names are wanted.
dot_data = tree.export_graphviz(epa_gb.estimators_[0,0], out_file=None) 
graph = graphviz.Source(dot_data) 
# Write the DOT source to "EPA_XGB" and render it with Graphviz
# (graphviz's default output format is PDF; needs the Graphviz binaries installed).
graph.render("EPA_XGB") 

In [None]:
# Display the shape of the array of fitted trees; scikit-learn documents it
# as (n_estimators, 1) for a regressor.
epa_gb.estimators_.shape