# Decision Trees

Source: https://scikit-learn.org/stable/modules/tree.html#regression

Decision trees are a **non-parametric** supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple **decision rules** inferred from the data features. A tree can be seen as a piecewise constant approximation.

# EPA Dataset

In [None]:
# Install the Python `graphviz` package that a later cell imports.
!pip install graphviz
# Install the system Graphviz package through apt (requires sudo and an
# apt-based system). NOTE(review): presumably this supplies the `dot`
# executable needed when the tree is rendered -- confirm on your platform.
!sudo apt-get install graphviz -y

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import tree
%matplotlib inline

In [None]:
# Load the EPA fuel-economy data set directly from GitHub.
epa = pd.read_csv('https://raw.githubusercontent.com/sqlshep/SQLShepBlog/master/data/epaMpg.csv')

# Drop the first column, which only holds the row number.
epa = epa.drop(epa.columns[[0]], axis=1)

# Replace the "." in the column names with "_".
# regex=False is passed explicitly: as a regular expression "." matches every
# character, and the default of `regex` differs between pandas versions.
epa.columns = epa.columns.str.replace('.', '_', regex=False)

# Drop columns that are not used by the model. The positions in the second
# call index the frame *after* the first call has already removed three
# columns, so the two calls must stay in this order.
epa = epa.drop(epa.columns[[0, 1, 2]], axis=1)
epa = epa.drop(epa.columns[[3, 9, 11]], axis=1)

# Mark the code columns as categorical so that get_dummies encodes them
# instead of leaving them as plain values.
epa['Tested_Transmission_Type_Code'] = epa['Tested_Transmission_Type_Code'].astype('category')
epa['Drive_System_Code'] = epa['Drive_System_Code'].astype('category')

# One-hot encode the categorical columns.
epa = pd.get_dummies(epa)

In [None]:
# Separate the target column ('FuelEcon') from the predictors using one
# boolean column mask. Both results stay DataFrames.
# (To model on a single predictor instead, select only the 'Weight' column
# for epa_X.)
is_target = epa.columns == 'FuelEcon'
epa_X = epa.loc[:, ~is_target]
epa_y = epa.loc[:, is_target]

In [None]:
# Split the training and test set 
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows land in each partition on every run, which keeps the metrics
# reported further down reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    epa_X, epa_y, test_size=0.20, random_state=42
)

In [None]:
# Sanity check: show the shape of each of the four partitions on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Regression tree with default hyper-parameters. random_state is fixed because
# scikit-learn shuffles the candidate features at each split, so an unseeded
# tree can differ from run to run when several splits are equally good.
epa_tree = tree.DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the tree on the training partition; the result of fit() is bound back
# to the same name.
epa_tree = epa_tree.fit(X=X_train, y=y_train)

In [None]:
#tree.plot_tree(epa_tree) 

In [None]:
import graphviz

# Export the fitted tree as DOT text. Passing the training column names labels
# every split with the real feature name instead of a positional "x[i]"
# placeholder, which is unreadable against the one-hot-encoded frame.
dot_data = tree.export_graphviz(
    epa_tree,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(X_train.columns),
)
graph = graphviz.Source(dot_data)
# Save the DOT source under the name "EPA" and render it with Graphviz.
graph.render("EPA")

In [None]:
# Predict the target for the held-out rows with the fitted tree.
epa_y_pred = epa_tree.predict(X=X_test)

In [None]:

# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_test, epa_y_pred)
print('Mean squared error: %.2f' % mse)

# Coefficient of determination (R^2): 1 is perfect prediction.
r2 = r2_score(y_test, epa_y_pred)
print('Coefficient of determination: %.2f' % r2)