# Decision Trees Regressor

Demonstrating the decision trees model with the california housing dataset and DecisionTreeRegressor class in sklearn

In [15]:
#importing important libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

#to visualize the tree
from sklearn import tree
from sklearn.tree import export_text

In [4]:
np.random.seed(306)

In [8]:
# Using shuffle split as the cross validation strategy
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

## Loading the dataset

In [14]:
# fetch dataset
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

#train-test split (combined training set)
com_train_features, test_features, com_train_labels, test_labels = train_test_split(features, labels)

#Further splitting of train set into train and development set
train_features, dev_features, train_labels, dev_labels = train_test_split(com_train_features, com_train_labels)

## Model Setup

In [34]:
dt_reg_pipeline = Pipeline([ ('feature_scaling', StandardScaler()),
                            ('dt_reg', DecisionTreeRegressor(max_depth=2, random_state=2)) ])

dt_reg_cv_results = cross_validate(dt_reg_pipeline, 
                                   com_train_features, 
                                   com_train_labels,
                                  cv = cv,
                                  scoring='neg_mean_absolute_error',
                                  return_train_score=True,
                                  return_estimator=True)

dt_reg_train_error = -1 * dt_reg_cv_results['train_score']
dt_reg_test_error = -1 * dt_reg_cv_results['test_score']

print('Mean absolute error on the train set: {:.3f} +/- {:.3f}\n'.format(dt_reg_train_error.mean(),dt_reg_train_error.std()))
print('Mean absolute error on the train set: {:.3f} +/- {:.3f}\n'.format(dt_reg_train_error.mean(),dt_reg_train_error.std()))

Mean absolute error on the train set: 0.656 +/- 0.002

Mean absolute error on the train set: 0.656 +/- 0.002



<b>sklearn.model_selection.cross_validate</b> returns a dictionary containing fit-times, score-times (and optionally training scores as well as fitted estimators) in addition to the test score.

## Visualizing the tree