In [236]:
%matplotlib inline

import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from six.moves import urllib
from sklearn.model_selection import StratifiedShuffleSplit

URL = "http://7xncmx.com1.z0.glb.clouddn.com/california-housing-prices.zip"
LOCAL_PATH = 'datasets'

## PART I - Load the original dataset

In [237]:
'''
##################################
PART I - Load the original dataset
##################################
'''
def fetch_housing_price(from_url=URL, to_local_dir=LOCAL_PATH):
     # 1.Create the dictionary if not exist
    if not os.path.isdir(to_local_dir):
        os.makedirs(to_local_dir)

    # 2.Download the zip file
    local_zip_path = os.path.join(to_local_dir, 'housing.zip')
    urllib.request.urlretrieve(from_url, local_zip_path)

    # 3.Unzip to the local directory
    unzip_housing = zipfile.ZipFile(local_zip_path, 'r')
    unzip_housing.extractall(to_local_dir)
    unzip_housing.close()

def load_housing_data(local_dir = LOCAL_PATH):
    csv_path = os.path.join(local_dir, 'housing.csv')
    return pd.read_csv(csv_path) # DataFrame

housing = load_housing_data() # The original sample collections

## PART II - Classsify the `median_income` column

In [238]:
'''
##############################################
PART II - Classsify the `median_income` column

Here we add a new column `income_category` 
to the dataset.
##############################################
'''
housing['income_category'] = np.ceil(housing['median_income'] / 1.5)
housing['income_category'].where(
    housing['income_category'] < 5, 5.0, inplace = True)

## PART III - Separate training and testing data

In [239]:
'''
##############################################
PART III - Separate training and testing data
##############################################
'''
split = StratifiedShuffleSplit(
    n_splits = 1, test_size = 0.2, random_state = 52)

for train_index, test_index in split.split(housing, housing['income_category']): 
    s_train_set = housing.loc[train_index]
    s_test_set = housing.loc[test_index]

for coll in (s_train_set, s_test_set):
    coll.drop('income_category', axis = 1, inplace = True)

## PART IV - Generate features and labels from training dataset

In [240]:
'''
##############################################
PART IV - Generate features and labels from
training dataset.
##############################################
'''
housing_features = s_train_set.drop('median_house_value', axis = 1)
housing_labels = s_train_set['median_house_value'].copy()

## PART V - Learn about Scikit-Learn built-in transformer

In [241]:
'''
##############################################
PART V - Learn about Scikit-Learn built-in
transformer.

We use a transformer called Imputer to fill
missing data in `housing_features`.
##############################################
'''
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')


housing_nums = housing_features.drop('ocean_proximity', axis = 1)

imputer.fit(housing_nums) # Calculate the median value of each column
housing_features_filled = imputer.transform(housing_nums) # Transform housing_features


## PART VI - How to customize a transformer

In [242]:
'''
##############################################
PART VI - How to customize a transformer

We use `CombinedAttributes` to add 3 extra
columns: 

* bedrooms / room
* rooms / household
* population / household
##############################################
'''
from sklearn.base import BaseEstimator, TransformerMixin

rooms_idx, bedrooms_idx, population_idx, households_idx = 3, 4, 5, 6

class CombinedAttributes(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X):
        return self

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_idx] / X[:, households_idx]
        population_per_household = X[:, population_idx] / X[:, households_idx]

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_idx] / X[:, rooms_idx]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
attr_adder = CombinedAttributes(add_bedrooms_per_room=True)
housing_features_extra = attr_adder.transform(housing_features.values)
pd.DataFrame(housing_extra_attribs).corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,-0.924893,-0.108106,0.044616,0.07168,0.098545,0.05584,-0.020587,-0.025876,-0.000482,0.094251
1,-0.924893,1.0,0.012467,-0.034361,-0.067184,-0.106626,-0.070546,-0.074447,0.104107,0.005907,-0.116333
2,-0.108106,0.012467,1.0,-0.364241,-0.323741,-0.297245,-0.306518,-0.11826,-0.148181,0.015669,0.137663
3,0.044616,-0.034361,-0.364241,1.0,0.930726,0.853973,0.917643,0.194526,0.133721,-0.025092,-0.185577
4,0.07168,-0.067184,-0.323741,0.930726,1.0,0.873738,0.978565,-0.010324,0.005862,-0.028573,0.086297
5,0.098545,-0.106626,-0.297245,0.853973,0.873738,1.0,0.904712,0.004168,-0.068703,0.071699,0.037048
6,0.05584,-0.070546,-0.306518,0.917643,0.978565,0.904712,1.0,0.011031,-0.077235,-0.027762,0.067354
7,-0.020587,-0.074447,-0.11826,0.194526,-0.010324,0.004168,0.011031,1.0,0.318398,0.022263,-0.617473
8,-0.025876,0.104107,-0.148181,0.133721,0.005862,-0.068703,-0.077235,0.318398,1.0,-0.004245,-0.406868
9,-0.000482,0.005907,0.015669,-0.025092,-0.028573,0.071699,-0.027762,0.022263,-0.004245,1.0,0.003495


## PART VII - How to handle non numeric values in dataset

In [243]:
'''
##############################################
PART VII - How to handle non numeric values in
dataset.
##############################################
'''

# Get statistics information about the column
housing_o = housing['ocean_proximity']
housing_o_encoded, housing_o_categories = housing_o.factorize()

'''
Transform by the built-in OneHotEncoder
'''
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
reshaped = housing_o_encoded.reshape(-1, 1)
housing_o_1hot = encoder.fit_transform(reshaped)
housing_o_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

### We copy `CategoricalEncoder` from ScikitLearn 0.20 dev branch

In [244]:
'''
************************************************************************
* The `CategoricalEncoder` is copied from ScikitLearn 0.20 dev branch. *
* We don't have to understand the details of its implementation.       *
************************************************************************
'''
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot (aka one-of-K or dummy)
    encoding scheme (``encoding='onehot'``, the default) or converted
    to ordinal integers (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories must be sorted and should not mix
          strings and numeric values.
        The used categories can be found in the ``categories_`` attribute.
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros. In the inverse transform, an unknown category
        will be denoted as None.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order corresponding with output of ``transform``).
    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[1., 0., 1., 0., 0.],
           [0., 1., 0., 0., 0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder assumes`` that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")

        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=np.object)
        else:
            X = X_temp

        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                if self.handle_unknown == 'error':
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(self.categories[i])

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using specified encoding scheme.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=np.object)
        else:
            X = X_temp

        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        feature_indices = np.cumsum(n_values)

        indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = X_mask.sum(axis=1).cumsum()
        indptr = np.insert(indptr, 0, 0)
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csr_matrix((data, indices, indptr),
                                shape=(n_samples, feature_indices[-1]),
                                dtype=self.dtype)
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [245]:
'''
We can use `CategoricalEncoder` to make a 
1hot transformation directly.
'''
cat_encoder = CategoricalEncoder()
housing_features_reshaped = housing_o.values.reshape(-1, 1)
housing_features_1hot = cat_encoder.fit_transform(housing_features_reshaped)

In [246]:
housing_features

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5864,-118.34,34.18,45.0,3046.0,633.0,1448.0,599.0,3.2400,<1H OCEAN
4350,-118.37,34.11,42.0,5518.0,979.0,1863.0,957.0,8.5842,<1H OCEAN
10745,-117.91,33.61,36.0,3082.0,455.0,771.0,365.0,11.2160,<1H OCEAN
618,-122.15,37.73,45.0,3758.0,819.0,1573.0,736.0,2.8355,NEAR BAY
7169,-118.18,34.06,33.0,278.0,71.0,266.0,56.0,0.8941,<1H OCEAN
19445,-121.02,37.69,19.0,3814.0,790.0,2219.0,804.0,3.5208,INLAND
9897,-122.27,38.29,36.0,1446.0,306.0,678.0,295.0,2.8409,NEAR BAY
18247,-122.07,37.39,30.0,1695.0,480.0,932.0,447.0,3.5045,NEAR BAY
432,-122.29,37.88,48.0,2365.0,490.0,1034.0,475.0,3.1065,NEAR BAY
8776,-118.32,33.80,29.0,3254.0,717.0,1593.0,680.0,4.0536,<1H OCEAN


In [247]:
housing_features_filled

array([[-1.1834e+02,  3.4180e+01,  4.5000e+01, ...,  1.4480e+03,
         5.9900e+02,  3.2400e+00],
       [-1.1837e+02,  3.4110e+01,  4.2000e+01, ...,  1.8630e+03,
         9.5700e+02,  8.5842e+00],
       [-1.1791e+02,  3.3610e+01,  3.6000e+01, ...,  7.7100e+02,
         3.6500e+02,  1.1216e+01],
       ...,
       [-1.1837e+02,  3.4090e+01,  2.2000e+01, ...,  1.7660e+03,
         1.1700e+03,  3.1517e+00],
       [-1.2140e+02,  3.8530e+01,  3.8000e+01, ...,  6.5000e+01,
         3.5000e+01,  9.2740e-01],
       [-1.2245e+02,  3.7750e+01,  3.5000e+01, ...,  1.7860e+03,
         3.0100e+02,  3.0804e+00]])

In [248]:
housing_features_extra

array([[-118.34, 34.18, 45.0, ..., 5.085141903171953, 2.4173622704507514,
        0.2078135259356533],
       [-118.37, 34.11, 42.0, ..., 5.765935214211076, 1.9467084639498433,
        0.1774193548387097],
       [-117.91, 33.61, 36.0, ..., 8.443835616438356, 2.1123287671232878,
        0.14763140817650877],
       ...,
       [-118.37, 34.09, 22.0, ..., 3.62991452991453, 1.5094017094017094,
        0.29503178714386624],
       [-121.4, 38.53, 38.0, ..., 4.3428571428571425, 1.8571428571428572,
        0.19736842105263158],
       [-122.45, 37.75, 35.0, ..., 4.528239202657807, 5.9335548172757475,
        0.2215700660308144]], dtype=object)

In [249]:
housing_features_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

## How can we merge all the above operations together?

In [252]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributes()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_nums)
housing_num_tr

array([[ 0.6095637 , -0.67609026,  1.30208678, ..., -0.13483487,
        -0.05882111, -0.09540914],
       [ 0.59459102, -0.70898432,  1.06309724, ...,  0.13363754,
        -0.09944349, -0.5794448 ],
       [ 0.82417203, -0.94394184,  0.58511815, ...,  1.18967375,
        -0.08514871, -1.05382616],
       ...,
       [ 0.59459102, -0.71838262, -0.53016638, ..., -0.70870714,
        -0.13718767,  1.29356599],
       [-0.91764908,  1.36804016,  0.74444452, ..., -0.42755658,
        -0.10717394, -0.26175035],
       [-1.44169268,  1.00150643,  0.50545497, ..., -0.35445073,
         0.24466335,  0.12366759]])

In [254]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [255]:
num_attribs = list(housing_nums)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributes()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
])


In [256]:
from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
housing_features

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5864,-118.34,34.18,45.0,3046.0,633.0,1448.0,599.0,3.2400,<1H OCEAN
4350,-118.37,34.11,42.0,5518.0,979.0,1863.0,957.0,8.5842,<1H OCEAN
10745,-117.91,33.61,36.0,3082.0,455.0,771.0,365.0,11.2160,<1H OCEAN
618,-122.15,37.73,45.0,3758.0,819.0,1573.0,736.0,2.8355,NEAR BAY
7169,-118.18,34.06,33.0,278.0,71.0,266.0,56.0,0.8941,<1H OCEAN
19445,-121.02,37.69,19.0,3814.0,790.0,2219.0,804.0,3.5208,INLAND
9897,-122.27,38.29,36.0,1446.0,306.0,678.0,295.0,2.8409,NEAR BAY
18247,-122.07,37.39,30.0,1695.0,480.0,932.0,447.0,3.5045,NEAR BAY
432,-122.29,37.88,48.0,2365.0,490.0,1034.0,475.0,3.1065,NEAR BAY
8776,-118.32,33.80,29.0,3254.0,717.0,1593.0,680.0,4.0536,<1H OCEAN


In [308]:
housing_prepared = full_pipeline.fit_transform(housing_features)

In [309]:
housing_prepared

array([[ 0.6095637 , -0.67609026,  1.30208678, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.59459102, -0.70898432,  1.06309724, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.82417203, -0.94394184,  0.58511815, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.59459102, -0.71838262, -0.53016638, ...,  0.        ,
         0.        ,  0.        ],
       [-0.91764908,  1.36804016,  0.74444452, ...,  0.        ,
         0.        ,  0.        ],
       [-1.44169268,  1.00150643,  0.50545497, ...,  0.        ,
         1.        ,  0.        ]])

In [310]:
housing_prepared.shape

(16512, 16)

## Train model by `LinearRegression` 

In [402]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [403]:
housing.iloc[:5]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_category
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3.0


In [404]:
some_data=housing_features.iloc[:5]
some_prepared = full_pipeline.transform(some_data)
housing_predictions = lin_reg.predict(some_prepared)

In [405]:
print("Predictions:", housing_predictions)

Predictions: [224576. 462592. 529664. 229824. 102592.]


In [406]:
some_labels = housing_labels.iloc[:5]
print("Labels:", list(some_labels))

Labels: [226900.0, 500001.0, 500001.0, 245400.0, 98200.0]


In [407]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

In [408]:
lin_rmse

68549.58930722883

In [409]:
housing['median_house_value'].mean()

206855.81690891474

In [410]:
housing_labels.mean()

207248.90836967053

## Train model by `DecisionTreeRegressor` 

In [411]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [412]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_mse = np.sqrt(tree_mse)

In [413]:
tree_mse

0.0

## The Cross Validation

In [414]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print(tree_rmse_scores)

[65867.67517596 73098.12973001 67993.81009907 71472.54462923
 71021.69010992 67802.2541816  74536.63624677 69020.98350458
 68610.26955017 71023.81419179]


In [415]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [416]:
display_scores(tree_rmse_scores)

Scores: [65867.67517596 73098.12973001 67993.81009907 71472.54462923
 71021.69010992 67802.2541816  74536.63624677 69020.98350458
 68610.26955017 71023.81419179]
Mean: 70044.78074190943
Standard deviation: 2514.76985129591


In [417]:
lin_scores = cross_val_score(lin_reg, 
                             housing_prepared, 
                             housing_labels, 
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [69365.44518281 69977.32503167 68099.18069008 67998.27389653
 72810.83551695 65790.26514406 69404.99732733 65470.44370747
 69646.27630104 69198.58316275]
Mean: 68776.16259606933
Standard deviation: 2007.6448179508736


## Train model by `RandomForestRegressor` 

In [418]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [419]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)

In [420]:
forest_rmse

21955.756438907018

In [421]:
forest_scores = cross_val_score(forest_reg, 
                             housing_prepared, 
                             housing_labels, 
                             scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [53468.31869273 54439.86893061 51550.59641978 51567.24495588
 53021.06810854 50103.64430756 56942.73613272 47857.10321075
 53615.21115494 52976.9299667 ]
Mean: 52554.27218802153
Standard deviation: 2349.941757095915
