# Housing Pipeline

In [61]:
import pandas as pd
import numpy as np

# https://numpy.org/doc/stable/reference/arrays.scalars.html
# https://numpy.org/doc/stable/reference/arrays.dtypes.html
# https://www.bbc.co.uk/bitesize/guides/zscvxfr/revision/3

# I'm type casting the data to save space in memory.
# Although the full CSV it's just ~1MB, it's generally useful to set the right type for each attribute.
housing = pd.read_csv(
    'housing.csv',
    dtype={
        'longitude': np.float32,
        'latitude': np.float32,
        'housing_median_age': np.uint8,
        'population': np.uint16,
        'households': np.uint16,
        'median_income': np.float32,
        'median_house_value': np.float32,
        'ocean_proximity': 'category'
    }
)

housing = housing.assign(
    # used to create stratified sampling for the training and test set
    median_income_categories = pd.cut(
        housing['median_income'],
        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
        labels=[1, 2, 3, 4, 5]
    )
)

## Training, Validation and Test split

Split the original dataset in 3 subsets in order to generate the Training, Validation and Test sets, stratifying the samples baed on the `median_income_categories` attribute.

In [62]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for training_index, test_index in split.split(housing, housing['median_income_categories']):
    training_set = housing.loc[training_index]
    test_set = housing.loc[test_index]

for training_index, validation_index in split.split(training_set, training_set['median_income_categories']):
    training_set = housing.loc[training_index]
    validation_set = housing.loc[validation_index]

Separate the label attribute from the rest of the dataset

In [63]:
X_training = training_set.drop('median_house_value', axis=1)
y_training = training_set[['median_house_value']].copy()

X_validation = validation_set.drop('median_house_value', axis=1)
y_validation = validation_set[['median_house_value']].copy()

X_test = test_set.drop('median_house_value', axis=1)
y_test = test_set[['median_house_value']].copy()

## Pipeline

Create a custom transformer

In [88]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeaturesAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['rooms_per_household'] = X['total_rooms'] / X['households']
        X['bedrooms_per_room'] = X['total_bedrooms'] / X['total_rooms']
        X['population_per_household'] = X['population'] / X['households']
        return X

List numerical and categorical attribute names

In [100]:
label = 'median_house_value'

num_attributes = np.array(housing.drop([
    'ocean_proximity', # categorical
    'median_income_categories', # categorical
    label
], axis=1).columns)

num_attributes = np.append(num_attributes, ['rooms_per_household', 'bedrooms_per_room', 'population_per_household'])

cat_attributes = ['ocean_proximity']

In [101]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

column_transformer = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

pipeline = Pipeline([
    ('features_adder', FeaturesAdder()),
    ('column_transformer', column_transformer)
])

Transform the **Training** set

In [102]:
pipeline.fit_transform(X_training)

array([[-1.74245559,  1.19599336,  1.53645742, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.9497955 , -0.99423992, -1.04712607, ...,  0.        ,
         0.        ,  1.        ],
       [-1.62811225,  1.23903297,  1.77132865, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.49762065, -0.53993333,  1.14500538, ...,  0.        ,
         0.        ,  0.        ],
       [-1.20712447,  1.61682491, -0.73396443, ...,  0.        ,
         0.        ,  0.        ],
       [-0.47948829,  1.79376472, -0.81225484, ...,  0.        ,
         0.        ,  0.        ]])

Fit and predict with **Linear Regression**

In [103]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
regression = lin_reg.fit(X_training, y_training)

ValueError: Input contains NaN

In [40]:
regression.score(X_training_transformed, y_training)

0.6317681030307358

In [41]:
regression.coef_

array([[-52221.01888317, -55344.85860461,  14084.95498277,
          4117.06747897,   8561.7460345 , -46797.984934  ,
         39549.15601785,  77616.54636622,   7161.98914326,
         17331.92509365,    322.77703972, -16002.40424786,
        -50249.31984802, 106084.38909803, -20456.50023774,
        -19376.16476442]])

In [42]:
regression.intercept_

array([229671.22666063])

In [43]:
predictions = regression.predict(X_validation_transformed)

In [45]:
from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(y_validation, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

67511.67216078292

In [47]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_training_transformed, y_training)
tree_reg.score(X_training_transformed, y_training)

1.0

In [48]:
tree_predictions = tree_reg.predict(X_validation_transformed)

In [49]:
tree_mse = mean_squared_error(y_validation, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

97953.89241777746