# Housing Pipeline

In [22]:
import pandas as pd
import numpy as np

# https://numpy.org/doc/stable/reference/arrays.scalars.html
# https://numpy.org/doc/stable/reference/arrays.dtypes.html
# https://www.bbc.co.uk/bitesize/guides/zscvxfr/revision/3

# Explicit, compact dtypes are requested up front to save space in memory.
# The full CSV is only ~1MB, but choosing a fitting type for each attribute
# is generally a useful habit.
column_dtypes = {
    'longitude': np.float32,
    'latitude': np.float32,
    'housing_median_age': np.uint8,
    'population': np.uint16,
    'households': np.uint16,
    'median_income': np.float32,
    'median_house_value': np.float32,
    'ocean_proximity': 'category',
}
housing = pd.read_csv('housing.csv', dtype=column_dtypes)

# Derived attributes. Each callable receives the freshly loaded frame, so every
# new column is computed from the original CSV columns only.
housing = housing.assign(
    # Income buckets (labels 1-5); used to create stratified sampling for the
    # training and test set.
    median_income_categories=lambda df: pd.cut(
        df['median_income'],
        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
        labels=[1, 2, 3, 4, 5],
    ),
    # Ratio features built from the raw count columns.
    rooms_per_household=lambda df: df['total_rooms'] / df['households'],
    bedrooms_per_room=lambda df: df['total_bedrooms'] / df['total_rooms'],
    population_per_household=lambda df: df['population'] / df['households'],
)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   longitude                 20640 non-null  float32 
 1   latitude                  20640 non-null  float32 
 2   housing_median_age        20640 non-null  uint8   
 3   total_rooms               20640 non-null  float64 
 4   total_bedrooms            20433 non-null  float64 
 5   population                20640 non-null  uint16  
 6   households                20640 non-null  uint16  
 7   median_income             20640 non-null  float32 
 8   median_house_value        20640 non-null  float32 
 9   ocean_proximity           20640 non-null  category
 10  median_income_categories  20640 non-null  category
 11  rooms_per_household       20640 non-null  float64 
 12  bedrooms_per_room         20433 non-null  float64 
 13  population_per_household  20640 non-null  floa

## Training and Test split

Split the data into training and test sets, stratified on the `median_income_categories` attribute.

In [28]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified shuffle: 80% training / 20% test, reproducible through the
# fixed random_state.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(
    split.split(housing, housing['median_income_categories'])
)

# The splitter returns *positional* indices, so select rows with .iloc.
# .loc is label-based and would only pick the right rows while `housing`
# keeps its default 0..n-1 RangeIndex.
train_set = housing.iloc[train_index]
test_set = housing.iloc[test_index]

## Training

Separate the labels (`median_house_value`) from the remaining attributes.

In [45]:
# Predictors: every training column except the target.
X_train = train_set.drop(columns='median_house_value')
# Target, kept as a one-column DataFrame and copied so it is independent of train_set.
y_train = train_set.loc[:, ['median_house_value']].copy()

Separate the numerical from the categorical attributes

In [54]:
# Numerical attributes: all columns except the categorical one, the
# stratification helper and the label. Original column order is kept.
num_attributes = housing.columns.drop(
    ['ocean_proximity', 'median_income_categories', 'median_house_value']
)
# Categorical attributes: only `ocean_proximity`.
cat_attributes = housing.loc[:, ['ocean_proximity']].columns

In [55]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numerical branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode the categories.
cat_pipeline = Pipeline(steps=[
    ('encoder', OneHotEncoder()),
])

# Route each group of columns through its own branch. Columns listed in
# neither group are not passed to any branch.
pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

# Fit on the training predictors and display the transformed matrix.
pipeline.fit_transform(X_train)

array([[-0.94135   ,  1.34743845,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178042, -1.19243959, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758072, -0.1259721 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707948 ,  1.31001766,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.5608015 ,  1.24921155, -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28104994,  2.02567507, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])