# Pipelines

Pipelines bundle preprocessing and modelling steps so that the whole bundle can be used as if it were a single step.

In [34]:
# Loading Data and splitting
# Melbourne Housing Dataset

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Melbourne housing snapshot into a DataFrame
data = pd.read_csv('~/kaggle/input/melbourne-housing-snapshot/melb_data.csv')

# The sale price is the target; every other column is a candidate predictor
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation (seed fixed for reproducibility).
# Note: `y_test` is the target that pairs with `X_valid_full`.
X_train_full, X_valid_full, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [35]:
# Categorical predictors: object-dtype columns with fewer than 10 distinct
# values in the training split (keeps the one-hot encoding small)
categorical_cols = [
    name
    for name, column in X_train_full.items()
    if column.dtype == "object" and column.nunique() < 10
]

# Numerical predictors: columns stored as int64 or float64
numerical_cols = [
    name
    for name, column in X_train_full.items()
    if column.dtype in ('int64', 'float64')
]

In [36]:
# Keep only the chosen predictors: categorical columns first, then numerical
selected_cols = [*categorical_cols, *numerical_cols]

# Take independent copies of the selected columns so that the full
# training/validation frames are left untouched
X_train = X_train_full.loc[:, selected_cols].copy()
X_valid = X_valid_full.loc[:, selected_cols].copy()

In [37]:
# Preview the first rows of the selected training predictors
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


## Building Pipelines

The pipeline is built in three steps:

- **Define Preprocessing Steps**

    Use `ColumnTransformer` to bundle preprocessing steps together
    
    
- **Define the model**

    Use a `RandomForestRegressor` model
    
    
- **Create and evaluate the pipeline**

    Use class `Pipeline` to define a pipeline that bundles preprocessing and modelling steps together

### Define Preprocessing Steps

In [38]:
# STEP 1: DEFINE PREPROCESSING STEPS

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [39]:
# Preprocessing for numerical data: replace missing values with a constant.
# fill_value=0 is what the 'constant' strategy uses by default for numeric
# input; it is spelled out here so the choice is visible to the reader.
numerical_transformer = SimpleImputer(
    strategy='constant',
    fill_value=0,
)

In [40]:
# Preprocessing for categorical data:
#   1. fill missing entries with the column's most frequent value
#   2. one-hot encode into a dense array, ignoring categories not seen in fit

import inspect

# The dense-output flag of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old name was removed in 1.4.
# Pick whichever keyword the installed version accepts so this cell runs on
# both old and current releases.
_ohe_params = inspect.signature(OneHotEncoder).parameters
_dense_flag = 'sparse_output' if 'sparse_output' in _ohe_params else 'sparse'

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore', **{_dense_flag: False}))
])

In [41]:
# Route each group of columns to its own preprocessing:
#   'num' -> constant imputation on the numerical columns
#   'cat' -> imputation + one-hot encoding on the categorical columns
numeric_step = ('num', numerical_transformer, numerical_cols)
categorical_step = ('cat', categorical_transformer, categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

### Define the model

In [42]:
# STEP 2: DEFINE THE MODEL

from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees; the fixed seed keeps results reproducible
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

### Creating and evaluating pipeline

In [43]:
# STEP 3: CREATING AND EVALUATING PIPELINE

from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so they are fitted and applied together
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# END OF STEP 3

In [44]:
# Fit the full pipeline on the training split: the preprocessor is fitted and
# applied to X_train, then the model is trained on the transformed features
my_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
 

In [45]:
# Run the validation predictors through the already-fitted preprocessing
# steps and get the model's price predictions
preds = my_pipeline.predict(X_valid)

In [46]:
# Score the predictions against the held-out targets (`y_test` is the
# validation target produced by the train/validation split)
validation_mae = mean_absolute_error(y_test, preds)
print('MAE: {}'.format(validation_mae))

MAE: 160679.18917034855
