# Using Pipelines on the Melbourne Housing Dataset

In [18]:
# Importing the required libraries
import pandas as pd
import numpy as np

In [19]:
# Load the housing CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="melb_data.csv")
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [20]:
# The Price column is the prediction target; every other column is a candidate feature
y = df["Price"]
X = df.drop(columns=["Price"])

In [21]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is reproducible
from sklearn.model_selection import train_test_split
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print("X_train_full: ", X_train_full.shape)
print("X_test_full: ", X_test_full.shape)
print("y_train: ", y_train.shape)
# This line reports the held-out target, so it is labelled y_test (not X_test)
print("y_test: ", y_test.shape)

X_train_full:  (10864, 20)
X_test_full:  (2716, 20)
y_train:  (10864,)
X_test:  (2716,)


In [27]:
# Not every column is used for modelling: keep the numeric columns plus the
# low-cardinality text columns, and leave the rest out.

# Numeric features: columns whose dtype is int64 or float64
num_cols = [name for name, dtype in X_train_full.dtypes.items() if dtype in ["int64", "float64"]]

# Categorical features: object-dtype columns with fewer than 10 distinct values in the training split
obj_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype == "object" and X_train_full[name].nunique() < 10
]

# Restrict both splits to the selected columns; .copy() makes them independent of the full frames
cols = num_cols + obj_cols
X_train = X_train_full.loc[:, cols].copy()
X_test = X_test_full.loc[:, cols].copy()

In [28]:
# Sanity check: preview the reduced training frame to confirm the selected columns are present
X_train.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
1041,3,11.2,3186.0,3.0,1.0,2.0,366.0,156.0,1920.0,-37.9038,145.0001,10579.0,h,S,Southern Metropolitan
1989,3,7.8,3058.0,3.0,1.0,0.0,238.0,131.0,1900.0,-37.7539,144.9612,11204.0,h,S,Northern Metropolitan
10157,3,5.2,3056.0,3.0,1.0,1.0,439.0,,,-37.77047,144.97005,11918.0,h,S,Northern Metropolitan
1711,2,11.4,3163.0,2.0,1.0,2.0,0.0,100.0,1973.0,-37.8863,145.066,7822.0,u,S,Southern Metropolitan
11565,4,11.0,3018.0,4.0,2.0,4.0,615.0,,,-37.87057,144.83623,5301.0,h,S,Western Metropolitan


### Preprocessing

The next step is to preprocess the data. The two groups of columns need different preprocessing: for the numeric columns we only have to deal with the null values, while for the object (categorical) columns we have to deal with the null values and also perform encoding.

For this purpose, a SimpleImputer will be used for the numeric columns, and a Pipeline will be used for the object columns, since they need multiple operations, e.g., handling null values and then encoding. A ColumnTransformer then applies each of these to its own set of columns.

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numeric columns: fill missing values with a constant
# (no fill_value is passed, so the imputer's default constant applies)
num_imputer = SimpleImputer(strategy="constant")

# Object columns need two steps in sequence: fill gaps with the most frequent
# value, then one-hot encode. handle_unknown="ignore" means categories not
# seen during fit do not raise an error at transform time.
obj_pipeline = Pipeline([
    ("obj_imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot_encoding", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its own transformer
preprocessing = ColumnTransformer([
    ("num", num_imputer, num_cols),
    ("cat", obj_pipeline, obj_cols),
])


In [30]:
# Model: a random forest regressor with 100 trees; the fixed random_state keeps runs repeatable
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
    n_estimators=100,
    random_state=1,
)

### Using Pipeline

The final step in the process would be to define a pipeline and provide the preprocessing and model to it.

In [32]:
# Bundle the preprocessing and the model into a single estimator
my_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("model", model),
])

# Fit on the training split, then predict on the held-out split; the pipeline
# runs the preprocessing step ahead of the model in both calls
my_pipeline.fit(X_train, y_train)
y_pred = my_pipeline.predict(X_test)

In [33]:
# Score the held-out predictions with mean absolute error (an error metric: lower is better)
from sklearn.metrics import mean_absolute_error

print("MAE: ", mean_absolute_error(y_true=y_test, y_pred=y_pred))

MAE:  157178.0246242724
