# 7. Pipelines - putting it all together

[Pipeline Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw car-sales CSV (path is relative to this notebook) for a first look
data = pd.read_csv('../00.datasets/car-sales-extended-missing-data.csv')

In [3]:
# Preview the first rows to see the available columns and sample values
data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [4]:
# Inspect column dtypes to tell text columns apart from numeric ones
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [5]:
# Count the missing entries in each column
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

# Steps we want to do
1. Fill missing data
2. Convert data to numerical values
3. Build model on the data

### Import libraries

In [6]:
# Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV


### Load data

In [7]:
# Reload the dataset and discard rows with no target value ('Price'),
# since a row without a label cannot be used for training or scoring
data = (
    pd.read_csv('../00.datasets/car-sales-extended-missing-data.csv')
    .dropna(subset=['Price'])
)

In [8]:
# Re-count missing entries per column after dropping rows without a price
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

### Set up pipelines to handle missing values and convert data to numerical values

In [9]:
# Categorical columns: replace missing entries with the literal label
# 'missing', then one-hot encode (categories unseen at fit time are ignored)
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Door count column: fill missing entries with the constant 4
door_features = ['Doors']
door_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=4)),
])

In [11]:
# Numerical column: fill missing odometer readings with the column mean.
# The step name 'imputer' is referenced by the hyperparameter grid, so keep it.
numerical_features = ['Odometer (KM)']
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

### Set up a ColumnTransformer for the pipelines defined above

In [12]:
# Preprocessing: route each group of columns to its own transformer
# (fill missing values, convert categorical data to numerical values)
preprocessor = ColumnTransformer([
    ('categorical', categorical_transformer, categorical_features),
    ('door', door_transformer, door_features),
    ('numerical', numerical_transformer, numerical_features),
])

### Create preprocessing and modelling pipeline

In [13]:
# Full pipeline: preprocessing followed by a random forest regressor.
# The step names ('preprocessor', 'model') are the prefixes used to address
# nested parameters during hyperparameter search.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor()),
])

### Split data

In [14]:
# Seed NumPy's global random number generator before any fitting happens
import numpy as np
np.random.seed(42)

# Target is the sale price; features are every remaining column
y = data['Price']
X = data.drop(columns='Price')

# Hold out 20% of the rows as a test set (split itself is fixed by random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Train and score the model

In [15]:
# Fit the whole pipeline on the training split, then score it on the test split
model.fit(X_train, y_train).score(X_test, y_test)

0.21528416466346045

### Hyperparameter tuning with the Pipeline

In [16]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the pipeline.
# Keys use '<step>__<param>' paths: 'preprocessor' and 'model' are the pipeline
# step names, 'numerical' is the ColumnTransformer entry, and 'imputer' is the
# step inside the numerical transformer.
pipe_grid = {
    'preprocessor__numerical__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 500, 1000],
    'model__max_depth': [None, 5],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
    # RandomForestRegressor it meant "use all features", which is 1.0.
    'model__max_features': [1.0],
    'model__min_samples_split': [2, 4]
}

# 5-fold cross-validated search over every combination in pipe_grid;
# verbose=2 prints one log line per fit
gs_model = GridSearchCV(
    estimator=model,
    param_grid=pipe_grid,
    cv=5,
    verbose=2,
)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean, total=   0.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean, total=   0.5s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_de

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=median, total=   3.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=median, total=   3.4s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=median, total=   3.6s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=median 
[

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean, total=   3.1s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean, total=   2.7s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean, total=   2.5s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__

[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median, total=   1.0s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median, total=   1.0s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median, total=   0.9s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_depth=5, 

[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numerical__imputer__strategy=mean, total=   0.9s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numerical__imputer__strategy=mean 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numerical__imputer__strategy=mean, total=   0.9s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median, total=   0.9s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numerical__imputer__strategy=median 
[CV]  model__max_depth=5, model

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  2.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehotencoder',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('d

In [17]:
# Score the grid-searched model on the held-out test split, for comparison
# with the untuned pipeline's score from model.score above
gs_model.score(X_test, y_test)

0.33462423064180824