 - Pipelines chains together multiple steps so that the output of each step can be used as an input to the next step.

 - Pipelines make it easy to apply the same preprocessing to train and test.

We will work on Titanic dataset:
    
A classification problem to predict whether person survived or not

# Titanic without using pipelines

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("/Users/rajatchauhan/Desktop/Machine Learning Notes/Datasets/titanic.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We will import some libraries do some preprocessing

1. First to split the data into train and test

In [4]:
from sklearn.model_selection import train_test_split

2. To handle missing data

In [5]:
from sklearn.impute import SimpleImputer

3. To handle nominal categorical data

In [6]:
from sklearn.preprocessing import OneHotEncoder

4. For scaling the data

In [7]:
from sklearn.preprocessing import MinMaxScaler

5. Decision Tree Classifier for prediction

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [9]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


 - Dropping unnecessary columns

In [10]:
df.drop(columns = ["PassengerId", "Name", "Ticket", "Cabin"], inplace= True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [11]:
X = df.drop(columns = "Survived")
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [12]:
y = df["Survived"]
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

# Step- 1 Train-Test split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [14]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [15]:
y_train

331    0
733    0
382    0
704    0
813    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: Survived, Length: 712, dtype: int64

# Step- 2 Handling missing values

In [16]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

For these two columns Age and Embarked, we will need to replace null values using Simple Imputer

1. Age: Missing values in Age column we can replace by mean

In [17]:
si_age = SimpleImputer(strategy='mean')

In [18]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [19]:
X_test.isnull().sum()

Pclass       0
Sex          0
Age         37
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64

First, we fit and transform on training data and then use same transformation for test data

In [20]:
X_train_age = si_age.fit_transform(X_train[["Age"]])
X_test_age = si_age.transform(X_test[["Age"]])

In [21]:
X_train_age

array([[45.5       ],
       [23.        ],
       [32.        ],
       [26.        ],
       [ 6.        ],
       [24.        ],
       [45.        ],
       [29.        ],
       [29.49884615],
       [29.49884615],
       [42.        ],
       [36.        ],
       [33.        ],
       [17.        ],
       [29.        ],
       [50.        ],
       [35.        ],
       [38.        ],
       [34.        ],
       [17.        ],
       [11.        ],
       [61.        ],
       [30.        ],
       [ 7.        ],
       [63.        ],
       [20.        ],
       [29.49884615],
       [29.        ],
       [36.        ],
       [29.49884615],
       [50.        ],
       [27.        ],
       [30.        ],
       [33.        ],
       [29.49884615],
       [29.49884615],
       [ 2.        ],
       [25.        ],
       [51.        ],
       [25.        ],
       [29.49884615],
       [29.49884615],
       [24.        ],
       [18.        ],
       [29.49884615],
       [25

In [22]:
X_test_age

array([[29.49884615],
       [31.        ],
       [20.        ],
       [ 6.        ],
       [14.        ],
       [26.        ],
       [29.49884615],
       [16.        ],
       [16.        ],
       [19.        ],
       [37.        ],
       [44.        ],
       [29.49884615],
       [30.        ],
       [36.        ],
       [16.        ],
       [42.        ],
       [29.49884615],
       [27.        ],
       [47.        ],
       [24.        ],
       [34.        ],
       [19.        ],
       [20.        ],
       [29.49884615],
       [10.        ],
       [40.        ],
       [31.        ],
       [ 4.        ],
       [31.        ],
       [19.        ],
       [22.        ],
       [29.49884615],
       [29.49884615],
       [18.        ],
       [27.        ],
       [28.        ],
       [29.49884615],
       [30.        ],
       [29.49884615],
       [21.        ],
       [29.        ],
       [29.49884615],
       [29.49884615],
       [45.        ],
       [16

So, we can see missing values are being replaced by mean in both Train and Test data

2. Embarked: Embarked being a categorical column, we can replace missing value by most frequent category

In [23]:
si_emb = SimpleImputer(strategy="most_frequent")

In [24]:
X_train_emb = si_emb.fit_transform(X_train[["Embarked"]])
X_test_emb = si_emb.transform(X_test[["Embarked"]])

In [25]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [36]:
X_train["Sex"].unique()

array(['male', 'female'], dtype=object)

In [37]:
X_train["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

# Step - 3 One Hot Encoding on Nominal Features Sex and Embarked

As, Machine learning models mostly only work with numerical dataset, we need to apply Encoding to these categorical columns

In [26]:
ohe_sex = OneHotEncoder(sparse=False, handle_unknown= "ignore")
ohe_emb = OneHotEncoder(sparse=False, handle_unknown= "ignore")

In [27]:
X_train_sex = ohe_sex.fit_transform(X_train[["Sex"]])
X_test_sex = ohe_sex.transform(X_test[["Sex"]])

In [30]:
X_train_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

This is array of size 2, as we had 2 categories in Sex: male and female

In [28]:
X_train_emb = ohe_emb.fit_transform(X_train_emb)
X_test_emb = ohe_emb.transform(X_test_emb)

In [35]:
X_test_emb

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0

Because, we had three categories in Embarked

## Step 4. Combining the arrays formed and making the complete dataset

Firstly, making a dataframe of remaining columns, that were not modified in the preprocessing and encoding steps

In [38]:
X_train_rem = X_train.drop(columns = ["Sex", "Age", "Embarked"])
X_train_rem

Unnamed: 0,Pclass,SibSp,Parch,Fare
331,1,0,0,28.5000
733,2,0,0,13.0000
382,3,0,0,7.9250
704,3,1,0,7.8542
813,3,4,2,31.2750
...,...,...,...,...
106,3,0,0,7.6500
270,1,0,0,31.0000
860,3,2,0,14.1083
435,1,1,2,120.0000


In [39]:
X_test_rem = X_test.drop(columns = ["Sex", "Age", "Embarked"])
X_test_rem

Unnamed: 0,Pclass,SibSp,Parch,Fare
709,3,1,1,15.2458
439,2,0,0,10.5000
840,3,0,0,7.9250
720,2,0,1,33.0000
39,3,1,0,11.2417
...,...,...,...,...
433,3,0,0,7.1250
773,3,0,0,7.2250
25,3,1,5,31.3875
84,2,0,0,10.5000


In [40]:
X_train_transformed = np.concatenate((X_train_rem,X_train_age,X_train_sex, X_train_emb), axis = 1)
X_train_transformed

array([[1., 0., 0., ..., 0., 0., 1.],
       [2., 0., 0., ..., 0., 0., 1.],
       [3., 0., 0., ..., 0., 0., 1.],
       ...,
       [3., 2., 0., ..., 0., 0., 1.],
       [1., 1., 2., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [43]:
X_train_transformed.shape

(712, 10)

In [41]:
X_test_transformed = np.concatenate((X_test_rem,X_test_age,X_test_sex, X_test_emb), axis = 1)
X_test_transformed

array([[3., 1., 1., ..., 1., 0., 0.],
       [2., 0., 0., ..., 0., 0., 1.],
       [3., 0., 0., ..., 0., 0., 1.],
       ...,
       [3., 1., 5., ..., 0., 0., 1.],
       [2., 0., 0., ..., 0., 0., 1.],
       [3., 1., 1., ..., 0., 0., 1.]])

In [44]:
X_test_transformed.shape

(179, 10)

## Step 5. Training Decision Tree Model on this dataset

Now, that all the preprocessing is done, we can apply a ML algorithm on it

In [45]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier()

In [46]:
y_pred = clf.predict(X_test_transformed)

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
accuracy_score(y_test, y_pred)

0.7821229050279329

# Step 6. Exporting Model to a Web Development

pickle is a Python module that is used for serializing and deserializing Python objects. 

Serialization is the process of converting a Python object into a byte stream, and deserialization is the process of converting that byte stream back into a Python object.

In [49]:
import pickle

Exporting our objects, or encoding our objects

In [51]:
pickle.dump(ohe_sex,open("models/ohe_sex.pkl", "wb"))
pickle.dump(ohe_emb,open("models/ohe_emb.pkl", "wb"))
pickle.dump(clf,open("models/clf.pkl", "wb"))

## Step 7. Using Pickle Files

In [53]:
import pickle
import numpy as np

Decoding the pickle objects

In [55]:
ohe_sex = pickle.load(open("models/ohe_sex.pkl", "rb"))
ohe_emb = pickle.load(open("models/ohe_emb.pkl", "rb"))
clf = pickle.load(open("models/clf.pkl", "rb"))

In [56]:
test_input = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [57]:
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [58]:
test_input_sex = ohe_sex.transform(test_input[:,1].reshape(1,1))



In [59]:
test_input_sex

array([[0., 1.]])

In [61]:
test_input_emb = ohe_emb.transform(test_input[:,-1].reshape(1,1))

In [62]:
test_input_emb

array([[0., 0., 1.]])

In [63]:
test_input_age = test_input[:,2].reshape(1,1)
test_input_age

array([[31.0]], dtype=object)

In [65]:
test_input_transformed = np.concatenate((test_input[:,[0,3,4,5]],test_input_age,test_input_sex,test_input_emb),axis=1)

In [66]:
test_input_transformed

array([[2, 0, 0, 10.5, 31.0, 0.0, 1.0, 0.0, 0.0, 1.0]], dtype=object)

In [67]:
test_input_transformed.shape

(1, 10)

In [68]:
clf.predict(test_input_transformed)

array([0])

# Titanic using Pipeline

In [69]:
df = pd.read_csv("/Users/rajatchauhan/Desktop/Machine Learning Notes/Datasets/titanic.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [70]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

# train/test/split

In [71]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

# 1. Imputation transformer

In [72]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


We can use SK learn Column Transformer

In [74]:
from sklearn.compose import ColumnTransformer

In [75]:
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

# 2. One hot encoding

In [77]:
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

# 3. Scaling

In [78]:
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

# 4. Feature selection

In [80]:
from sklearn.feature_selection import SelectKBest,chi2

In [81]:
trf4 = SelectKBest(score_func=chi2,k=8)

# 5. train the model

In [82]:
trf5 = DecisionTreeClassifier()

# Create Pipeline

In [84]:
from sklearn.pipeline import Pipeline,make_pipeline

In [102]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

# Pipeline Vs make_pipeline
Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [103]:
# Alternate Syntax
# pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [104]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [105]:
# train
pipe.fit(X_train,y_train)

In [106]:
# Predict
y_pred = pipe.predict(X_test)

In [107]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

# Cross Validation using Pipeline

In [108]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.6391214419383433

# GridSearch using Pipeline

In [109]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [110]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [111]:
grid.best_score_

0.6391214419383433

In [112]:
grid.best_params_

{'trf5__max_depth': 2}

# Exporting the Pipeline

In [116]:
import pickle
pickle.dump(pipe,open('models/pipe.pkl','wb'))

Using this object to do the predictions now

In [117]:
pipe = pickle.load(open('models/pipe.pkl', 'rb'))
pipe

In [118]:
test_input2 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [119]:
pipe.predict(test_input2)



array([0])

So, we can directly use Pipe and all the preprocessing and everything that we have done will be done automatically