# <center>ML Pipeline</center>
---

# Titanic with Pipelines

In [99]:
# data wrangling
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
import sklearn.tree as tree
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import sklearn.feature_selection as feature_selection
import sklearn.compose as compose
import sklearn.pipeline as pipeline
import sklearn.impute as impute
import pickle

In [15]:
data = pd.read_csv("data/train.csv")

In [16]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
data.shape

(891, 12)

In [18]:
# dropping useless features
data = data.drop(["PassengerId","Ticket","Name","Cabin"], axis = 1)

In [19]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

## Pipeline Workflow
1. **ColumnTransformer1:** Impute missing values in Age and Embarked features using one column transformer
2. **ColumnTransformer2:** Put ColumnTransformer1 as an input to the ColumnTransformer2 where we will then do OneHotEncoding of the feature **Sex** and **Embarked**
3. **ColumnTransformer3:** Put ColumnTransformer2 as an input to the ColumnTransformer3 where we will then do **Feature Scaling**
4. **ColumnTransformer4:** Put ColumnTransformer3 as an input to the ColumnTransformer4 where we will then do **Feature Selection**
5. **ColumnTransformer5:** Put ColumnTransformer4 as an input to the ColumnTransformer5 where we will train the model with **DecisionTree**.

**Note:** When using ColumnTransformer if you want to refer columns then do it using their index becuase it works even with numpy array and as a matter of fact after transforming they probably becomes numpy array

### Train Test Split

In [20]:
X = data.drop("Survived", axis = 1)
y = data["Survived"]

In [21]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.2)

In [22]:
X.shape, y.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape

((891, 7), (891,), (712, 7), (179, 7), (712,), (179,))

### ColumnTransformer1
 Impute missing values in **Age** and **Embarked** features using SimpleImputer.
 
 
 **Note:** When using ColumnTransformer if you want to refer columns then do it using their index becuase it works even with numpy array and as a matter of fact after transforming they probably becomes numpy array

In [23]:
display(X.columns)

# index of sex
display(X.columns[2])

# index of Embarked
display(X.columns[6])

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

'Age'

'Embarked'

#### Imputing Sex & Embarked with SimpleImputer

`remainder = "passthrough"`: ensures that the rest of the features are kept without any modification, if not then they would be dropped.

In [24]:
# ColumnTransformer takes in list of tuples
col_transformer1 = compose.ColumnTransformer([
    ("impute_age", impute.SimpleImputer(), [2]),
    ("imputr_embarked", impute.SimpleImputer(strategy="most_frequent"), [6])
],remainder='passthrough')

### ColumnTransformer2
OneHotEncoding of the feature **Sex** and **Embarked**

In [25]:
# one hot encoding two features at once
col_transformer2 = compose.ColumnTransformer([
    ("one_hot_enc_sex_embarked",preprocessing.OneHotEncoder(sparse_output=False,handle_unknown="ignore"), [1,6])
], remainder="passthrough")

### ColumnTransformer3

#### Feature Scaling

In [26]:
# applying on every column, hence not using remainder="passthrough"
col_transformer3 = compose.ColumnTransformer([("scale", preprocessing.MinMaxScaler(), slice(0,10))])

### ColumnTransformer4

#### Feature Selection

In [27]:
transformer4 = feature_selection.SelectKBest(score_func=feature_selection.chi2, k = 8)

### ColumnTransformer5

#### Model Training

In [55]:
transformer5 = tree.DecisionTreeClassifier()

### Create Pipeline

In [78]:
# pipeline object
pipe = pipeline.Pipeline([
    ("col_transformer1", col_transformer1),
    ("col_transformer2",col_transformer2),
    ("col_transformer3",col_transformer3),
    ("transformer4",transformer4),
    ("transformer5",transformer5)]
)

#### Pipeline vs make_pipeline
Pipeline requires naming of steps, make_pipeline do not

(Same goes to ColumnTransformer and make_column_transformer)

In [79]:
# alternate syntax
# pipe = pipeline.make_pipeline(col_transformer1, col_transformer2, col_transformer3,transformer4,transformer5)

### Train with pipelines
**Note:** 
* Since, we are using the ml model (DecisionTree) inside the pipeline we are calling .fit() to preprocess & train and to predict we would call .predict().

* If we used the pipeline only for preprocessing then would only use the .fit_transform() to preprocess and transform at the same time.

In [106]:
pipe.fit(X_train.values, y_train.values)

### Explore Pipeline

In [107]:
# see the steps
pipe.named_steps

{'col_transformer1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('imputr_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'col_transformer2': ColumnTransformer(remainder='passthrough',
                   transformers=[('one_hot_enc_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'col_transformer3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'transformer4': SelectKBest(k=8, score_func=<function chi2 at 0x125257380>),
 'transformer5': DecisionTreeClassifier()}

In [108]:
# access any transformer
pipe.named_steps["col_transformer1"]

In [109]:
# access inside that transformer
pipe.named_steps["col_transformer1"].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('imputr_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [110]:
# explore impute_age
pipe.named_steps["col_transformer1"].transformers_[0]

('impute_age', SimpleImputer(), [2])

In [111]:
pipe.named_steps["col_transformer1"].transformers_[0][1]

In [112]:
# acess the mean value used for imputation
pipe.named_steps["col_transformer1"].transformers_[0][1].statistics_

array([29.3832])

In [113]:
# acess the most_frequent value used for imputation
pipe.named_steps["col_transformer1"].transformers_[1][1].statistics_

array(['S'], dtype=object)

#### predict

In [116]:
y_pred = pipe.predict(X_test.values)

pipe.score(X_test.values,y_test.values)

0.6145251396648045

### Cross Validation using Pipeline

In [117]:
model_selection.cross_val_score(pipe,X_train, y_train, cv=5, scoring="accuracy").mean()

0.641918644735546

### GridSearchCV using Pipeline

In [118]:
# here the name "transformer5" represents the transformer5 in pipeline where we used the ml model (note it)
params = {
    "transformer5__max_depth":[1,2,3,4,5]
}

In [119]:
model_selection.GridSearchCV(estimator=pipe, param_grid=params, cv = 5, scoring="accuracy")

In [121]:
grid = model_selection.GridSearchCV(estimator=pipe, param_grid=params, cv = 5, scoring="accuracy")
grid.fit(X_train.values,y_train.values)

grid.best_score_

0.641918644735546

In [122]:
grid.best_params_

{'transformer5__max_depth': 1}

### Export Pipeline

In [123]:
pickle.dump(pipe, open("pipe.pkl","wb"))

---
### How to use the Pipeline in production

In [124]:
import numpy as np
import pandas as pd
import pickle

In [125]:
pipe = pickle.load(open("pipe.pkl","rb"))

In [126]:
pipe

In [127]:
# Assume user input
input1 = np.array([2, "male", 31, 0, 0, 10.5, "S" ]).reshape(1,7)

In [128]:
# predict
pipe.predict(input1)

array([0])