# With Pipeline

In [53]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

#### Layout of pipeline 
1. Impute Missinng values, using column transformer
2. We will send the above thing in another column transformer which will do <br>
OHE on sex and embarked.
3. Scaling using Column Transformers
4. feature selection(best 5 features.)
5. training model using DT

In [54]:
df = pd.read_csv("E://Datasets/titanic.csv")
df.drop(columns=['PassengerId','Name','Ticket',"Cabin"],inplace=True)
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [55]:
## splitting 
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [56]:
## Imputation Transformer
## while creating a pipeline make sure you are passing the column by its index not by name.
## It will work better if we pass it by index.
trf1 = ColumnTransformer([
    ("impute_age",SimpleImputer(),[2]),
    ("impute_embarked",SimpleImputer(strategy="most_frequent"),[6])
],remainder='passthrough')

In [57]:
## OHE 
trf2 = ColumnTransformer([
    ("ohe_sex_embarked",OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [58]:
## Scaling
## after OHE, there are now 10 columns, slice(0,10), means it will scale every single column in that range.

trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [59]:
## Feature selection
trf4 = SelectKBest(score_func=chi2,k=8)

In [60]:
## Train the model
trf5 = DecisionTreeClassifier()

# Create PIPELINE

In [61]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

OR ELSE WE CAN USE make_pipline too<br>
pipe = make_pipeline(trf1,trf2,trf3.trf4,trf5)

In [62]:
# train

## If we are not training a model in our pipeline then we will fit_transform.
pipe.fit(x_train,y_train)

### Some attributes.

In [63]:
## To see the steps.
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000002D997ABFC10>),
 'trf5': DecisionTreeClassifier()}

In [64]:
## to see simpleimputer and its calculated mean
## the above is just a dictionary, so we can slice indexes our way through.
## statistics_ gives us the calculated mean
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.49884615])

In [65]:
## we can use pipe.predict function because we used DT at the end of the PIPELINE.
from sklearn.metrics import accuracy_score
y_pred = pipe.predict(x_test)
accuracy_score(y_test,y_pred)

0.6256983240223464

# Cross Validation using PIPELINE

In [66]:
from sklearn.model_selection import cross_val_score

In [67]:
cross_val_score(pipe, x_train,y_train,cv=6,scoring="accuracy").mean()

0.6390471442814414

# Grid Search using PIPELINE

In [70]:
from sklearn.model_selection import GridSearchCV
params={
    "trf5__max_depth":[1,2,3,4,5,None]
}
grid = GridSearchCV(pipe,params,cv=6,scoring='accuracy')
grid.fit(x_train,y_train)
grid.best_score_

0.6390471442814414

# Exporting Pipeline
In this the main advantage is :
1. we dont have to save the models seperately for OHE, simple imputer and scaling.
2. If we want to update the code, just export and import this model. you dont have to change the main code. jst save the file. and import the file.

In [71]:
import pickle
pickle.dump(pipe,open('models/pipe.pkl',"wb"))