In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.tree import DecisionTreeClassifier

In [8]:
# Load the Titanic passenger table from the working directory and preview it.
csv_path = "titanic.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# List the column labels of the raw frame before any columns are dropped.
df.columns

Index(['Unnamed: 0', 'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age',
       'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
# Drop the identifier / free-text columns before building the feature matrix.
# Rebinding `df` (rather than inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises KeyError.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [12]:
# Preview two rows to confirm the dropped columns are gone.
df.head(2)

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,0,3,male,22.0,1,0,7.25,S
1,1,1,1,female,38.0,1,0,71.2833,C


In [91]:
# Features: every column from position 2 onward, which skips the leftover
# "Unnamed: 0" column and the "Survived" target.
X = df.iloc[:, 2:]
# Target: the "Survived" column.
y = df["Survived"]

In [19]:
# Train/test split: hold out 20% of the rows, with a fixed seed for reproducibility.
# Reuse X and y from the cell above instead of re-deriving them from df, so
# the split and the later cross-validation / grid search share a single
# definition of features and target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [111]:
# Peek at the held-out features; Age can be missing, which the pipeline must impute.
X_test.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,male,,1,1,15.2458,C
439,2,male,31.0,0,0,10.5,S
840,3,male,20.0,0,0,7.925,S
720,2,female,6.0,0,1,33.0,S
39,3,female,14.0,1,0,11.2417,C


In [112]:
# Matching target values for the same held-out rows (same index labels as X_test).
y_test.head(5)

709    1
439    0
840    0
720    1
39     1
Name: Survived, dtype: int64

## Creating Pipeline

In [73]:
# Step 1: fill missing values.
# Column positions refer to X: 2 = Age, 6 = Embarked. All other columns are
# passed through untouched and placed after the two imputed columns.
age_imputer = SimpleImputer()  # default strategy
embarked_imputer = SimpleImputer(strategy="most_frequent")
trf1 = ColumnTransformer(
    transformers=[
        ("si_age", age_imputer, [2]),
        ("si_Embark", embarked_imputer, [6]),
    ],
    remainder="passthrough",
)

In [74]:
# Step 2: one-hot encode the categorical columns.
# Positions refer to trf1's output order (imputed Age, imputed Embarked, then
# the passthrough columns), so 1 = Embarked and 3 = Sex.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - confirm against the installed version before upgrading.
trf2 = ColumnTransformer(
    transformers=[
        ("ohe_emb", OneHotEncoder(handle_unknown='ignore', sparse=False), [1]),
        ("ohe_sex", OneHotEncoder(handle_unknown='ignore', sparse=False), [3]),
    ],
    remainder="passthrough",
)

In [75]:
# Step 3: min-max scale the first 10 columns; chi2 in the next step needs
# non-negative inputs.
# NOTE(review): 10 presumably equals the total column count after one-hot
# encoding - confirm if the categorical levels ever change.
trf3 = ColumnTransformer(
    transformers=[("min_max", MinMaxScaler(), slice(10))],
    remainder="passthrough",
)

In [76]:
# Step 4: keep the 8 features with the highest chi-squared score against the target.
trf4 = SelectKBest(chi2, k=8)

In [77]:
# Step 5: the classifier. random_state is fixed (same seed as the train/test
# split) because the tree permutes features at each split; without it, repeated
# fits can produce different trees and different scores.
trf5 = DecisionTreeClassifier(random_state=42)

## Assembling the Pipeline

In [78]:
# Chain the steps in execution order: impute -> encode -> scale -> select -> classify.
pipeline_steps = [
    ("trf1", trf1),
    ("trf2", trf2),
    ("trf3", trf3),
    ("trf4", trf4),
    ("trf5", trf5),
]
pipe = Pipeline(steps=pipeline_steps)

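- `make_pipeline` and `Pipeline` work on the same basis: both build a scikit-learn `Pipeline` object.
- `Pipeline` takes explicit step names, which makes the displayed pipeline easier to read than the auto-generated names from `make_pipeline`.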

In [81]:
# Show estimators as diagrams when they are displayed in the notebook.
from sklearn import set_config
set_config(display='diagram')

In [82]:
# Training: fit every pipeline step on the training split only.

pipe.fit(X_train,y_train)

In [80]:
# Predict labels for the held-out rows.
y_pred = pipe.predict(X_test)

from sklearn.metrics import confusion_matrix
# The signature is confusion_matrix(y_true, y_pred). The original call passed
# the arguments the other way round, which transposes the matrix (false
# positives and false negatives swap places). With the correct order, rows are
# true labels and columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[89, 16],
       [21, 53]], dtype=int64)

In [90]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7932960893854749

## Explore Pipeline

In [83]:
# Dictionary-like view of the fitted pipeline, keyed by the step names given above.
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('si_age', SimpleImputer(), [2]),
                                 ('si_Embark',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_emb',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1]),
                                 ('ohe_sex',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [3])]),
 'trf3': ColumnTransformer(remainder='passthrough',
                   transformers=[('min_max', MinMaxScaler(),
                                  slice(None, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function c

In [85]:
# Fitted (name, transformer, columns) triples of step 1, including the passthrough remainder.
pipe.named_steps["trf1"].transformers_

[('si_age', SimpleImputer(), [2]),
 ('si_Embark', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [89]:
# transformers_[0] is the "si_age" entry; [1] picks its fitted SimpleImputer.
pipe.named_steps["trf1"].transformers_[0][1].statistics_  # fill value learned for Age (the column mean)

array([29.49884615])

### Using cross_val_score for a more reliable accuracy estimate

In [93]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation over the full data set; the whole pipeline is
# passed in, so it is fitted afresh for each fold.
cv_scores = cross_val_score(pipe, X, y, cv=10, scoring='accuracy')
cv_scores.mean()

0.800287141073658

### GridSearchCV using Pipeline

In [None]:
# Scratch cell (never executed): presumably kept to look up the tree's tunable
# hyperparameters for the grid below - TODO remove if unused.
DecisionTreeClassifier()

In [96]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths: 2 through 10, plus None (no depth limit).
# The "trf5__" prefix routes the parameter to the pipeline step named "trf5".
depth_candidates = [*range(2, 11), None]
parm = {"trf5__max_depth": depth_candidates}
grid = GridSearchCV(estimator=pipe, param_grid=parm, scoring='accuracy', cv=5)

grid.fit(X, y)

In [98]:
# Parameter combination with the highest mean cross-validated accuracy.
grid.best_params_

{'trf5__max_depth': 7}

In [99]:
# Mean cross-validated accuracy achieved by the best parameter combination.
grid.best_score_

0.8136840123030569

# Using the Best Parameters to Build a New Pipeline

In [100]:
# Step 1 (final pipeline): impute missing values.
# Positions refer to X: 2 = Age, 6 = Embarked; everything else passes through.
imputation_specs = [
    ("si_age", SimpleImputer(), [2]),
    ("si_Embark", SimpleImputer(strategy="most_frequent"), [6]),
]
trf1 = ColumnTransformer(transformers=imputation_specs, remainder="passthrough")

In [101]:
# Step 2 (final pipeline): one-hot encode the categorical columns.
# Positions refer to trf1's output order, where 1 = Embarked and 3 = Sex.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - confirm against the installed version before upgrading.
encoding_specs = [
    ("ohe_emb", OneHotEncoder(handle_unknown='ignore', sparse=False), [1]),
    ("ohe_sex", OneHotEncoder(handle_unknown='ignore', sparse=False), [3]),
]
trf2 = ColumnTransformer(transformers=encoding_specs, remainder="passthrough")

In [102]:
# Step 3 (final pipeline): min-max scaling, needed because the chi2 scorer
# used by SelectKBest requires non-negative feature values.
# NOTE(review): 10 presumably equals the total column count after one-hot
# encoding - confirm if the categorical levels ever change.
scaling_specs = [("min_max", MinMaxScaler(), slice(10))]
trf3 = ColumnTransformer(transformers=scaling_specs, remainder="passthrough")

In [103]:
# Step 4 (final pipeline): keep the 8 features ranked highest by the chi-squared score.
trf4 = SelectKBest(chi2, k=8)

In [104]:
# Final classifier: read max_depth from the grid search result instead of
# hard-coding 7, so this cell cannot drift from grid.best_params_ when the
# search is re-run. random_state makes the fitted tree reproducible.
trf5 = DecisionTreeClassifier(
    max_depth=grid.best_params_["trf5__max_depth"],
    random_state=42,
)

In [105]:
# Assemble the final pipeline from the freshly built steps, in execution order.
final_steps = [
    ("trf1", trf1),
    ("trf2", trf2),
    ("trf3", trf3),
    ("trf4", trf4),
    ("trf5", trf5),
]
pipe = Pipeline(steps=final_steps)

In [106]:
# Train the final pipeline on the training split before exporting it.
pipe.fit(X_train,y_train)

In [107]:
# Export the fitted pipeline to disk with pickle.
import pickle

# The context manager closes the file handle even if pickling fails; the
# original passed open(...) inline and never closed it.
# NOTE(review): assumes the "Model" directory already exists - confirm.
with open("Model/pipeline.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)