In [35]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score



In [3]:
df = pd.read_csv("train.csv")

In [5]:
df.drop(["PassengerId","Name","Ticket","Cabin"],axis = 1,inplace = True)

In [7]:
#Step 1 train_test_split
X_train, X_test,y_train,y_test = train_test_split(df.drop("Survived", axis = 1),df["Survived"],test_size = 0.2, random_state=42)

In [11]:
# Imputation transformer
trf1 = ColumnTransformer([
    ("impute_age",SimpleImputer(),[2]),
    ("impute_embarked", SimpleImputer(strategy ="most_frequent"),[6])
], remainder = "passthrough")

In [12]:
trf2 = ColumnTransformer([
    ("ohe_sex_embarked", OneHotEncoder(sparse_output = False, handle_unknown = "ignore"),[1,6])
], remainder = "passthrough")

In [18]:
#Scaling
trf3 = ColumnTransformer([
    ("scale", MinMaxScaler(),slice(0,10))
])

In [21]:
# Feature Selection 

trf4 = SelectKBest(score_func =chi2, k =8)

In [22]:
# model_training
trf5 = DecisionTreeClassifier()

## CREATE PIPELINE

In [23]:
pipe = Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",trf4),
    ("trf5",trf5)    
])

In [24]:
#make_pipeline
# #Alternate_syntax
# pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [25]:
# train
pipe.fit(X_train, y_train)

In [26]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x00000245CD5DFE20>),
 'trf5': DecisionTreeClassifier()}

In [32]:
pipe.named_steps["trf1"].transformers_[0][1].statistics_

array([29.49884615])

In [33]:
pipe.named_steps["trf1"].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [34]:
# Prediction
y_pred = pipe.predict(X_test)
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [36]:
print("accuracy: ", accuracy_score(y_test,y_pred))

accuracy:  0.6256983240223464


## Cross Validation Using Pipeline


In [37]:
from sklearn.model_selection import cross_val_score
print("cross_val_score:",  cross_val_score(pipe,X_train, y_train, cv= 5, scoring ="accuracy").mean())

cross_val_score: 0.6391214419383433


## Grid search using Pipeline

In [38]:
## grid_search
params = {
    "trf5__max_depth":[1,2,3,4,5,None]
}

In [39]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv = 5, scoring = "accuracy")
grid.fit(X_train,y_train)

In [40]:
grid.best_score_

0.6391214419383433

In [41]:
grid.best_params_

{'trf5__max_depth': 2}

## Exporting the pipeline

In [43]:
#Export 
import pickle
pickle.dump(pipe, open("pipe.pkl","wb"))