In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
df = pd.read_csv("train.csv")
df.head()

In [None]:
df.drop(columns = ['Name', 'PassengerId', 'Cabin', 'Ticket'], inplace = True)

In [None]:
df

In [None]:
# x = df.iloc[:, 1:].values
# y = df.iloc[:, 0].values

# x, y

In [None]:
# X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=.25, random_state=2)
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns=['Survived']),
                                                    df['Survived'], test_size=.25, random_state=2)
X_train.head()

In [None]:
# transform the data using transformer

<h3>Imputation</h3>

In [254]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [None]:
# for age and Embarked

In [None]:
# imputation transformer

trf1 = ColumnTransformer([
    ('age_imputer', SimpleImputer(), [2]),
    ('embark_imputer', SimpleImputer(strategy="most_frequent"), [6])
], remainder="passthrough")

In [None]:
# one hot encoder

# trf2 = ColumnTransformer([
#     ('sex_embarked_ohe', OneHotEncoder(sparse=False, unknown_handle="ignore"), [1, 6])
# ], remainder="passthrough")

trf2 = ColumnTransformer([
    ('sex_embarked_ohe',OneHotEncoder(sparse_output=False, handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [258]:
# scaling

trf3 = ColumnTransformer([
    ('min-max-scaler', MinMaxScaler(), slice(0, 10))
])

In [260]:
# feature selection: optinal

trf4 = SelectKBest(score_func=chi2, k=8)

In [262]:
# applying machine learning classification

trf5 = DecisionTreeClassifier()

<h2>Pipeline</h2>

In [264]:
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [None]:
# with class

# pipe2 = Pipeline([
#     ('trf1', trf1),
#     ('trf2', trf2),
#     ('trf3', trf3),
#     ('trf4', trf4),
#     ('trf5', trf5)
# ])

In [266]:
pipe

In [None]:
X_train

In [290]:
pipe.fit(X_train, Y_train)

In [274]:
pipe.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0])

In [276]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('age_imputer', SimpleImputer(), [2]),
                                 ('embark_imputer',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('sex_embarked_ohe',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('min-max-scaler', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x1694bbba0>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

<h2>Deployment</h2>

In [None]:
import pickle

In [292]:
pickle.dump(pipe, open("pipeline.pkl", "wb"))

In [294]:
pipe_picked = pickle.load(open('pipeline.pkl', 'rb'))

In [302]:
y_pred = pipe_picked.predict(X_test)

In [304]:
from sklearn.metrics import accuracy_score

In [306]:
accuracy_score(Y_test, y_pred)

0.6457399103139013