In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline

excel_file_path = "../EDA/Titanic_EDA/train.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# preparing training and test data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df.drop(["Survived", "Cabin", "PassengerId", "Name", "Ticket"], axis=1),
    df["Survived"],
    test_size=0.3,
)
x_train.shape, x_test.shape

((623, 7), (268, 7))

In [3]:
x_train.isnull().sum()

Pclass        0
Sex           0
Age         128
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [4]:
sorted(df["Pclass"].unique())

[1, 2, 3]

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

trf1 = ColumnTransformer(
    [
        ("tnf1", SimpleImputer(), [2]),
        ("tnf2", SimpleImputer(strategy="most_frequent"), [6]),
    ],
    remainder="passthrough",
    verbose=True,
)

In [6]:
trf3 = ColumnTransformer(
    [("tnf3", OneHotEncoder(sparse=False, handle_unknown="ignore"), [1, 6])],
    remainder="passthrough",
    verbose=True,
)

trf4 = ColumnTransformer(
    [
        (
            "tnf4",
            OrdinalEncoder(categories=[sorted(df["Pclass"].unique()) + [0.0]]),
            [0],
        ),
    ],
    remainder="passthrough",
    verbose=True,
)

In [7]:
trf5 = ColumnTransformer([("scale", MinMaxScaler(), slice(0, 10))])
trf6 = SelectKBest(score_func=chi2, k=8)
trf7 = DecisionTreeClassifier()

In [8]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ("tnf1", trf1),
        ("tnf3", trf3),
        ("tnf4", trf4),
        ("tnf5", trf5),
        ("tnf6", trf6),
        ("tnf7", trf7),
    ]
)
pipe.fit(x_train, y_train)

[ColumnTransformer] .......... (1 of 3) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 3) Processing tnf2, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf3, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf4, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s




In [9]:
pipe.named_steps["tnf1"].transformers_[0][1].statistics_
pipe.named_steps["tnf1"].transformers_[1][1].statistics_
# pipe.named_steps['tnf7'].transformers_.decision_path

array(['S'], dtype=object)

In [10]:
from sklearn.metrics import accuracy_score

y_pred = pipe.predict(x_test)
accuracy_score(y_test, y_pred)

0.6380597014925373

In [11]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipe, x_train, y_train, cv=5, scoring="accuracy").mean()

[ColumnTransformer] .......... (1 of 3) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 3) Processing tnf2, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf3, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf4, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 3) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 3) Processing tnf2, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf3, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf4, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s




[ColumnTransformer] .......... (1 of 3) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 3) Processing tnf2, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf3, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf4, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 3) Processing tnf1, total=   0.0s
[ColumnTransformer] .......... (2 of 3) Processing tnf2, total=   0.0s
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf3, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[ColumnTransformer] .......... (1 of 2) Processing tnf4, total=   0.0s
[ColumnTransformer] ..... (2 of 2) Processing remainder, total=   0.0s
[Colum



0.6339354838709677

In [12]:
import pickle
import numpy as np

pickle.dump(pipe, open("pipe.pkl", "wb"))
pipe = pickle.load(open("pipe.pkl", "rb"))
test_input = np.array([2, "female", 2, 0, 0, 10.5, "C"], dtype=object).reshape(1, 7)
pipe.predict(test_input)



array([1], dtype=int64)