In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Titanic passenger table; the path is relative to the working directory.
csv_path = 'titanic.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,Sibsp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0


In [4]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(1309, 11)

In [5]:
# Remove the free-text / identifier columns that are not used as features.
# Rebinding `df` instead of mutating it in place, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed columns.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [6]:
# Spot-check five random rows (unseeded, so the rows shown differ on every run).
df.sample(n=5)

Unnamed: 0,Pclass,Sex,Age,Sibsp,Parch,Fare,Embarked,Survived
852,3,female,,0,0,7.55,S,0
545,2,female,30.0,3,0,21.0,S,1
187,1,female,16.0,0,1,39.4,S,1
874,3,male,,0,0,7.8875,S,1
228,1,male,18.0,1,0,108.9,C,0


In [7]:
# Count the missing values in each column before cleaning.
df.isna().sum()

Pclass        0
Sex           0
Age         263
Sibsp         0
Parch         0
Fare          1
Embarked      2
Survived      0
dtype: int64

In [8]:
# Discard every row that has at least one missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [9]:
# Confirm that no missing values remain after the rows were dropped.
df.isna().sum()

Pclass      0
Sex         0
Age         0
Sibsp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [10]:
# (rows, columns) left after dropping unused columns and incomplete rows.
df.shape

(1043, 8)

In [11]:
# Store Age as whole years. astype(int) truncates the fractional part, so an
# infant recorded as 0.9167 ends up with Age 0.
df = df.assign(Age=df['Age'].astype(int))

In [12]:
# Keep fares to two decimal places.
df = df.assign(Fare=df['Fare'].round(decimals=2))

In [13]:
# Spot-check five random rows of the cleaned frame (unseeded sample).
df.sample(n=5)

Unnamed: 0,Pclass,Sex,Age,Sibsp,Parch,Fare,Embarked,Survived
183,1,male,35,0,0,512.33,C,1
1254,3,male,25,0,0,0.0,S,1
429,2,male,44,0,0,13.0,S,0
875,3,male,30,0,0,7.23,C,0
610,3,female,40,1,0,9.48,S,0


In [14]:
# Number of passengers per port of embarkation, most frequent first.
df.loc[:, 'Embarked'].value_counts()

S    781
C    212
Q     50
Name: Embarked, dtype: int64

In [15]:
# Features are every column except the label; the label is `Survived`.
# Selecting by name instead of by position (iloc[:, :7] / iloc[:, -1]) keeps
# the split correct even if the column order of the frame changes, and removes
# the unexplained magic number. The relative order of the feature columns is
# preserved, so positional references into X further down are unaffected.
X = df.drop(columns='Survived')
y = df['Survived']

In [16]:
# Display the feature matrix (pandas truncates the rendered rows).
X

Unnamed: 0,Pclass,Sex,Age,Sibsp,Parch,Fare,Embarked
0,1,female,29,0,0,211.34,S
1,1,male,0,1,2,151.55,S
2,1,female,2,1,2,151.55,S
3,1,male,30,1,2,151.55,S
4,1,female,25,1,2,151.55,S
...,...,...,...,...,...,...,...
1301,3,male,45,0,0,7.22,C
1304,3,female,14,1,0,14.45,C
1306,3,male,26,0,0,7.22,C
1307,3,male,27,0,0,7.22,C


In [17]:
# Display the target vector (the `Survived` column).
y

0       1
1       1
2       0
3       0
4       0
       ..
1301    0
1304    0
1306    0
1307    0
1308    0
Name: Survived, Length: 1043, dtype: int64

In [18]:
# train-test split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.

from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, test_size=0.2, random_state=4)
X_train, X_test, y_train, y_test = splits

In [19]:
# Display the training features produced by the split.
X_train

Unnamed: 0,Pclass,Sex,Age,Sibsp,Parch,Fare,Embarked
13,1,female,26,0,0,78.85,S
1024,3,male,25,0,0,7.65,S
809,3,male,18,2,2,34.38,S
468,2,female,22,0,0,21.00,S
883,3,male,20,0,0,7.85,S
...,...,...,...,...,...,...,...
507,2,male,27,0,0,13.00,S
404,2,male,21,0,0,13.00,S
777,3,male,19,0,0,8.05,S
488,2,male,50,1,0,26.00,S


In [20]:
# Display the training labels produced by the split.
y_train

13      1
1024    0
809     0
468     0
883     0
       ..
507     0
404     0
777     1
488     0
199     1
Name: Survived, Length: 834, dtype: int64

# Creating Pipeline

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [22]:
# step 1 -> One hot encoding
#
# Columns 1 and 6 of X are `Sex` and `Embarked`. Every other column is passed
# through untouched and placed after the encoded columns in the output.
#
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, where it raises
# TypeError. Try the current name first and fall back to the old one so the
# cell runs on both older and newer installations.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

trf1 = ColumnTransformer([
    ('ohe_sex_embarked', ohe, [1, 6])
], remainder='passthrough')

In [23]:
# step 2 -> Scaling
#
# Min-max scale the first ten columns coming out of step 1. For this data that
# is all of them: 2 Sex + 3 Embarked indicator columns followed by the
# 5 passthrough columns.
trf2 = ColumnTransformer(
    transformers=[('scale', MinMaxScaler(), slice(0, 10))],
)

In [24]:
# step 3 -> Feature selection
# Keep the 8 columns with the highest chi-squared score against the target.

trf3 = SelectKBest(chi2, k=8)

In [25]:
# step 4 -> Train the model
#
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so without a fixed random_state the fitted tree (and therefore the accuracy,
# cross-validation and grid-search numbers below) can differ between runs.
trf4 = DecisionTreeClassifier(random_state=42)

In [26]:
# Joining the steps
# Chain the four stages so that fit/predict run them in this order.

pipeline_steps = [
    ('trf1', trf1),  # one-hot encoding
    ('trf2', trf2),  # scaling
    ('trf3', trf3),  # feature selection
    ('trf4', trf4),  # classifier
]
pipe = Pipeline(steps=pipeline_steps)

In [27]:
# Display Pipeline
# Ask scikit-learn to render estimators as a diagram when a cell displays them.

from sklearn import set_config
set_config(display='diagram')

In [28]:
# Fitting
# Fit every stage of the pipeline on the training split only.

pipe.fit(X=X_train, y=y_train)

In [29]:
# Predict
# Run the held-out rows through the fitted pipeline and show the predicted labels.

y_pred = pipe.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [30]:
# Accuracy of model
# Fraction of test rows whose predicted label matches the true label.

from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

0.7894736842105263

In [31]:
# cross validation using cross_val_score
# Refit the whole pipeline on each of 5 folds of the training data and report
# the mean accuracy across the folds.

from sklearn.model_selection import cross_val_score

fold_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
fold_scores.mean()

0.7794314984488854

# GridSearchCV

In [32]:
# gridsearchcv
# Tune only the depth of the tree (the pipeline step named 'trf4'); None lets
# the tree grow without a depth limit.

from sklearn.model_selection import GridSearchCV

params = {'trf4__max_depth': [*range(1, 6), None]}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

grid.fit(X_train, y_train)

In [33]:
# Mean cross-validated accuracy of the best parameter combination.
grid.best_score_

0.7818194935430344

In [34]:
# Parameter setting that achieved the best cross-validated score.
grid.best_params_

{'trf4__max_depth': 2}