In [86]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest , chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler


In [87]:
df = pd.read_csv('train.csv')

In [88]:
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [89]:
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'])
df.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S


In [90]:
X_train, X_test, y_train, y_test = train_test_split( df.drop(columns=['Age','Survived']), df['Survived'], test_size=0.2)


In [91]:
print(X_train.columns)
for i in range(len(X_train.columns)):
    print(i,',',end='')


Index(['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
0 ,1 ,2 ,3 ,4 ,5 ,

In [92]:
X_train.head(3)

# X_train.columns.get_loc('Embarked') 

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked
438,1,male,1,4,263.0,S
340,2,male,1,1,26.0,S
135,2,male,0,0,15.0458,C


In [93]:
# simple imputer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[5])
],remainder='passthrough')

In [94]:
# dft = trf1.fit_transform(X_train)

## note that after applying passthorugh it changes the coulmn 

In [95]:
# trf2.fit_transform(dft)
# pd.DataFrame(trf2.fit_transform(dft))
# dft[0:3]

In [96]:
# one hot encoding
# trf2 = ColumnTransformer([
#     ('ohe_sex',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6]),
# ],remainder='passthrough')
trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 3])  # OneHotEncode 'Sex' and 'Embarked'
], remainder='passthrough')


In [97]:
# scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10)),
],remainder='passthrough')


In [98]:
# feature selection
trf4 = SelectKBest(score_func=chi2,k=5)

In [99]:
# train the model
trf5 =  DecisionTreeClassifier()

In [100]:
# creating pipeline

In [101]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [102]:
# y_train.info()
pipe.fit(X_train,y_train)


In [46]:
print(X_train.info())


<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 254 to 68
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   SibSp     712 non-null    int64  
 3   Parch     712 non-null    int64  
 4   Fare      712 non-null    float64
 5   Embarked  710 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 38.9+ KB
None


In [107]:
y_pred = pipe.predict(X_test)


In [108]:
y_pred

array([0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [109]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8156424581005587