<a href="https://colab.research.google.com/github/rajjjxd/ML-Tutorials/blob/main/titanic_with_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the training data; 'train.csv' is resolved relative to the kernel's
# working directory, so the file must be present there before this cell runs.
df = pd.read_csv('train.csv')

In [None]:
# Step 1 -> separate the label from the features, then hold out 20% of the
# rows for evaluation (seeded so the split is the same on every run)
features = df.drop(columns=['Survived'])
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Peek at the first two training rows; the column order shown here is what
# the positional indices in the imputation transformer refer to.
X_train.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,332,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,734,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S


In [None]:
# Count missing values per column to see which features need imputation
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Stage 1 - imputation. Positions refer to the columns of X_train.
trf1 = ColumnTransformer(
    transformers=[
        # column 4 is Age: SimpleImputer's default strategy (mean) is used
        ('impute_age', SimpleImputer(), [4]),
        # column 10 is Embarked: gaps are replaced by the most frequent value
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [10]),
    ],
    # every column not listed above is forwarded unchanged
    remainder='passthrough',
)

In [None]:
# One-hot encode the categorical columns Sex and Embarked.
#
# trf1 is a ColumnTransformer, so the matrix reaching this step is reordered:
# its transformed columns come first (in transformer order) and the
# passthrough remainder follows in original order. For X_train that gives:
#   0 Age, 1 Embarked, 2 PassengerId, 3 Pclass, 4 Name, 5 Sex,
#   6 SibSp, 7 Parch, 8 Ticket, 9 Fare, 10 Cabin
# The indices must therefore be 5 (Sex) and 1 (Embarked); the earlier [3,10]
# encoded Pclass and Cabin instead.
trf2 = ColumnTransformer([
    # The encoder keeps its default output format: the `sparse` keyword was
    # renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so
    # neither spelling works on every version. Dense output is requested on
    # the ColumnTransformer instead (sparse_threshold=0 below).
    ('ohe_sex_embarked',OneHotEncoder(handle_unknown='ignore'),[5,1]),
    # Numeric features forwarded as-is: Age, Pclass, SibSp, Parch, Fare
    ('keep_numeric','passthrough',[0,3,6,7,9])
],
    # PassengerId, Name, Ticket and Cabin are identifier / free-text columns
    # that the following MinMaxScaler step cannot convert to numbers.
    remainder='drop',
    # 0 means "always return a dense array", whatever the encoder emits
    sparse_threshold=0)

In [None]:
# Stage 3 - min-max scaling.
# Only the columns at positions 0-13 of the incoming matrix are scaled. No
# `remainder` is given, so ColumnTransformer's default ('drop') applies to
# any column beyond that slice.
trf3 = ColumnTransformer(
    transformers=[
        ('scale', MinMaxScaler(), slice(0, 14)),
    ],
)

In [None]:
# Stage 4 - univariate feature selection: keep the 8 features with the
# highest chi-squared score against the target
trf4 = SelectKBest(k=8, score_func=chi2)

In [None]:
# Final estimator. It is only instantiated here; the actual training happens
# when the pipeline's fit() is called.
# random_state is fixed (same seed as the train/test split) so that a fresh
# "Restart & Run All" reproduces the same tree and the same accuracy.
trf5 = DecisionTreeClassifier(random_state=42)

**CREATING THE PIPELINE**

In [None]:
# Chain the five stages; each step receives the output of the previous one
pipeline_steps = [
    ('trf1', trf1),  # imputation
    ('trf2', trf2),  # one-hot encoding
    ('trf3', trf3),  # scaling
    ('trf4', trf4),  # feature selection
    ('trf5', trf5),  # decision tree (final estimator)
]
pipe = Pipeline(steps=pipeline_steps)

# Fit every transformer and the final model on the training split only
pipe.fit(X_train, y_train)






In [None]:
# Inspect the fitted pipeline: maps each step name to its estimator object
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [4]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [10])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [3, 10])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 14, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x7db3d54e7370>),
 'trf5': DecisionTreeClassifier()}

In [None]:
# Predict on the held-out split; the pipeline applies every fitted
# preprocessing step to X_test before calling the final estimator.
y_pred = pipe.predict(X_test)

In [None]:
# Share of held-out rows whose Survived label was predicted correctly.
# accuracy_score is imported in the import cell at the top of the notebook.
accuracy_score(y_test,y_pred)

0.7039106145251397