In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2,f_classif
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Load the passenger training CSV straight from the course repository into a DataFrame.
df = pd.read_csv(
    'https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/refs/heads/main/day29-sklearn-pipelines/train.csv'
)

In [3]:
# Peek at five randomly chosen rows (unseeded, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S
177,178,0,1,"Isham, Miss. Ann Elizabeth",female,50.0,0,0,PC 17595,28.7125,C49,C
683,684,0,3,"Goodwin, Mr. Charles Edward",male,14.0,5,2,CA 2144,46.9,,S
388,389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q
826,827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S


In [4]:
# Remove the columns that are not used as model inputs further down.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: executing it a second time is a no-op instead of a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [18]:
# Hold out 20% of the rows for testing; 'Survived' is the target, every other
# remaining column is a feature. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Display the training feature frame (the rendered output is truncated for long frames).
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [7]:
# Two random training rows, handy for checking the column order used by the
# positional indices in the transformers below.
X_train.sample(n=2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
356,1,female,22.0,0,1,55.0,S
400,3,male,39.0,0,0,7.925,S


In [86]:
# Imputation transformer.
# Positional indices refer to the X_train column order
# (Pclass, Sex, Age, SibSp, Parch, Fare, Embarked): 2 -> Age, 6 -> Embarked.
# The transformed columns are emitted first, followed by the passthrough
# remainder, so the column order changes after this step.
imputation = ColumnTransformer(
    transformers=[
        # Fill missing ages with the mean age learned during fit.
        ('age_impite', SimpleImputer(missing_values=np.nan, strategy='mean'), [2]),
        # Fill missing embarkation ports with the most frequent value.
        ('embarked', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), [6]),
    ],
    remainder='passthrough',
)

In [96]:
# One-hot encoding.
# When used after the imputation step, the incoming layout is
# (Age, Embarked, Pclass, Sex, SibSp, Parch, Fare), so indices 1 and 3 select
# Embarked and Sex. Dense output is requested; categories unseen during fit
# are encoded as all zeros instead of raising.
ohe = ColumnTransformer(
    transformers=[
        (
            'ohe_sex_embarked',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            [1, 3],
        ),
    ],
    remainder='passthrough',
)

In [106]:
# Scaling.
# Min-max scale the first ten columns of the incoming matrix; anything beyond
# that slice is dropped (the ColumnTransformer default, spelled out here).
scale = ColumnTransformer(
    transformers=[('scale', MinMaxScaler(), slice(0, 10))],
    remainder='drop',
)

In [11]:
# Feature selection: keep the 7 features with the highest chi-squared score
# against the target.
feature_selection = SelectKBest(chi2, k=7)

In [226]:
# Final estimator. The tree breaks ties between equally good splits at random,
# so a fixed seed (same value as the train/test split) is needed for the
# accuracy, cross-validation and grid-search numbers to be reproducible.
clf = DecisionTreeClassifier(random_state=42)

In [302]:
# End-to-end pipeline; the steps run in the listed order and the step names
# are the keys used later for named_steps lookups and grid-search parameters.
pipe = Pipeline(
    steps=[
        ('imputation', imputation),               # fill missing values
        ('One Hot Encode', ohe),                  # encode categorical columns
        ('scaler', scale),                        # min-max scale all columns
        ('Chi2 Selection', feature_selection),    # keep the top-scoring features
        ('DT_algo', clf),                         # decision-tree classifier
    ]
)

In [304]:
# Fit every pipeline step, in order, on the training split.
pipe.fit(X_train,y_train)

In [306]:
## Explore the Transformer

In [308]:
# Value learned by the age imputer during fit (the mean of the training ages).
pipe['imputation'].named_transformers_['age_impite'].statistics_

array([29.49884615])

In [310]:
# Value learned by the embarked imputer during fit (the most frequent port).
pipe['imputation'].named_transformers_['embarked'].statistics_

array(['S'], dtype=object)

In [312]:
# Chi-squared score of each column that reached the feature-selection step.
pipe['Chi2 Selection'].scores_

array([1.49581186e+01, 2.42391645e-02, 3.35279982e+00, 1.37061416e+02,
       7.19058821e+01, 1.52548748e-01, 1.88138506e+01, 5.03642441e-01,
       1.20106973e+00, 6.99702081e+00])

In [314]:
# Prediction: run the held-out features through the fitted pipeline.
y_pred=pipe.predict(X_test)

In [316]:
from sklearn.metrics import accuracy_score

In [318]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7988826815642458

## Cross Validation Score

In [321]:
from sklearn.model_selection import cross_val_score

In [323]:
# 5-fold cross-validated accuracy on the training split; the whole pipeline is
# refit on every fold, and the mean of the fold scores is displayed.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
cv_scores.mean()

0.8020585048754063

## Grid Search CV

In [290]:
# Look up the classifier step. The key must match the name given when the
# pipeline was built ('DT_algo', with an underscore); 'DT algo' raises KeyError.
pipe.named_steps['DT_algo']

In [326]:
# gridsearchcv
# Search space: the '<step name>__<parameter>' key routes max_depth to the
# 'DT_algo' pipeline step; None leaves the tree depth unrestricted.
params = dict(DT_algo__max_depth=[3, 5, 7, None])

In [328]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over `params`, scoring each candidate by 5-fold
# cross-validated accuracy on the training split.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train, y_train)

In [334]:
# Best mean cross-validated accuracy found by the grid search.
grid.best_score_

0.8131882202304738

In [336]:
# Parameter combination that produced the best cross-validated score.
grid.best_params_

{'DT_algo__max_depth': 7}

## Exporting the Pipeline

In [338]:
import pickle

In [340]:
# Serialize the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
# NOTE(review): this saves `pipe` as fitted by pipe.fit(...), not the tuned
# model from the grid search (`grid.best_estimator_`) - confirm which is intended.
with open('pipe.pkl', 'wb') as pkl_file:
    pickle.dump(pipe, pkl_file)