<a href="https://colab.research.google.com/github/rocket0l4/About-ML-Pipelines/blob/main/ML_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2


In [2]:
# Load the Titanic training data from the mounted Google Drive folder.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/train.csv')

In [3]:
# Peek at five randomly chosen rows of the raw data.
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
190,191,1,2,"Pinsky, Mrs. (Rosa)",female,32.0,0,0,234604,13.0,,S
562,563,0,2,"Norman, Mr. Robert Douglas",male,28.0,0,0,218629,13.5,,S
656,657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S
768,769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q
838,839,1,3,"Chip, Mr. Chang",male,32.0,0,0,1601,56.4958,,S


In [4]:
# Drop identifier / free-text columns that are not used as features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError on missing columns.
df = df.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'], errors='ignore')

In [5]:
# First five rows after removing the unused columns.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
# Train/test split: hold out 20% of the rows for evaluation.
# 'Survived' is the target; every other remaining column is a feature.
# The fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Show the first two training rows (useful for reading off column positions).
X_train.head(n=2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [8]:
# Five random target values from the training split.
y_train.sample(n=5)

Unnamed: 0,Survived
693,0
184,1
414,1
676,0
203,0


In [None]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


# Filling missing values in the dataset with an imputation transformer

- Here we refer to the columns by their positional index instead of by name.
For example:
              - Age is at index 2.
              - Embarked is at index 6.
              The indices of the other columns can be read from the table above.

In [12]:
# Imputation step: columns are addressed by their position in X_train.

trf1 = ColumnTransformer(
    transformers=[
        # Age (position 2): fill missing values with the column mean.
        ('age_impute', SimpleImputer(strategy='mean'), [2]),
        # Embarked (position 6): fill missing values with the most frequent value.
        ('embarked_impute', SimpleImputer(strategy='most_frequent'), [6]),
    ],
    remainder='passthrough',
)

# The transformed columns come first in the output and the passthrough columns
# follow, so the positions change: Age becomes 0 and Embarked becomes 1.

In [51]:
# One-hot encode Embarked and Sex.
# The positions refer to the output of trf1, where Embarked is column 1 and
# Sex is column 3 (Age and Embarked were moved to the front by the imputer step).
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4; a dense array is required because the
# next step applies StandardScaler with mean centering.

trf2 = ColumnTransformer([
    ('sex_embarked',
     OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
     [1, 3])
], remainder='passthrough')

In [52]:
# Scaling: standardise every column produced by the one-hot encoding step.
# slice(0, 10) is meant to cover all columns at this point -- presumably
# 2 (Sex) + 3 (Embarked) one-hot columns plus 5 passthrough columns;
# re-check the upper bound if the feature set changes.

trf3 = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), slice(0, 10)),
    ]
)

In [53]:
# Final estimator of the pipeline (it is only instantiated here; training
# happens when the pipeline is fitted).
# A fixed random_state makes the fitted tree, and therefore the accuracy and
# grid-search results, reproducible across runs.
trf4 = DecisionTreeClassifier(random_state=42)

# Creating the Pipeline

In [54]:
# Chain the steps: impute -> one-hot encode -> scale -> classify.
# The explicit step names are used later to inspect the fitted pipeline and to
# address hyper-parameters in the grid search.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),
        ('trf2', trf2),
        ('trf3', trf3),
        ('trf4', trf4),
    ]
)

# make_pipeline vs Pipeline

- Pipeline requires the steps to be named; make_pipeline does not.

- The same applies to ColumnTransformer vs make_column_transformer.

            For ColumnTransformer we need to pass three items per transformer: ('name', transformer(), [index]).

            For make_column_transformer we need to pass only two items per transformer: (transformer(), [index]).

In [None]:
# Alternate syntax: make_pipeline generates the step names automatically
# ('columntransformer-1', ..., 'decisiontreeclassifier').
# It is bound to a separate name so that `pipe` keeps its explicit step names;
# the later cells rely on them (named_steps['trf1'], 'trf4__max_depth') and
# would fail if `pipe` were overwritten here.
pipe_alt = make_pipeline(trf1, trf2, trf3, trf4)

In [55]:
# Train: fit every preprocessing step and the classifier on the training split.
pipe.fit(X=X_train, y=y_train)



# Explore the Pipeline

- This code is used for backtracking and debugging.

In [56]:
# Fill value learned by the Embarked imputer inside the first pipeline step.
pipe.named_steps['trf1'].named_transformers_['embarked_impute'].statistics_

array(['S'], dtype=object)

In [57]:

from sklearn import set_config

# Render estimators as interactive diagrams instead of plain-text reprs.
set_config(display='diagram')

In [58]:
# Predict on the held-out rows; the pipeline applies the same preprocessing
# that was fitted on the training data before calling the classifier.
y_pred = pipe.predict(X=X_test)
y_pred

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1])

In [59]:
from sklearn.metrics import accuracy_score

# Fraction of held-out passengers whose label was predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.776536312849162

# GridSearch using Pipeline

In [65]:
# Grid-search space. Keys use the '<step name>__<parameter>' convention, so
# 'trf4__max_depth' targets max_depth of the pipeline step named 'trf4'.
params = dict(trf4__max_depth=[*range(1, 6), None])

In [66]:
from sklearn.model_selection import GridSearchCV

# 4-fold cross-validated search over `params`, scored by accuracy.
# The whole pipeline is the estimator, so preprocessing is re-fitted per fold.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=4,
    scoring='accuracy',
)
grid.fit(X=X_train, y=y_train)



In [67]:
# Best cross-validated score (accuracy, per the `scoring` argument) found by the search.
grid.best_score_

0.824438202247191

In [68]:
# Parameter combination from `params` that achieved the best score.
grid.best_params_

{'trf4__max_depth': 3}

# Exporting the Pipeline
- We export the pipeline so that it can be used in production.
- The pipeline is dumped to the file pipe.pkl, which the production code then loads.


In [69]:
# Export the fitted pipeline to disk.
import pickle

# The context manager guarantees the file is flushed and closed, even if
# pickling raises; the bare open() used before leaked the file handle.
with open('pipe.pkl', 'wb') as pkl_file:
    pickle.dump(pipe, pkl_file)

# Production file

- The production code stays the same; no change is required there.
- If something does change, the change only happens in the pipe.pkl file created above.

In [70]:
# Load the exported pipeline back from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
# The context manager closes the file handle once loading is done.
with open('pipe.pkl', 'rb') as pkl_file:
    pipe = pickle.load(pkl_file)

In [72]:
# Simulated user input: a single passenger as one row with seven columns in the
# training order (Pclass, Sex, Age, SibSp, Parch, Fare, Embarked).
# dtype=object keeps the numbers and strings as they are instead of coercing
# everything to strings.
test_input2 = np.array([[3, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [73]:
# Run the raw row through the loaded pipeline (preprocessing + classifier).
pipe.predict(X=test_input2)



array([1])