# Processing with pipeline 

## Pipeline Vs make_pipeline:
    Pipeline requires you to name each step explicitly; make_pipeline does not require step names and generates them automatically.
    
    (The same applies to ColumnTransformer vs make_column_transformer.)
    

In [1]:
# Processing with pipeline 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

from sklearn import set_config

import pickle

def read_csv(file_name):
    """Load a CSV file and return it without the columns the model does not use.

    Args:
        file_name: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A DataFrame with the PassengerId, Name, Ticket and Cabin columns removed.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    # Chain the drop so no intermediate frame is mutated in place.
    return pd.read_csv(file_name).drop(columns=unused_columns)

def split_data(data, test_size=0.3, random_state=1, target='Survived'):
    """Split a DataFrame into train/test feature frames and label series.

    Args:
        data: DataFrame containing the feature columns and the ``target`` column.
        test_size: Fraction (or absolute count) of rows held out for testing;
            forwarded to ``train_test_split``. Defaults to 0.3.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 1.
        target: Name of the label column. Defaults to 'Survived'.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # train_test_split is already imported at the top of the notebook, so it is
    # not re-imported here.
    features = data.drop(columns=[target])
    labels = data[target]
    X_train, X_test, Y_train, Y_test = train_test_split(features,
                                                        labels,
                                                        test_size=test_size,
                                                        random_state=random_state)
    return X_train,X_test,Y_train,Y_test


def _dense_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new keyword is tried first and the old one
    is used only when the installed release does not know the new one.
    """
    try:
        return OneHotEncoder(sparse_output=False,handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False,handle_unknown='ignore')


# Build, fit, evaluate, tune and pickle the
# impute -> one-hot encode -> scale -> select features -> decision tree pipeline.
def transform_data(X_train,X_test):
    """Fit the preprocessing + model pipeline, report metrics and save it to ``pipe.pkl``.

    The pipeline addresses columns by position, so ``X_train`` / ``X_test`` are
    expected in the order Pclass, Sex, Age, SibSp, Parch, Fare, Embarked.

    NOTE: the labels are not parameters. ``Y_train`` and ``Y_test`` are read
    from module scope and must be defined before this function is called.

    Args:
        X_train: Training features (DataFrame or 2-D array with 7 columns).
        X_test: Hold-out features with the same column layout.

    Side effects:
        Prints the fitted pipeline, the learned Age imputation statistic, the
        named steps, the hold-out accuracy, the mean 5-fold cross-validation
        accuracy and the grid-search best score/params. Writes ``pipe.pkl`` to
        the current working directory; the pickled object is ``pipe`` itself,
        not ``grid.best_estimator_``.

    Returns:
        None
    """
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import GridSearchCV, cross_val_score

    # Step 1: impute Age (index 2, mean) and Embarked (index 6, most frequent).
    # remainder='passthrough' keeps the columns that are not listed instead of
    # dropping them.
    t1 = ColumnTransformer([('si_age',SimpleImputer(),[2]),
                            ('si_embarked',SimpleImputer(strategy='most_frequent'),[6])
                           ],remainder='passthrough')

    # Step 2: one-hot encode Sex and Embarked.
    # A ColumnTransformer emits the transformed columns first and the
    # passthrough columns afterwards, so the output of t1 is ordered
    # [Age, Embarked, Pclass, Sex, SibSp, Parch, Fare]: Embarked is now at
    # index 1 and Sex at index 3. (The previous [1,6] encoded Embarked and Fare
    # and left Sex as a raw string.)
    # handle_unknown='ignore' encodes a category unseen during fit as all zeros
    # instead of raising.
    t2 = ColumnTransformer([('ohe_sex_embarked',_dense_one_hot_encoder(),[1,3])
                           ],remainder='passthrough')

    # Step 3: scale the first 10 columns to [0, 1]. No remainder is given, so
    # any column beyond index 9 is dropped (ColumnTransformer default).
    # Presumably 10 = 3 Embarked dummies + 2 Sex dummies + 5 numeric columns;
    # TODO confirm against the category counts in the training data.
    t3 = ColumnTransformer([('scale',MinMaxScaler(),slice(0,10))
                           ])
    # Step 4: keep the 8 features with the highest chi-squared score.
    t4 = SelectKBest(score_func=chi2,k=8)
    # Step 5: the classifier.
    t5 = DecisionTreeClassifier()

    # Create the pipeline; the step names are used below ('t1', 't5__max_depth').
    pipe = Pipeline([('t1',t1),
                     ('t2',t2),
                     ('t3',t3),
                     ('t4',t4),
                     ('t5',t5)
                    ])
    # Alternate syntax without explicit names:
    #pipe = make_pipeline(t1,t2,t3,t4,t5)

    # Train (fit returns the pipeline, so printing shows its configuration).
    print(pipe.fit(X_train,Y_train))
    print("********************************************************************************************************")

    # Inspect one step and the value learned by the Age imputer.
    print(pipe.named_steps['t1'])
    print(pipe.named_steps['t1'].transformers_[0][1].statistics_)
    print("********************************************************************************************************")

    # Inspect all the steps.
    print(pipe.named_steps)
    print("********************************************************************************************************")

    # Predict on the hold-out set and report accuracy.
    Y_pred = pipe.predict(X_test)
    print("accuraccy : ",accuracy_score(Y_test,Y_pred))

    # Cross-validate the whole pipeline; the mean was previously computed and
    # thrown away, so report it.
    cv_accuracy = cross_val_score(pipe, X_train, Y_train, cv=5, scoring='accuracy').mean()
    print("cross_val_score mean : ",cv_accuracy)

    # Grid search over the tree depth; '<step name>__<param>' addresses a
    # parameter of a pipeline step.
    params = {'t5__max_depth':[1,2,3,4,5,None]}
    grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
    grid.fit(X_train, Y_train)

    print("grid.best_score_ :" ,grid.best_score_)
    print("grid.best_params_ : ", grid.best_params_ )

    # Export the fitted pipeline to pipe.pkl; the context manager guarantees the
    # file is flushed and closed.
    with open('pipe.pkl','wb') as model_file:
        pickle.dump(pipe,model_file)
    
    
# Load train.csv (read_csv drops the columns the model does not use).
data = read_csv("train.csv")
# NOTE: transform_data reads Y_train and Y_test from module scope, so these
# variable names must be kept as they are.
X_train,X_test,Y_train,Y_test = split_data(data)
# Fit and evaluate the pipeline; this also writes pipe.pkl to the working directory.
transform_data(X_train,X_test)

Pipeline(memory=None,
         steps=[('t1',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('si_age',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  [2]),
                                                 ('si_embarked',
                                                  SimpleImputer(add_indicator=False,
                

# Predict using pipeline

In [2]:
import pickle
import numpy as np

def load_pickle_and_check_result(test_input, model_path='pipe.pkl'):
    """Load a pickled model and print its prediction for ``test_input``.

    Args:
        test_input: Input passed unchanged to the loaded object's ``predict``.
        model_path: Path of the pickle file to load. Defaults to 'pipe.pkl'.

    Side effects:
        Prints a legend line followed by the prediction.

    Returns:
        None
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point model_path at files you created or trust.
    # The context manager closes the file handle, which was previously leaked.
    with open(model_path,'rb') as model_file:
        pipe = pickle.load(model_file)
    print("1 : means survive, 0 : means not survive")
    print(pipe.predict(test_input))

# Example passenger supplied by the user, one value per column in this order:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
# Built directly as a single nested row, giving an object array of shape (1, 7).
test_input = np.array([[1, 'male', 31.0, 1, 0, 20.5, 'S']], dtype=object)
load_pickle_and_check_result(test_input)


1 : means survive, 0 : means not survive
[0]
