# Pipeline in Machine Learning
A pipeline in machine learning is a sequence of data processing steps that are executed in a specific order. It is used to automate the process of data preprocessing, feature engineering model selection, and hyperparameter tuning.

In [7]:
%%time
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

# load data from sns
df = sns.load_dataset('titanic')

# X and y selection
X = df[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = df['survived']

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,)

# define column transformer for imputing missing values
num_features = ['age', 'fare']
cat_features = ['pclass', 'sex', 'embarked']

num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                  ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

# create pipeline with the preprocessor and randomforestclassifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier())])

# fit the model
pipeline.fit(X_train, y_train)

# predict
y_pred = pipeline.predict(X_test)

# evaluate
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8379888268156425
CPU times: total: 625 ms
Wall time: 1.77 s


## Hyper-parameter Tuning in Pipeline

In [8]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Titanic dataset from Seaborn
titanic_data = sns.load_dataset('titanic')

# Select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('model', RandomForestClassifier(random_state=42))
])

# Define the hyperparameters to tune
hyperparameters = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [None, 5, 10, 30],
    'model__min_samples_split': [2, 5, 10, 15]
}

# Perform grid search cross-validation
grid_search = GridSearchCV(pipeline, hyperparameters, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

Accuracy: 0.8212290502793296
Best Hyperparameters: {'model__max_depth': 30, 'model__min_samples_split': 5, 'model__n_estimators': 100}
