# Piplines

In [5]:
# import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# load the titanic data
titanic_data = sns.load_dataset('titanic')

# select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# split the data into train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer for imputing missing values
numeric_features = ['age', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']

numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

#  create a pipline with a preprocessor and RandomForestClassifier
pipline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# fit the pipline on training data
pipline.fit(X_train, y_train)

# make prediction on the test data
y_pred = pipline.predict(X_test)

# calculate accurecy 
print('accuracy_score:', accuracy_score(y_test, y_pred))

accuracy_score: 0.7821229050279329


# Hyperparamter tuning in pipline

In [6]:
# import librareis
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# load the dataset
titanic_data = sns.load_dataset('titanic')

# select features and target variables
X = titanic_data.drop('survived', axis=1)
y = titanic_data['survived']

# train test split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create the pipline
pipline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('model', RandomForestClassifier(random_state=42))
])

# define the hyperparameters to tune
hyperparamerters = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2,5,10]
}

# perform grid search cross-validation
grid_search = GridSearchCV(pipline, hyperparamerters, cv=5)
grid_search.fit(X_train, y_train)

# get the best model
best_model = grid_search.best_estimator_

# make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# calculate accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

# print the best hyperparameters
print('Best hyperparameters:', grid_search.best_params_)

Accuracy: 1.0
Best hyperparameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 100}
