# Selecting best model in pipline

To select the best model when using multiple models in a pipline, you can use techniques like cross-validation and evalution metrics to compare their performance, Here's an example of how to accomplish this on the Titanic dataset.

In [17]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# load the titanic dataset
titanic_data = sns.load_dataset('titanic')

# select features and target variable
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# train test split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random forest', RandomForestClassifier(random_state=42)),
    ('gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Support vactor Machine', SVC(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # create a pipeline for each model 
    pipline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
    # perform cross-validations
    scores = cross_val_score(pipline, X_train, y_train, cv=5)

    # calculate mean accuracy
    mean_accuracy = scores.mean()

    # fit the pipline on the training data
    pipline.fit(X_train, y_train)

    # Make prediction on the test data
    y_pred = pipline.predict(X_test)

    # calculate the accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # print the performane metrics
    print('model:', name)
    print('cross-validation Accuracy:', mean_accuracy)
    print('Test accuracy:', accuracy)
    print()

    # check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipline

# retrieve the best model
print('best model:', best_model)

model: Random forest
cross-validation Accuracy: 0.7991529597163399
Test accuracy: 0.8379888268156425

model: gradient Boosting
cross-validation Accuracy: 0.8061952132374668
Test accuracy: 0.7988826815642458

model: Support vactor Machine
cross-validation Accuracy: 0.8160248202501723
Test accuracy: 0.8044692737430168

model: Logistic Regression
cross-validation Accuracy: 0.7977839062346105
Test accuracy: 0.8100558659217877

best model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
