Download the Titanic dataset from 'https://www.kaggle.com/datasets/yasserh/titanic-dataset'. 
Create a folder named 'datasets' in your Project Root.
Place the Titanic-Dataset.csv in the datasets folder.

In [1]:
import os
import pandas as pd

full_path = os.path.join('datasets', 'Titanic-Dataset.csv')

titanic_data = pd.read_csv(full_path)  # Was getting an error here because I was trying to use os.open()

titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [69]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  f1_score

# Creation of custom transformer for removing unimportant columns
class ColumnRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_remove):
        self.columns_to_remove = columns_to_remove
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        return X.drop(self.columns_to_remove, axis = 1)

# creation of numeric and cat columns 
num_attr = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_attr = ['Sex', 'Embarked']

# num_attr cleaning pipeline
num_pipeline = Pipeline([
    ('numimputer', SimpleImputer(strategy = "mean")),
    ('scaler', StandardScaler())
])

# cat_attr cleaning pipeline
cat_pipeline = Pipeline([
    ('catimputer', SimpleImputer(strategy = "most_frequent")),
    ('encoder', OneHotEncoder(sparse_output = False, handle_unknown = "ignore"))
])

# Preparation pipeline of num and cat pipelines
prep_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attr),
    ('cat', cat_pipeline, cat_attr)
])

# Full pipeline
full_pipeline = Pipeline([
    ('colremove', ColumnRemover(columns_to_remove = ['PassengerId', 'Name', 'Ticket', 'Cabin'])),
    ('prep', prep_pipeline)
])

# Splitting Features and Labels
X = titanic_data.drop('Survived', axis = 1)
y = titanic_data['Survived']

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_transformed = full_pipeline.fit_transform(X_train)

param_grid = {
    'C': [0.001, 0.01, 0.1],
    'max_iter': [100, 150, 175, 200],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

model = LogisticRegression()
gridcv = GridSearchCV(model, param_grid = param_grid, cv = 5, scoring = "f1")
gridcv.fit(X_transformed, y_train)

X_test_transformed = full_pipeline.transform(X_test)

y_pred = gridcv.predict(X_test_transformed)

f1_score(y_test, y_pred)

0.7552447552447552

In [70]:
print(gridcv.best_params_)

{'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}


KNeighborsClassifier->(f1_score: 0.736, best_params_:('n_neighbors': 5, 'weights': 'distance'))
LogisticRegression->(f1_score: 0.755, best_params_:('C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'))
SVC->(f1_score: 0.760, best_params_:('C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'))
DecisionTreeClassifier->(f1_score: 0.744, best_params_:('criterion': 'entropy', 'splitter': 'best'))