In [154]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [155]:
# Load the Titanic dataset, discarding any rows whose target label is missing
titanic = sns.load_dataset('titanic').dropna(subset=['survived'])

# Model inputs (mixed numeric and categorical columns) and the 0/1 target
feature_columns = ['age', 'sex', 'embarked', 'pclass', 'fare']
X = titanic[feature_columns]
y = titanic['survived']



In [156]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [157]:
# Inspect the training features; 'age' contains missing values that the
# preprocessing steps below must impute
X_train

Unnamed: 0,age,sex,embarked,pclass,fare
331,45.5,male,S,1,28.5000
733,23.0,male,S,2,13.0000
382,32.0,male,S,3,7.9250
704,26.0,male,S,3,7.8542
813,6.0,female,S,3,31.2750
...,...,...,...,...,...
106,21.0,female,S,3,7.6500
270,,male,S,1,31.0000
860,41.0,male,S,3,14.1083
435,14.0,female,S,1,120.0000


In [158]:
# Numeric columns: fill missing values with the column mean, then standardize.
# The step names ('imputer', 'scaler') are referenced by the grid-search keys.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)


In [159]:
# Display the numeric sub-pipeline to confirm its steps
numerical_transformer

In [160]:

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at predict time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)



In [161]:
# Route each group of columns through its matching transformer
numeric_columns = ['age', 'fare']
categorical_columns = ['sex', 'embarked']

preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ],
    remainder='passthrough',  # 'pclass' is not listed above, so it is passed through unchanged
)



In [162]:
# Chain preprocessing and the classifier into a single estimator.
# The step names are the prefixes used by the grid-search parameter keys.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=200)),
]
pipeline = Pipeline(steps=model_steps)



In [163]:
# Display the full pipeline (preprocessor + classifier) to confirm its structure
pipeline

In [164]:
# Fit the baseline pipeline on the training split
pipeline.fit(X_train, y_train)

# Score it on the held-out split
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.7988826815642458


## Hyperparameter tuning for logistic regression and finding the best imputer parameters using GridSearchCV

In [165]:
# Search space for the grid search. Each key is a '<step>__<param>' path into
# `pipeline`, so it must match the step names used when the pipeline was built.
param_grid = {
    # Logistic regression settings
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength (smaller = stronger)
    'classifier__penalty': ['l2'],  # L2 regularization (L1 requires 'liblinear' or 'saga' solvers)
    'classifier__solver': ['lbfgs', 'liblinear'],  # Solvers

    # SimpleImputer strategies for the numerical and categorical sub-pipelines
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
}


In [166]:
# Exhaustive search over param_grid, scoring every candidate with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',  # other metrics such as 'f1' or 'roc_auc' can be used instead
    cv=5,                # number of cross-validation folds
    verbose=1,           # report progress while fitting
    n_jobs=-1,           # use all available processors
)

In [167]:
# Run the search on the training split only, so the test split stays unseen
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best Hyperparameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}


## Combinations of multiple parameters and their corresponding accuracy

In [181]:
# Raw search results: a dict of per-candidate arrays (fit times, scores, ...).
# Hard to read in this form; wrapping it in a DataFrame gives a readable table.
grid_search.cv_results_

{'mean_fit_time': array([0.03063493, 0.02622943, 0.03380384, 0.02549739, 0.02117839,
        0.02259784, 0.02331614, 0.02138753, 0.03061523, 0.0261569 ,
        0.02667184, 0.0313849 , 0.03007913, 0.02147698, 0.01962113,
        0.02594576, 0.02696714, 0.0244761 , 0.02640367, 0.02467055,
        0.01497383, 0.02079077, 0.01235065, 0.01716733, 0.0258357 ,
        0.03027587, 0.0269444 , 0.02433019, 0.01182156, 0.01786466,
        0.01465712, 0.01515703, 0.02207961, 0.02238355, 0.02613783,
        0.02628207, 0.01590371, 0.01509891, 0.01577301, 0.01121507]),
 'std_fit_time': array([1.20661122e-02, 5.27814202e-03, 8.21376202e-03, 4.53167001e-03,
        3.39639708e-03, 5.16890515e-03, 2.74047322e-03, 4.58912361e-03,
        3.29613757e-03, 3.47924397e-03, 4.25848770e-03, 8.92205962e-03,
        4.72549520e-03, 3.61030343e-03, 2.53230304e-03, 6.45662801e-03,
        6.76900917e-03, 2.03767272e-03, 2.34193882e-03, 2.56613637e-03,
        9.05163279e-04, 4.81407905e-03, 6.70339947e-03, 2.342

In [175]:
# Tabulate the search results, best mean cross-validated accuracy first.
# `pd` comes from the notebook's top import cell.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("mean_test_score", ascending=False)
)

# Show only the tuned parameters and their score, for the ten best candidates
summary_columns = [
    'param_classifier__C',
    'param_classifier__penalty',
    'param_classifier__solver',
    'param_preprocessor__cat__imputer__strategy',
    'param_preprocessor__num__imputer__strategy',
    'mean_test_score',
]
cv_results[summary_columns].head(10)

Unnamed: 0,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__imputer__strategy,mean_test_score
8,0.1,l2,lbfgs,most_frequent,mean,0.789264
20,1.0,l2,liblinear,most_frequent,mean,0.786467
21,1.0,l2,liblinear,most_frequent,median,0.786467
19,1.0,l2,lbfgs,constant,median,0.786467
17,1.0,l2,lbfgs,most_frequent,median,0.786467
23,1.0,l2,liblinear,constant,median,0.786467
11,0.1,l2,lbfgs,constant,median,0.786467
22,1.0,l2,liblinear,constant,mean,0.786467
10,0.1,l2,lbfgs,constant,mean,0.786457
16,1.0,l2,lbfgs,most_frequent,mean,0.785059


In [173]:
# Mean cross-validated accuracy of the best parameter combination
best_cv_score = grid_search.best_score_
print("Best Cross-Validated Score:", best_cv_score)

Best Cross-Validated Score: 0.7892642568698907


In [171]:
# Predict on the held-out split with the best estimator found by the grid search.
# NOTE: this rebinds `y_pred`, replacing the baseline pipeline's predictions.
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)


In [172]:
# Per-class precision / recall / F1 for the tuned pipeline on the held-out split.
# `classification_report` is imported in the notebook's top import cell.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83       105
           1       0.76      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.79      0.79      0.79       179

