In [234]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [235]:
# Fetch the Titanic dataset through seaborn and discard any rows whose
# target label is missing.
titanic = sns.load_dataset('titanic').dropna(subset=['survived'])

# Feature matrix: the seven predictor columns fed to the model.
X = titanic.loc[:, ['age', 'sex', 'embarked', 'pclass', 'fare', 'sibsp', 'parch']]
# Target vector: the `survived` label.
y = titanic.loc[:, 'survived']



In [236]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [237]:
# Inspect the training features produced by the split above.
X_train

Unnamed: 0,age,sex,embarked,pclass,fare,sibsp,parch
331,45.5,male,S,1,28.5000,0,0
733,23.0,male,S,2,13.0000,0,0
382,32.0,male,S,3,7.9250,0,0
704,26.0,male,S,3,7.8542,1,0
813,6.0,female,S,3,31.2750,4,2
...,...,...,...,...,...,...,...
106,21.0,female,S,3,7.6500,0,0
270,,male,S,1,31.0000,0,0
860,41.0,male,S,3,14.1083,2,0
435,14.0,female,S,1,120.0000,1,2


In [238]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardise with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


In [239]:
# Render the numeric sub-pipeline to confirm its steps.
numerical_transformer

In [240]:

# Categorical preprocessing: replace missing entries with the most common
# category, then one-hot encode. With handle_unknown='ignore', categories
# not seen during fit are encoded as all zeros instead of raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])



In [241]:
# Route each column group through its own sub-pipeline:
#   'num' -> age, fare       (impute + scale)
#   'cat' -> sex, embarked   (impute + one-hot)
# The columns not listed here (pclass, sibsp, parch) are passed through
# unchanged via remainder='passthrough'.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, ['age', 'fare']),
        ('cat', categorical_transformer, ['sex', 'embarked']),
    ],
    remainder='passthrough',
)



In [242]:
# End-to-end model: column preprocessing followed by logistic regression.
# The step names ('preprocessor', 'classifier') are the prefixes used when
# addressing nested parameters during the grid search.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=200)),
])



In [243]:
# Print the plain-text representation of the assembled pipeline.
print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                       

In [244]:
# Fit the full preprocessing + classifier pipeline on the training split
# (fit returns the fitted pipeline), then predict on the held-out test split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Baseline accuracy before any hyperparameter tuning.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.8100558659217877


## Hyperparameter tuning for logistic regression and finding the best imputer parameters using GridSearchCV

In [245]:
# Search space for the grid search. Keys use the `<step>__<param>` naming
# convention to reach parameters nested inside the pipeline.
# 5 C values x 1 penalty x 2 solvers x 2 numeric-imputer strategies
# x 2 categorical-imputer strategies = 40 candidates.
param_grid = {
    # Logistic regression hyperparameters
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength (smaller = stronger penalty)
    'classifier__penalty': ['l2'],  # L2 regularization (L1 requires 'liblinear' or 'saga' solvers)
    'classifier__solver': ['lbfgs', 'liblinear'],  # Solvers

    # SimpleImputer strategies for the numerical and categorical branches
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
}


In [246]:
# Exhaustive search over param_grid, scoring each candidate by accuracy
# with 5-fold cross-validation. Other scorers such as 'f1' or 'roc_auc'
# could be substituted. n_jobs=-1 uses all available processors and
# verbose=1 prints progress.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [247]:
# Run the search on the training split only; the test split stays unseen.
grid_search.fit(X=X_train, y=y_train)

# Report the winning parameter combination.
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best Hyperparameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'median'}


## Combinations of parameters and their corresponding accuracy

In [248]:
# Raw cross-validation results (a dict of arrays, one entry per candidate).
grid_search.cv_results_

{'mean_fit_time': array([0.03296685, 0.02300196, 0.01882358, 0.01989126, 0.01653776,
        0.01680536, 0.01396437, 0.01660762, 0.02347884, 0.02180915,
        0.01828108, 0.02179055, 0.01589737, 0.01675129, 0.01526184,
        0.0165586 , 0.01890044, 0.02141895, 0.02454033, 0.0216856 ,
        0.01684651, 0.01596169, 0.01482124, 0.01477656, 0.02036753,
        0.02255421, 0.02420702, 0.02466097, 0.01545262, 0.01477656,
        0.01427736, 0.01485014, 0.02182417, 0.02282906, 0.02633643,
        0.02623401, 0.01540051, 0.01615257, 0.01564708, 0.01475554]),
 'std_fit_time': array([0.00590963, 0.00564158, 0.00143624, 0.00356387, 0.00530405,
        0.00117785, 0.00186557, 0.00283253, 0.00106216, 0.00167476,
        0.00062037, 0.0017458 , 0.00233144, 0.00272185, 0.0012128 ,
        0.00154056, 0.00294844, 0.00185503, 0.0039474 , 0.00150053,
        0.0015623 , 0.00124283, 0.00167884, 0.00082854, 0.00074826,
        0.00269498, 0.00091023, 0.00191648, 0.00043536, 0.00082426,
        0.000

In [249]:
# Tabulate every grid-search candidate, best mean CV accuracy first.
# pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)

# Show only the tuned parameters and the mean test score.
cv_results[[
    'param_classifier__C',
    'param_classifier__penalty',
    'param_classifier__solver',
    'param_preprocessor__cat__imputer__strategy',
    'param_preprocessor__num__imputer__strategy',
    'mean_test_score',
]]

Unnamed: 0,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__imputer__strategy,mean_test_score
11,0.1,l2,lbfgs,constant,median,0.796307
9,0.1,l2,lbfgs,most_frequent,median,0.796307
13,0.1,l2,liblinear,most_frequent,median,0.794908
10,0.1,l2,lbfgs,constant,mean,0.794898
14,0.1,l2,liblinear,constant,mean,0.794898
8,0.1,l2,lbfgs,most_frequent,mean,0.794898
12,0.1,l2,liblinear,most_frequent,mean,0.794898
15,0.1,l2,liblinear,constant,median,0.793499
18,1.0,l2,lbfgs,constant,mean,0.792071
23,1.0,l2,liblinear,constant,median,0.792071


In [250]:
# Best cross-validated score: the mean accuracy over the 5 CV folds for the
# best parameter combination found by the grid search.
print("Best Cross-Validated Score:", grid_search.best_score_)

Best Cross-Validated Score: 0.7963065103910175


In [251]:
# Take the best estimator found by the grid search and predict on the
# held-out test split.
# Note: this overwrites the `y_pred` produced by the untuned baseline pipeline.
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X=X_test)


In [252]:
# Evaluate performance: per-class precision, recall and F1 for the current
# `y_pred` against the held-out test labels.
# `classification_report` is imported in the notebook's top import cell.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

