In [853]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.impute import KNNImputer

In [854]:
# sns.get_dataset_names()

In [855]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [856]:
df.drop(columns=['class','who','adult_male','deck','embark_town','alive','alone'],inplace=True)


In [857]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [858]:
df['survived'].value_counts()

survived
0    549
1    342
Name: count, dtype: int64

In [859]:
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [860]:
df.shape

(891, 8)

In [861]:
# Check the percentage of null values

In [862]:
df.isnull().sum() / df.shape[0] * 100

survived     0.000000
pclass       0.000000
sex          0.000000
age         19.865320
sibsp        0.000000
parch        0.000000
fare         0.000000
embarked     0.224467
dtype: float64

211.0191247463081

In [863]:
print("Numer of unique values in age column: ",df['age'].nunique())
# print("Numer of unique values in embarked column : ",df['embarked'].nunique())
print("Numer of unique values in fare column: ",df['fare'].nunique())

Numer of unique values in age column:  88
Numer of unique values in fare column:  248


In [864]:
# print("Numer of unique values in age column: ",df['age'].unique())
# print("Numer of unique values in embarked column : ",df['embarked'].unique())
# print("Numer of unique values in fare column: ",df['fare'].nunique())

In [865]:
x=df.iloc[:,1:]
y = df['survived']
x_train,x_test , y_train ,y_test = train_test_split(x,y,random_state=42)

# Pipeline for Imputing Missing values of age column by calculating the nearest neighbour sung the feature age and fare

In [866]:
# Age column : Since missing values are 19 % we use KNN imputer

In [867]:
# Define transformations for numerical columns
numerical_transformer = Pipeline(steps=[
    ('age_impute',KNNImputer(n_neighbors=2,weights='distance')),
    ('scaler', StandardScaler())
])


In [868]:
numerical_transformer

In [869]:

# Define transformations for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])



In [870]:
# Combine both transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['age','fare']),
        ('cat', categorical_transformer, ['sex', 'embarked'])
    ],
    remainder='passthrough'  # Keep other columns as they are (e.g., 'pclass')
)



In [871]:
# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=100,C=0.1))
])

# Best Hyperparameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs',
#                        'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__age_impute__n_neighbors': 2}


In [872]:
pipeline.fit(x_train , y_train)

## Make prediction using the pipeline

In [873]:
# Make predictions and evaluate the model
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.8071748878923767


In [874]:
print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('age_impute',
                                                                   KNNImputer(n_neighbors=2,
                                                                              weights='distance')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                      

# Hyper paramater tunnig of logistic regression and knn imputer (i.e the value of K)

In [875]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'classifier__penalty': ['l2'],  # L2 regularization (L1 requires 'liblinear' or 'saga' solvers)
    'classifier__solver': ['lbfgs', 'liblinear'],  # Solvers,
        'classifier__max_iter': [100,200,300],  


    # Define the parameter grid for Ssimple imputer for numerical and categorical data
    'preprocessor__num__age_impute__n_neighbors': [2,3,4,5,6] ,
        'preprocessor__num__age_impute__weights': ['distance','uniform'] ,

    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant']  

}


In [876]:
# Wrap the pipeline in GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',  # You can use other metrics like 'f1', 'roc_auc', etc.
    cv=5,                # Number of cross-validation folds
    verbose=1,           # To display progress
    n_jobs=-1            # Use all available processors
)

In [877]:
y_train

298    1
884    0
247    1
478    0
305    1
      ..
106    1
270    0
860    0
435    1
102    0
Name: survived, Length: 668, dtype: int64

In [878]:
# Fit GridSearchCV
grid_search.fit(x_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits
Best Hyperparameters: {'classifier__C': 0.1, 'classifier__max_iter': 100, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__age_impute__n_neighbors': 2, 'preprocessor__num__age_impute__weights': 'uniform'}


## Combination of multiple paramaters and their corresponding accuracy

In [879]:
import pandas as pd

cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
# cv_results
cv_results[['param_classifier__C','param_classifier__penalty','param_classifier__solver','param_preprocessor__cat__imputer__strategy','param_preprocessor__num__age_impute__n_neighbors','mean_test_score']]

Unnamed: 0,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__age_impute__n_neighbors,mean_test_score
131,0.1,l2,lbfgs,constant,2,0.806845
121,0.1,l2,lbfgs,most_frequent,2,0.806845
171,0.1,l2,lbfgs,constant,2,0.806845
211,0.1,l2,lbfgs,constant,2,0.806845
161,0.1,l2,lbfgs,most_frequent,2,0.806845
...,...,...,...,...,...,...
62,0.01,l2,liblinear,most_frequent,3,0.743979
22,0.01,l2,liblinear,most_frequent,3,0.743979
34,0.01,l2,liblinear,constant,4,0.742476
74,0.01,l2,liblinear,constant,4,0.742476
