In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/test-file/tested.csv


# **This is fairly standard scikit-learn pipeline code, so refer back to it whenever you want the imputation strategy to be selected automatically (e.g. via grid search).**

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the passenger table from the Kaggle input directory.
DATA_PATH = '/kaggle/input/test-file/tested.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Remove the columns that are not used further in this notebook.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df = df.drop(columns=unused_columns)

In [6]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [7]:
# Count the missing values in each column.
missing_counts = df.isnull().sum()
missing_counts

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [10]:
# Discard any row whose Fare is missing.
df = df.dropna(subset=['Fare'])

In [11]:
# Separate the label column from the predictor columns.
TARGET = 'Survived'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Preview the first five rows of the training predictors.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
287,1,male,24.0,1,0,82.2667,S
387,2,male,57.0,0,0,13.0,S
210,3,male,32.0,0,0,22.525,S
93,3,male,,0,0,8.05,S
285,3,male,36.0,0,0,7.25,S


In [15]:
# Numeric columns: fill gaps with the median, then standardise.
# The step names ('imputer', 'scaler', 'ohe') are part of the parameter paths
# used by the grid search, so they must stay as they are.
numerical_features = ['Age', 'Fare']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode, ignoring categories that were not seen during fitting.
categorical_features = ['Embarked', 'Sex']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [17]:
# Route each feature group through its own transformer pipeline.
# NOTE(review): Pclass, SibSp and Parch are present in X_train but appear in
# neither feature list, so with remainder='drop' (the ColumnTransformer default,
# spelled out here) they never reach the classifier — confirm this is intended.
column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

In [18]:
# Full model: preprocessing followed by a logistic-regression classifier.
# The step names 'preprocessor' and 'classifier' are referenced by the
# grid-search parameter keys, so they must not be renamed.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
clf = Pipeline(steps=model_steps)

In [19]:
from sklearn import set_config

# Switch scikit-learn's estimator display to 'diagram' mode, then show the pipeline
# as the cell output.
set_config(display="diagram")
clf

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Candidate values for each tuned parameter.
numeric_impute_options = ['mean', 'median']
categorical_impute_options = ['most_frequent', 'constant']
c_values = [0.1, 1.0, 10, 100]

# Keys follow the <step>__<sub-step>__<parameter> path into the `clf` pipeline.
param_grid = {
    'preprocessor__num__imputer__strategy': numeric_impute_options,
    'preprocessor__cat__imputer__strategy': categorical_impute_options,
    'classifier__C': c_values,
}

# Search over every combination in the grid with cv=10.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)

In [23]:
# Fit every grid combination on the training split, then report the winning one.
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best params:")
print(best_params)

Best params:
{'classifier__C': 0.1, 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}


In [24]:
# Report the best cross-validation score found by the grid search, to three decimals.
# ".3f" is the precision spec; a spec such as " 3f" would instead mean
# "space sign, width 3" and print the default six decimals.
# NOTE(review): a perfect 1.0 score is suspicious — check whether a single
# feature (e.g. Sex) fully determines Survived in this file before trusting it.
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score:  1.000000


In [25]:
import pandas as pd

# Tabulate the grid-search results, highest mean test score first, and show
# only the tuned parameters next to that score.
summary_columns = [
    'param_classifier__C',
    'param_preprocessor__cat__imputer__strategy',
    'param_preprocessor__num__imputer__strategy',
    'mean_test_score',
]
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)
cv_results[summary_columns]

Unnamed: 0,param_classifier__C,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__imputer__strategy,mean_test_score
0,0.1,most_frequent,mean,1.0
1,0.1,most_frequent,median,1.0
2,0.1,constant,mean,1.0
3,0.1,constant,median,1.0
4,1.0,most_frequent,mean,1.0
5,1.0,most_frequent,median,1.0
6,1.0,constant,mean,1.0
7,1.0,constant,median,1.0
8,10.0,most_frequent,mean,1.0
9,10.0,most_frequent,median,1.0
