In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate

import optuna

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print them one per line (directories without files contribute nothing).
input_files = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
]
for path in input_files:
    print(path)

In [None]:
pip install feature-engine

In [None]:
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import PRatioEncoder
from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import MeanMedianImputer

In [None]:
# Location of the competition files and the columns that are not used as features.
DATA_DIR = '../input/tabular-playground-series-apr-2021'
NON_FEATURE_COLS = ['PassengerId', 'Name', 'Ticket']

# Training data: the target is split off, identifier/free-text columns are dropped.
df = pd.read_csv(f'{DATA_DIR}/train.csv')
X_train = df.drop(columns=['Survived'] + NON_FEATURE_COLS)
y_train = df['Survived']

# Test data: read the file once, keep the ids for the submission and drop the
# same non-feature columns as for the training set.
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
X_test_id = test_df['PassengerId']
X_test = test_df.drop(columns=NON_FEATURE_COLS)

sub_sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Reduce each Cabin value to its first character in both feature frames;
# missing values stay missing.
for frame in (X_train, X_test):
    frame['Cabin'] = frame['Cabin'].str.get(0)

In [None]:
# Preview the first rows of the test features after the Cabin shortening.
X_test.head()

### Missing Data Imputation
* Cabin: Most of the values are missing.
    * Fill missing values with the label 'Missing' so that the missingness itself is available to the model as a category
* Age, Fare, Embarked: Only a small percentage of the values is missing
    * Fill with the median for Age and Fare and with the mode for Embarked, treating these values as missing at random

In [None]:
# Fraction of missing values per training column.
X_train.isnull().mean()

In [None]:
# Imputation pipeline; the steps are applied in the order listed.
pre_pipe = Pipeline(steps=[
    # Age and Fare: fill gaps with the column median.
    ('median_imputer',
     MeanMedianImputer(variables=['Age', 'Fare'], imputation_method='median')),
    # Cabin: imputer left at its default settings (no imputation method given).
    ('missing_imputer',
     CategoricalImputer(variables=['Cabin'])),
    # Embarked: fill gaps with the most frequent category.
    ('mode_imputer',
     CategoricalImputer(variables=['Embarked'], imputation_method='frequent')),
])

In [None]:
# Learn the imputation values from the training features only.
pre_pipe.fit(X_train, y_train)

In [None]:
# Apply the fitted imputers to both sets; the original frames are overwritten
# with their imputed versions.
X_train = pre_pipe.transform(X_train)
X_test = pre_pipe.transform(X_test)

### Categorical Variable Encoding
* **One Hot Encoding**: Suitable for linear models
* **Integer/Ordinal/Label Encoding**: Not suitable for linear models (works well enough for tree-based models)
* **Ordered Ordinal Encoding**: Can be useful for linear models (creates a monotonic relationship between categories and target, but may lead to overfitting)
* **Frequency Encoding**: Not suitable for linear models (works well enough for tree-based models)
* **Mean Encoding**: Can be useful for linear models (creates a monotonic relationship between categories and target, but may lead to overfitting)
* **Probability Ratio Encoding**: Can be useful for linear models (creates a monotonic relationship between categories and target, but may lead to overfitting)
* **Weight of Evidence**: Can be useful for linear models (creates a monotonic relationship between categories and target. It orders categories on a logistic scale, which is natural for logistic regression, but may lead to overfitting)

In [None]:
# Every encoder below is fitted on the same three categorical columns.
cat_cols = ['Sex', 'Cabin', 'Embarked']

# One-hot (last dummy dropped) and arbitrary integer codes: fitted on features only.
oh_enc = OneHotEncoder(
    variables=cat_cols, top_categories=None, drop_last=True,
).fit(X_train)
ig_enc = OrdinalEncoder(
    variables=cat_cols, encoding_method='arbitrary',
).fit(X_train)

# Ordered integer codes: fitted with the target.
od_enc = OrdinalEncoder(
    variables=cat_cols, encoding_method='ordered',
).fit(X_train, y_train)

# Category frequencies: fitted on features only.
fq_enc = CountFrequencyEncoder(
    variables=cat_cols, encoding_method='frequency',
).fit(X_train)

# Target-based encoders: mean, probability ratio and weight of evidence.
me_enc = MeanEncoder(variables=cat_cols).fit(X_train, y_train)
pr_enc = PRatioEncoder(
    variables=cat_cols, encoding_method='ratio',
).fit(X_train, y_train)
we_enc = WoEEncoder(variables=cat_cols).fit(X_train, y_train)

In [None]:
def run_logistic_cv(X_train, y_train):
    """Return the mean cross-validated test score of a logistic regression.

    A new ``LogisticRegression(max_iter=1000)`` is evaluated with shuffled,
    stratified 4-fold cross-validation (seed 1234) using the default scoring
    of ``cross_validate``; the per-fold test scores are averaged.

    Parameters
    ----------
    X_train : feature table passed to ``cross_validate`` as ``X``
        (the callers in this notebook pass encoder-transformed frames).
    y_train : target values passed to ``cross_validate`` as ``y``.

    Returns
    -------
    The mean of the ``'test_score'`` entries over the four folds.
    """
    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=1234)
    estimator = LogisticRegression(max_iter=1000)
    cv_output = cross_validate(estimator, X=X_train, y=y_train, cv=splitter)
    fold_scores = cv_output['test_score']
    return fold_scores.mean()

### One-hot encoding gives the best accuracy

In [None]:
# Report label and fitted encoder for each scheme, in the order they are printed.
# NOTE(review): the encoders were fitted on the whole training set (several of
# them with the target) before cross-validation, so the scores of the
# target-based encoders may be optimistic — confirm before drawing conclusions.
encoder_report = [
    ('One Hot Encoding', oh_enc),
    ('Integer Encoding', ig_enc),
    ('Ordered Integer Encoding', od_enc),
    ('Frequency Encoding', fq_enc),
    ('Mean Encoding', me_enc),
    ('Probability Ratio Encoding', pr_enc),
    ('Weight of Evidence', we_enc),
]

print('Accuracy Comparison')
for label, encoder in encoder_report:
    cv_score = run_logistic_cv(encoder.transform(X_train), y_train)
    print(label + ': ' + str(cv_score))

### Parameter Tuning

In [None]:
# The one-hot encoded training features are the same for every trial, so they
# are computed once here instead of inside each objective call.
X_train_oh = oh_enc.transform(X_train)


def objective(trial):
    """Optuna objective: mean cross-validated score of a logistic regression.

    Samples the regularization parameter ``C`` for the trial, then scores
    ``LogisticRegression`` on the one-hot encoded training data with shuffled,
    stratified 4-fold cross-validation (seed 0).

    Parameters
    ----------
    trial : optuna trial used to sample ``C``.

    Returns
    -------
    The mean of the per-fold ``'test_score'`` values (to be maximized).
    """
    param_grid_lr = {
        # C is continuous and acts on a multiplicative scale, so it is sampled
        # as a float on a log scale. The previous integer range 1..100 could
        # never try values below 1; the upper bound is unchanged.
        'C': trial.suggest_float("C", 1e-3, 1e2, log=True),
        "random_state": 0,
    }

    model = LogisticRegression(**param_grid_lr, max_iter=1000)

    kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    scores = cross_validate(model, X=X_train_oh, y=y_train, cv=kf)
    return scores['test_score'].mean()

In [None]:
# Run the hyper-parameter search. The sampler is seeded so that a fresh
# "Restart & Run All" proposes the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=10)

# Show the best trial and keep its parameters for the final model.
print(study.best_params)
print(study.best_value)
lr_best_param = study.best_params

In [None]:
# Plot the objective value of each trial of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Refit on the complete training set with the tuned parameters, then predict
# the labels of the test set; both sets go through the fitted one-hot encoder.
train_encoded = oh_enc.transform(X_train)
test_encoded = oh_enc.transform(X_test)

model = LogisticRegression(max_iter=1000, **lr_best_param)
model.fit(train_encoded, y_train)
y_test_pred = model.predict(test_encoded)

In [None]:
# Submission table: the test ids as the first column, the predicted labels
# next to them in a 'Survived' column; written without the row index.
sub = X_test_id.to_frame().assign(Survived=y_test_pred)
sub.to_csv('optuna_lm.csv', index=False)

In [None]:
# Display the submission table as a final check.
sub