In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

In [None]:
# Raw Kaggle training split; kept untouched so later cells can work on copies.
titanic_train = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')
# Peek at one randomly chosen row as a quick sanity check of the load.
titanic_train.sample(n=1)

In [None]:
# Raw Kaggle test split; kept untouched so later cells can work on copies.
titanic_test = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')
# Peek at one randomly chosen row as a quick sanity check of the load.
titanic_test.sample(n=1)

In [None]:
# Summarise the raw training frame (column dtypes and non-null counts).
titanic_train.info()

In [None]:
# Summarise the raw test frame (column dtypes and non-null counts).
titanic_test.info()

# Train Data

#### Missing Value

In [None]:
# Work on a copy so the raw training frame stays available unchanged.
train = titanic_train.copy()
# Percentage of missing values per column.
train.isna().sum().div(len(train)).mul(100)

#### Feature's Value Checking

In [None]:
# Names are presumably formatted "Surname, Title. Given names" (TODO confirm
# against the data). The plain whitespace split is kept for the two helper
# columns, which a later cell drops again.
new_train = train['Name'].str.split(" ", n = 2, expand = True)

# Extract the title from between the comma and the first period instead of
# taking the second whitespace token: with a multi-word surname
# ("Vander Planke, Mrs. ...") the second token is a surname fragment, not the
# title. The trailing period is kept so ordinary values ("Mr.", "Mrs.") are
# identical to what the plain split produced.
train_title = train['Name'].str.extract(r',\s*([^.]+\.)', expand = False).str.strip()

train["First Name"] = new_train[0]
# Fall back to the second token when a name does not match the pattern.
train["Name Title"] = train_title.fillna(new_train[1])
train["Last Name"] = new_train[2]
train.head()

#### Drop Columns

In [None]:
# Columns not used as model features: identifiers, free text, and the helper
# name parts created by the name split.
train_unused_columns = ['Cabin', 'PassengerId', 'Name', 'Ticket', 'First Name', 'Last Name']
train = train.drop(columns = train_unused_columns)
train.head()

# Test Data

#### Missing Value

In [None]:
# Work on a copy so the raw test frame stays available unchanged.
test = titanic_test.copy()
# Percentage of missing values per column.
test.isna().sum().div(len(test)).mul(100)

#### Feature's Value Checking

In [None]:
# Names are presumably formatted "Surname, Title. Given names" (TODO confirm
# against the data). The plain whitespace split is kept for the two helper
# columns, which a later cell drops again.
new_test = test['Name'].str.split(" ", n = 2, expand = True)

# Extract the title from between the comma and the first period instead of
# taking the second whitespace token: with a multi-word surname
# ("Vander Planke, Mrs. ...") the second token is a surname fragment, not the
# title. The trailing period is kept so ordinary values ("Mr.", "Mrs.") are
# identical to what the plain split produced.
test_title = test['Name'].str.extract(r',\s*([^.]+\.)', expand = False).str.strip()

test["First Name"] = new_test[0]
# Fall back to the second token when a name does not match the pattern.
test["Name Title"] = test_title.fillna(new_test[1])
test["Last Name"] = new_test[2]
test.head()

#### Drop Columns

In [None]:
# Drop non-feature columns; PassengerId is kept because the submission needs it.
test_unused_columns = ['Cabin', 'Name', 'Ticket', 'First Name', 'Last Name']
test = test.drop(columns = test_unused_columns)

#### Fill Missing Value

In [None]:
# Fill the gaps in the test frame using statistics learned from the TRAINING
# frame. Fitting the imputers on the test set itself would make the test fill
# values depend on test data and disagree with what the model pipeline learns
# from the training data.
impute_mode = SimpleImputer(strategy = 'most_frequent')
impute_mode.fit(train[['Embarked']])
test[['Embarked']] = impute_mode.transform(test[['Embarked']])

# NOTE(review): each IterativeImputer sees a single column, so it has no other
# features to regress on and presumably falls back to its initial (mean)
# imputation - confirm against the scikit-learn documentation.
impute_iter = IterativeImputer(max_iter = 10, random_state = 0)
impute_iter.fit(train[['Age']])
test[['Age']] = impute_iter.transform(test[['Age']])

impute_iter = IterativeImputer(max_iter = 10, random_state = 0)
impute_iter.fit(train[['Fare']])
test[['Fare']] = impute_iter.transform(test[['Fare']])

# Remaining percentage of missing values per column.
test.isna().sum()/len(test.index)*100

# PreProcessing

#### Preprocessing Scheme

- OneHotEncoding: Sex, Name Title, Embarked
    * Embarked is first imputed with Simple Imputer (most frequent), then one-hot encoded
- Iterative Impute: Age
- Robust Scaling: Fare
- PassThrough: Pclass, SibSp, Parch
- Target: Survived

In [None]:
# Embarked: fill missing entries with the most frequent value, then one-hot encode.
mode_onehot_pipe = Pipeline(steps = [
    ('encoder', SimpleImputer(strategy = 'most_frequent')),
    ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore')),
])

# Column-wise preprocessing; every column not listed here is passed through as-is.
transformer = ColumnTransformer(
    transformers = [
        ('one hot', OneHotEncoder(handle_unknown = 'ignore'), ['Sex', 'Name Title']),
        ('mode_onehot_pipe', mode_onehot_pipe, ['Embarked']),
        ('robust', RobustScaler(), ['Fare']),
        ('iterative', IterativeImputer(max_iter = 10, random_state = 0), ['Age']),
    ],
    remainder = 'passthrough',
)

#### Define Target Data

In [None]:
# Share of each target class, as a percentage of all training rows.
train['Survived'].value_counts().div(len(train)).mul(100)

* The target classes are clearly imbalanced, but I do not intend to apply any additional handling for the imbalance here.

#### Splitting Data

In [None]:
# Target vector and feature matrix for training.
y_train = train['Survived']
X_train = train.drop(columns = ['Survived'])

# Test features: everything except the identifier, which is only needed for the submission.
Xtest = test.drop(columns = ['PassengerId']).copy()

# Quick shape check of both feature matrices.
(X_train.shape, Xtest.shape)

# Modeling

In [None]:
# Baseline classifier with a fixed seed, chained after the preprocessing transformer.
rf = RandomForestClassifier(random_state = 8888)

rf_pipe = Pipeline(steps = [
    ('transformer', transformer),
    ('rf', rf),
])

def model_evaluation(model, metric):
    """Cross-validate ``model`` on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    model : estimator or pipeline passed to ``cross_val_score``.
    metric : value forwarded as the ``scoring`` argument.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score`` for a 5-split
    ``StratifiedKFold``.
    """
    folds = StratifiedKFold(n_splits = 5)
    return cross_val_score(model, X_train, y_train, cv = folds, scoring = metric)

# Per-fold accuracy of the baseline pipeline.
rf_pipe_cv = model_evaluation(rf_pipe, 'accuracy')

# One-row summary table: mean and spread of the fold scores.
method_name = ['Random Forest Classifier']
score_mean = [rf_pipe_cv.mean()]
score_std = [rf_pipe_cv.std()]
cv_summary = pd.DataFrame(
    data = list(zip(method_name, score_mean, score_std)),
    columns = ['method', 'mean score', 'std score'],
)
cv_summary

# HyperParameter Tuning

In [None]:
# Rebuild the pipeline with the classifier step named 'model' so that the
# 'model__...' keys of the search space below address it.
rf_pipe = Pipeline(steps = [
    ('transformer', transformer),
    ('model', rf),
])

# Candidate values tried for each random-forest hyperparameter.
hyperparam_space = {
    'model__n_estimators': [100, 200],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [11, 13, 15],
    'model__min_samples_leaf': [3, 5, 7, 9],
    'model__random_state': [8888],
}

# Exhaustive search over the grid, scored by accuracy on 5 stratified folds.
search_folds = StratifiedKFold(n_splits = 5)
grid_rf = GridSearchCV(
    estimator = rf_pipe,
    param_grid = hyperparam_space,
    scoring = 'accuracy',
    cv = search_folds,
    n_jobs = -1,
)
grid_rf.fit(X_train, y_train)

print('best score', grid_rf.best_score_)
print('best param', grid_rf.best_params_)

# Submission

In [None]:
# Pair each test passenger id with the prediction of the best pipeline found
# by the grid search.
submission = test[["PassengerId"]].assign(
    Survived = grid_rf.best_estimator_.predict(Xtest)
)