# Solution for "Titanic"

In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import optuna

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train and test CSV files from the local data directory.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Show the first five rows of the training frame.
df.head(n=5)

**Check missing values**

In [None]:
# Per-column null counts for train and test, separated by an empty line.
print(df.isnull().sum(), '', df_test.isnull().sum(), sep='\n')

In [None]:
# Correlation heatmap of the numeric columns only: the frame still holds
# string columns at this point, which DataFrame.corr cannot convert.
fig, ax = plt.subplots(figsize=(10, 8))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='bwr', linewidths=0.2, ax=ax)
plt.show()

**Complete missing values in 'Embarked' with mode**

In [None]:
# Fill missing 'Embarked' entries with the most frequent value of the column.
embarked_2d = df[["Embarked"]].to_numpy()  # shape (n_rows, 1), as the imputer expects
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
df["Embarked"] = imp.fit_transform(embarked_2d)[:, 0]

df.isnull().sum()

**Complete missing values in 'Age'**

In [None]:
def _title_mask(frame, title):
    """Return a boolean mask of rows whose Name contains the literal text ' <title>. '."""
    # regex=False: the period must match a literal '.', not any character.
    return frame['Name'].str.contains(f' {title}. ', regex=False)


# Storing indexes by honorific title
train_mr_index = _title_mask(df, 'Mr')
train_miss_index = _title_mask(df, 'Miss')
train_mrs_index = _title_mask(df, 'Mrs')
train_master_index = _title_mask(df, 'Master')
test_mr_index = _title_mask(df_test, 'Mr')
test_miss_index = _title_mask(df_test, 'Miss')
test_mrs_index = _title_mask(df_test, 'Mrs')
test_master_index = _title_mask(df_test, 'Master')

# Rows for each honorific title (the masks above are reused, not recomputed)
train_mr = df[train_mr_index]
train_miss = df[train_miss_index]
train_mrs = df[train_mrs_index]
train_master = df[train_master_index]
test_mr = df_test[test_mr_index]
test_miss = df_test[test_miss_index]
test_mrs = df_test[test_mrs_index]
test_master = df_test[test_master_index]

# Mean age per honorific title; Series.mean / Series.median skip NaN by default.
train_mr_num = train_mr['Age'].mean()
train_miss_num = train_miss['Age'].mean()
train_mrs_num = train_mrs['Age'].mean()
train_master_num = train_master['Age'].mean()
train_all_num = df['Age'].median()

test_mr_num = test_mr['Age'].mean()
test_miss_num = test_miss['Age'].mean()
test_mrs_num = test_mrs['Age'].mean()
test_master_num = test_master['Age'].mean()
test_all_num = df_test['Age'].median()

print("Mean value of honorific title 'Mr' in train data = " + str(train_mr_num))
print("Mean value of honorific title 'Miss' in train data = " + str(train_miss_num))
print("Mean value of honorific title 'Mrs' in train data = " + str(train_mrs_num))
print("Mean value of honorific title 'Master' in train data = " + str(train_master_num))
print("Median of train data = " + str(train_all_num), '\n')

print("Mean value of honorific title 'Mr' in test data = " + str(test_mr_num))
print("Mean value of honorific title 'Miss' in test data = " + str(test_miss_num))
print("Mean value of honorific title 'Mrs' in test data = " + str(test_mrs_num))
print("Mean value of honorific title 'Master' in test data = " + str(test_master_num))
print("Median of test data = " + str(test_all_num))

In [None]:
# Fill missing 'Age' values with a fixed age per honorific title, then fall back
# to a single value for rows that match none of the four titles.
# Each write goes through one .loc call so it updates the frame itself
# (chained `df['Age'][mask] = ...` writes to a temporary under Copy-on-Write).
train_age_fill = [
    (train_mr_index, 32),
    (train_miss_index, 22),  # the Miss rows get the Miss value
    (train_mrs_index, 36),
    (train_master_index, 5),
]
for title_rows, fill_age in train_age_fill:
    df.loc[title_rows & df['Age'].isna(), 'Age'] = fill_age
df['Age'] = df['Age'].fillna(28)

test_age_fill = [
    (test_mr_index, 32),
    (test_miss_index, 22),
    (test_mrs_index, 39),
    (test_master_index, 7),
]
for title_rows, fill_age in test_age_fill:
    df_test.loc[title_rows & df_test['Age'].isna(), 'Age'] = fill_age
df_test['Age'] = df_test['Age'].fillna(27)

df.isnull().sum()

**Complete missing values in 'Fare'**

In [None]:
# Replace missing test-set fares with the mean of the observed test-set fares.
mean_test_fare = df_test['Fare'].mean()
df_test = df_test.fillna({'Fare': mean_test_fare})

df_test.isnull().sum()

**Complete missing values in 'Cabin'**  

Analyze survival rates

In [None]:
# Pie chart of the 'Survived' value counts, drawn clockwise from the top.
survival_counts = df["Survived"].value_counts()
pie_options = dict(
    labels=["not Survived(0)", "Survived(1)"],
    startangle=90,
    counterclock=False,
    autopct='%1.1f%%',
)
plt.pie(survival_counts, **pie_options)

In [None]:
# Survival rate and passenger count grouped by the first character of 'Cabin'.
# .copy() makes `cabin` an independent frame, so adding a column to it is a
# well-defined write rather than an assignment on a slice of `df`.
cabin = df[["Cabin", "Survived"]].copy()
# Missing cabins become the string 'nan', so their initial is 'n'.
cabin["Cabin_init"] = cabin["Cabin"].astype(str).str[0]

cabin.groupby("Cabin_init")["Survived"].agg(["mean", "count"])

Passengers with a missing Cabin value (n) have a lower survival rate than every other group and than the overall average. So, in this case, I encode Cabin as 1 when a value is present and 0 when it is missing.

In [None]:
# Turn 'Cabin' into a presence flag: 1 where a value exists, 0 where it is missing.
df["Cabin"] = df["Cabin"].notna().astype("int64")
df_test["Cabin"] = df_test["Cabin"].notna().astype("int64")

In [None]:
# Per-column null counts for train and test, separated by an empty line.
print(df.isnull().sum(), '', df_test.isnull().sum(), sep='\n')

**Categorical Data Conversion**  
sex

In [None]:
# Encode 'Sex' as an integer column: male -> 0, female -> 1.
# The dtype is set explicitly rather than relying on replace() downcasting the
# object column. Values outside the mapping pass through unchanged, so
# re-running the cell on already-encoded data is harmless.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value)).astype('int64')
df_test['Sex'] = df_test['Sex'].map(lambda value: sex_codes.get(value, value)).astype('int64')

embarked

In [None]:
# One-hot encode 'Embarked' over train and test together so both frames get
# the same dummy columns, then split the result back at the train length.
n_train_rows = len(df)  # split position derived from the data, not hard-coded
embarked = pd.concat([df['Embarked'], df_test['Embarked']])

embarked_ohe = pd.get_dummies(embarked)

embarked_ohe_train = embarked_ohe.iloc[:n_train_rows]
embarked_ohe_test = embarked_ohe.iloc[n_train_rows:]

df = pd.concat([df, embarked_ohe_train], axis=1)
df_test = pd.concat([df_test, embarked_ohe_test], axis=1)

df = df.drop(columns='Embarked')
df_test = df_test.drop(columns='Embarked')

In [None]:
def _add_family_features(frame):
    """Add 'FamilySize' (Parch + SibSp + 1) and the binary 'FamilySize_bin' column.

    'FamilySize_bin' is 0 for a passenger travelling alone (FamilySize == 1)
    and 1 for FamilySize >= 2, stored as an integer column.
    """
    frame['FamilySize'] = frame['Parch'] + frame['SibSp'] + 1
    # discretization FamilySize: every size >= 2 falls in the 'family' bin,
    # so the feature is a plain alone/family flag.
    frame['FamilySize_bin'] = (frame['FamilySize'] >= 2).astype('int64')
    return frame


df = _add_family_features(df)
df_test = _add_family_features(df_test)

In [None]:
# Number of passengers sharing each ticket, counted separately within train and test.
train_ticket_counts = df.groupby('Ticket')['PassengerId'].transform('count')
df['TicketFreq'] = train_ticket_counts

test_ticket_counts = df_test.groupby('Ticket')['PassengerId'].transform('count')
df_test['TicketFreq'] = test_ticket_counts

In [None]:
def _extract_honorific(names):
    """Return the honorific of each name, with uncommon and variant titles merged.

    The honorific is the text between ', ' and the following '. ' in the name.
    'Col', 'Dr' and 'Rev' are merged into 'Rare'; 'Mlle' and 'Ms' become 'Miss'.
    """
    titles = names.map(lambda full_name: full_name.split(', ')[1].split('. ')[0])
    # A single non-inplace replace returns a new Series, which the caller
    # assigns back; chained inplace replace on df['col'] may not write through.
    return titles.replace(
        {'Col': 'Rare', 'Dr': 'Rare', 'Rev': 'Rare', 'Mlle': 'Miss', 'Ms': 'Miss'}
    )


# Extract and integrate honorifics
df['honorific'] = _extract_honorific(df['Name'])
df_test['honorific'] = _extract_honorific(df_test['Name'])

# One-hot encode the honorific; the first category of each frame is dropped.
df = pd.get_dummies(df, drop_first=True, columns=['honorific'])
df_test = pd.get_dummies(df_test, drop_first=True, columns=['honorific'])

In [None]:
# Remove the raw text columns now that features have been derived from them.
text_columns = ['Name', 'Ticket']
df = df.drop(columns=text_columns)
df_test = df_test.drop(columns=text_columns)

In [None]:
# Show the first five rows of the processed training frame.
df.head(n=5)

In [None]:
# Show the first five rows of the processed test frame.
df_test.head(n=5)

In [None]:
# Columns present in the training frame but absent from the test frame.
train_only_columns = set(df.columns).difference(df_test.columns)
print(train_only_columns)

In [None]:
# Give the test frame exactly the training frame's columns, in the same order.
# Columns that exist only in train ('Survived' and the honorific dummies that
# never occur in test) are created and filled with 0, so the list of missing
# columns no longer has to be maintained by hand.
df_test = df_test.reindex(columns=df.columns, fill_value=0)

In [None]:
# Show the first five rows of the test frame after adding the missing columns.
df_test.head(n=5)

**Building a Baseline Model**

In [None]:
target_col = 'Survived'
drop_col = ['PassengerId','Survived','Fare', 'Cabin', 'Age', 'Parch', 'FamilySize', 'SibSp']
# Retain only the features necessary for training
train_feature = df.drop(columns=drop_col)
# Select the test columns in the training column order: the estimators are
# fitted on train_feature and expect the same feature order at predict time.
test_feature = df_test.drop(columns=drop_col)[train_feature.columns]
train_tagert = df[target_col]
# Split train data into a fitting part and a stratified 20% hold-out part
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_tagert, test_size=0.2, random_state=0, stratify=train_tagert)

In [None]:
# Share of survivors in the training split, and the accuracy obtained by
# always predicting the complementary class.
n_survivors = y_train.sum()
survive_rate = n_survivors / y_train.shape[0]
print(f'survive rate:{survive_rate}')
print(f'base line accuracy: {1 - survive_rate}')

In [None]:
# Fit each untuned classifier on the training split and report its accuracy on
# the training split and on the hold-out split.
baseline_models = {
    'RandomForestClassifier': RandomForestClassifier(random_state=0),
    'XGBClassifier': XGBClassifier(random_state=0),
    'LGBMClassifier': LGBMClassifier(random_state=0),
    'LogisticRegression': LogisticRegression(random_state=0),
    'SVC': SVC(random_state=0),
}

for model_name, model in baseline_models.items():
    model.fit(X_train, y_train)
    print('='*20)
    print(model_name)
    print(f'accuracy of train set: {model.score(X_train, y_train)}')
    # The second score is measured on the hold-out split, so label it as such.
    print(f'accuracy of test set: {model.score(X_test, y_test)}')

# Keep the fitted models reachable under their original names.
rfc, xgb, lgb, lr, svc = baseline_models.values()

In [None]:
cv = 5


def objective(trial):
    """Optuna objective: mean stratified k-fold CV score of a RandomForestClassifier.

    Samples the forest hyper-parameters from `trial`, evaluates the model on
    (X_train, y_train) with `cv` shuffled stratified folds and returns the mean
    test score, which the study below maximizes.
    """
    param_grid_rfc = {
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        'min_samples_split': trial.suggest_int("min_samples_split", 7, 15),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        'max_features': trial.suggest_int("max_features", 3, 10),
        "random_state": 0
    }

    model = RandomForestClassifier(**param_grid_rfc)

    # Evaluate the model with stratified k-fold CV (estimator's default score)
    kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=0)
    scores = cross_validate(model, X=X_train, y=y_train, cv=kf)
    # The study direction is 'maximize', so the mean score is returned as-is.
    return scores['test_score'].mean()


# Seed the sampler so the search gives the same result on every full re-run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)
print(study.best_params)
print(study.best_value)
# best_params holds only the suggested values; re-attach the fixed seed so
# models built from these parameters stay deterministic.
rfc_best_param = {**study.best_params, 'random_state': 0}

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV score of an XGBClassifier.

    Samples the booster hyper-parameters from `trial`, evaluates the model on
    (X_train, y_train) and returns the mean test score, which the study below
    maximizes.
    """
    param_grid_xgb = {
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 5),
        # suggest_float(step=...) replaces the deprecated suggest_discrete_uniform.
        'gamma': trial.suggest_float("gamma", 0.1, 1.0, step=0.1),
        'subsample': trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        "random_state": 0
    }

    model = XGBClassifier(**param_grid_xgb)

    # Evaluate the model with 5-fold stratified CV (estimator's default score)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(model, X=X_train, y=y_train, cv=kf)
    # The study direction is 'maximize', so the mean score is returned as-is.
    return scores['test_score'].mean()


# Seed the sampler so the search gives the same result on every full re-run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)
print(study.best_params)
print(study.best_value)
# best_params holds only the suggested values; re-attach the fixed seed.
xgb_best_param = {**study.best_params, 'random_state': 0}

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV score of an LGBMClassifier.

    Samples the booster hyper-parameters from `trial`, evaluates the model on
    (X_train, y_train) and returns the mean test score, which the study below
    maximizes.
    """
    param_grid_lgb = {
        'num_leaves': trial.suggest_int("num_leaves", 3, 10),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        "random_state": 0
    }

    model = LGBMClassifier(**param_grid_lgb)

    # Evaluate the model with 5-fold stratified CV (estimator's default score)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(model, X=X_train, y=y_train, cv=kf)
    # The study direction is 'maximize', so the mean score is returned as-is.
    return scores['test_score'].mean()


# Seed the sampler so the search gives the same result on every full re-run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=100)
print(study.best_params)
print(study.best_value)
# best_params holds only the suggested values; re-attach the fixed seed.
lgb_best_param = {**study.best_params, 'random_state': 0}

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV score of a LogisticRegression.

    Samples the integer regularization parameter C from `trial`, evaluates the
    model on (X_train, y_train) and returns the mean test score, which the
    study below maximizes.
    """
    param_grid_lr = {
        'C' : trial.suggest_int("C", 1, 100),
        "random_state": 0
    }

    model = LogisticRegression(**param_grid_lr)

    # Evaluate the model with 5-fold stratified CV (estimator's default score)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(model, X=X_train, y=y_train, cv=kf)
    # The study direction is 'maximize', so the mean score is returned as-is.
    return scores['test_score'].mean()


# Seed the sampler so the search gives the same result on every full re-run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=10)
print(study.best_params)
print(study.best_value)
# best_params holds only the suggested values; re-attach the fixed seed.
lr_best_param = {**study.best_params, 'random_state': 0}

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV score of an RBF-kernel SVC.

    Samples C and gamma from `trial`, evaluates the model on (X_train, y_train)
    and returns the mean test score, which the study below maximizes.
    """
    param_grid_svc = {
        'C' : trial.suggest_int("C", 50, 200),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'gamma': trial.suggest_float("gamma", 1e-4, 1.0, log=True),
        "random_state": 0,
        'kernel': 'rbf'
    }

    model = SVC(**param_grid_svc)

    # Evaluate the model with 5-fold stratified CV (estimator's default score)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(model, X=X_train, y=y_train, cv=kf)
    # The study direction is 'maximize', so the mean score is returned as-is.
    return scores['test_score'].mean()


# Seed the sampler so the search gives the same result on every full re-run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=10)
print(study.best_params)
print(study.best_value)
# best_params holds only the suggested values; re-attach the fixed settings
# that the tuned model was evaluated with.
svc_best_param = {**study.best_params, 'random_state': 0, 'kernel': 'rbf'}

In [None]:
# Evaluate each tuned model with 5-fold stratified CV on the full training data.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


def _seeded(params):
    """Return a copy of `params` with random_state fixed to 0."""
    # Merging into a new dict is safe whether or not the key is already present.
    return {**params, 'random_state': 0}


rfc_best = RandomForestClassifier(**_seeded(rfc_best_param))
xgb_best = XGBClassifier(**_seeded(xgb_best_param))
lgb_best = LGBMClassifier(**_seeded(lgb_best_param))
lr_best = LogisticRegression(**_seeded(lr_best_param))
svc_best = SVC(**_seeded(svc_best_param))

tuned_models = [
    ('RandomForestClassifier', rfc_best),
    ('XGBClassifier', xgb_best),
    ('LGBMClassifier', lgb_best),
    ('LogisticRegression', lr_best),
    ('SVC', svc_best),
]

for model_name, tuned_model in tuned_models:
    print(model_name)
    print('='*20)
    scores = cross_validate(tuned_model, X=train_feature, y=train_tagert, cv=kf)
    print(f'mean:{scores["test_score"].mean()}, std:{scores["test_score"].std()}')
    print('='*20)

In [None]:
from sklearn.ensemble import VotingClassifier

# Prepare classifiers for voting; each one is built from its own tuned
# parameters with the seed fixed to 0.
estimators = [
    ('rfc', RandomForestClassifier(**{**rfc_best_param, 'random_state': 0})),
    ('xgb', XGBClassifier(**{**xgb_best_param, 'random_state': 0})),
    ('lgb', LGBMClassifier(**{**lgb_best_param, 'random_state': 0})),
    ('lr', LogisticRegression(**{**lr_best_param, 'random_state': 0})),
    # The SVC takes the SVC study's parameters, not the logistic-regression ones.
    ('svc', SVC(**{**svc_best_param, 'random_state': 0}))
]
voting = VotingClassifier(estimators)

print('VotingClassifier')
print('='*20)
scores = cross_validate(voting, X=train_feature, y=train_tagert, cv=kf)
# Report the result in the same format as the individual models.
print(f'mean:{scores["test_score"].mean()}, std:{scores["test_score"].std()}')
print('='*20)

In [None]:
# Fit every tuned model on the full training data and predict the test set.
# Each model is built with the seed fixed to 0 so the predictions are repeatable.
# RandomForest
rfc_best = RandomForestClassifier(**{**rfc_best_param, 'random_state': 0})
# XGBoost
xgb_best = XGBClassifier(**{**xgb_best_param, 'random_state': 0})
# LightGBM
lgb_best = LGBMClassifier(**{**lgb_best_param, 'random_state': 0})
# LogisticRegression
lr_best = LogisticRegression(**{**lr_best_param, 'random_state': 0})
# SVC
svc_best = SVC(**{**svc_best_param, 'random_state': 0})

final_models = {
    'rfc': rfc_best,
    'xgb': xgb_best,
    'lgb': lgb_best,
    'lr': lr_best,
    'svc': svc_best,
}

# Present the test features in the same column order used for fitting.
test_feature_aligned = test_feature[train_feature.columns]

# prediction
pred = {}
for model_key, final_model in final_models.items():
    final_model.fit(train_feature, train_tagert)
    pred[model_key] = final_model.predict(test_feature_aligned)


**submission**

In [None]:
# Write one submission CSV per model: PassengerId next to the predicted label.
passenger_ids = df_test['PassengerId'].reset_index(drop=True)
for key, value in pred.items():
    submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': value})
    submission.to_csv(f'output_{key}.csv', index=False)