# Library

In [53]:
import pandas as pd
import numpy as np
from pycaret.classification import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

# Config

In [50]:
# Directory layout: <data root>/<pipeline stage>/<dataset>/.
# Every value keeps its trailing slash because later cells append file
# names by plain string concatenation.
DATA_DIR = '../data/'
DATASET_DIR = 'titanic/'

MODEL_INPUT_DIR = f'{DATA_DIR}05_model_input/'
MODEL_OUTPUT_DIR = f'{DATA_DIR}07_model_output/'

# Dataset-specific locations read from / written to by this notebook.
INPUT_DIR = f'{MODEL_INPUT_DIR}{DATASET_DIR}'
OUTPUT_DIR = f'{MODEL_OUTPUT_DIR}{DATASET_DIR}'

In [54]:
# Make sure the output directory (including any missing parents) exists.
# exist_ok=True replaces the separate exists() check, so there is no window
# between "check" and "create" in which another process could create the
# directory and make makedirs raise.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Column holding the row identifier and column holding the prediction target.
ID_COL, Y_COL = 'PassengerId', 'Survived'

# Load data

In [4]:
# Load the pickled frame that holds both the train and the test rows
# (they are told apart by the `flg` column further down).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files produced by this project's own pipeline.
train_test_path = INPUT_DIR + 'train_test.pkl'
base_df = pd.read_pickle(train_test_path)
base_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,


# Prepare modeling

In [6]:
# Print each column name as an indented, quoted, comma-terminated line so the
# output can be pasted straight into a Python list literal.
for column_name in base_df.columns:
    print("".join(("  '", column_name, "',")))

  'PassengerId',
  'Survived',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked',
  'flg',
  'fix_Name',
  'len_Name',
  'fix_Ticket',
  'len_Ticket',
  'first_chars_Ticket',
  'fix_Cabin',
  'len_Cabin',
  'first_chars_Cabin',


In [35]:
# Columns fed to the model. Deliberately left out of the columns listed above:
# PassengerId, Survived (the target), the raw text columns Name / Ticket /
# Cabin, the `flg` split marker, and the derived fix_* / len_* columns as well
# as first_chars_Cabin.
feature_cols = [
    'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch',
    'Fare', 'Embarked',
    'first_chars_Ticket',
]

# Split the combined frame into its train and test parts using the `flg`
# marker column.
# .copy() gives each part its own data: a later cell assigns a column on
# test_df, and doing that on a boolean-mask slice of base_df raises pandas'
# SettingWithCopyWarning and leaves it ambiguous which object was modified.
train_df = base_df[base_df['flg'] == 'train'].copy()
test_df = base_df[base_df['flg'] == 'test'].copy()

In [36]:
# Count missing values per selected feature in the training rows.
train_df.loc[:, feature_cols].isna().sum()

Pclass                  0
Sex                     0
Age                   177
SibSp                   0
Parch                   0
Fare                    0
Embarked                2
first_chars_Ticket      0
dtype: int64

# Modeling

In [37]:
# Initialise the PyCaret experiment on the selected features plus the target.
# session_id fixes the experiment's random seed. Without it PyCaret draws a
# new seed on every run, so the scores below could not be reproduced.
# 7409 is the seed that was drawn for the recorded run (see the session_id row
# of the setup summary and random_state in the fitted estimators).
s = setup(
    train_df.loc[:, feature_cols + [Y_COL]],
    target=Y_COL,
    session_id=7409,
)

Unnamed: 0,Description,Value
0,session_id,7409
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(891, 9)"
5,Missing Values,True
6,Numeric Features,2
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


In [39]:
# Train and cross-validate the candidate classifiers, keeping the best ones
# (as ranked in the comparison table) for blending in the next step.
N_BLEND_CANDIDATES = 5
top_5 = compare_models(n_select=N_BLEND_CANDIDATES)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.838,0.884,0.7597,0.8145,0.784,0.6547,0.6581,0.017
gbc,Gradient Boosting Classifier,0.8282,0.8827,0.7183,0.8229,0.7641,0.6301,0.6366,0.039
ada,Ada Boost Classifier,0.8203,0.8492,0.7555,0.7803,0.7664,0.6205,0.6221,0.038
lr,Logistic Regression,0.8171,0.8583,0.7352,0.7872,0.758,0.6114,0.6145,0.05
rf,Random Forest Classifier,0.8171,0.875,0.743,0.7796,0.7584,0.6115,0.6148,0.211
dt,Decision Tree Classifier,0.8122,0.8114,0.7803,0.7531,0.7638,0.6084,0.6119,0.006
ridge,Ridge Classifier,0.8107,0.0,0.6937,0.8001,0.7402,0.5925,0.5988,0.007
lda,Linear Discriminant Analysis,0.8075,0.8602,0.6937,0.7922,0.7371,0.5862,0.5919,0.007
et,Extra Trees Classifier,0.7946,0.8482,0.7432,0.7376,0.7367,0.5687,0.5731,0.17
knn,K Neighbors Classifier,0.7046,0.7363,0.574,0.637,0.5981,0.3669,0.3721,0.014


In [40]:
# Combine the selected models into one voting ensemble; 'soft' voting is
# requested explicitly via `method`.
blend_soft = blend_models(estimator_list=top_5, method='soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8571,0.9642,0.8,0.8333,0.8163,0.6995,0.6999
1,0.8571,0.9253,0.92,0.7667,0.8364,0.7115,0.7208
2,0.7937,0.7943,0.75,0.72,0.7347,0.566,0.5663
3,0.9032,0.9397,0.7917,0.95,0.8636,0.7896,0.7975
4,0.9032,0.9024,0.8333,0.9091,0.8696,0.7929,0.7948
5,0.7903,0.9079,0.6667,0.7619,0.7111,0.5477,0.5507
6,0.871,0.9452,0.7083,0.9444,0.8095,0.7149,0.7319
7,0.7581,0.8092,0.6667,0.6957,0.6809,0.4862,0.4865
8,0.8065,0.8268,0.625,0.8333,0.7143,0.5724,0.586
9,0.8871,0.8925,0.7917,0.9048,0.8444,0.7565,0.7606


In [44]:
# Display the estimator objects selected by compare_models, including the
# hyper-parameters each one was created with.
top_5

[LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.1, max_depth=-1,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                random_state=7409, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=3,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=100,
                            n_iter_no_change=None, presort='deprecated',
         

# Experiment logging

In [42]:
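# Launches the MLflow tracking UI (served at http://127.0.0.1:5000 in the run
# recorded below). Kept commented out because the command blocks the cell
# until it is interrupted; prefer running `mlflow ui` in a separate terminal.
# !mlflow ui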

[2022-04-10 06:11:06 +0000] [17633] [INFO] Starting gunicorn 20.1.0
[2022-04-10 06:11:06 +0000] [17633] [INFO] Listening at: http://127.0.0.1:5000 (17633)
[2022-04-10 06:11:06 +0000] [17633] [INFO] Using worker: sync
[2022-04-10 06:11:06 +0000] [17635] [INFO] Booting worker with pid: 17635
^C
[2022-04-10 06:13:06 +0000] [17633] [INFO] Handling signal: int
[2022-04-10 06:13:07 +0000] [17635] [INFO] Worker exiting (pid: 17635)


# Submission

In [45]:
# blend_soft

In [49]:
# Score the test rows with the blended model; only the feature columns are
# passed, exactly as during training.
predictions = predict_model(blend_soft, test_df.loc[:, feature_cols])

# Attach the predicted class by rebinding test_df to a new frame instead of
# assigning a column in place: test_df originates from a filtered selection of
# base_df, and writing into it directly is what pandas flags with
# SettingWithCopyWarning.
test_df = test_df.assign(**{Y_COL: predictions['Label']})

# The column is matched to test_df by index label. A missing value here would
# mean the prediction index did not line up, which would otherwise end up
# silently in the submission file.
assert test_df[Y_COL].notna().all(), 'some test rows did not receive a prediction'

test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,...,Q,test,"Kelly, Mr. James",16,330911,6,3,-1,2,
1,893,0,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,...,S,test,"Wilkes, Mrs. James (Ellen Needs)",32,363272,6,3,-1,2,
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,...,Q,test,"Myles, Mr. Thomas Francis",25,240276,6,2,-1,2,
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,...,S,test,"Wirz, Mr. Albert",16,315154,6,3,-1,2,
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,...,S,test,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",44,3101298,7,3,-1,2,


In [58]:
# Write the submission file: identifier and predicted target only, without
# the DataFrame index.
submission_path = OUTPUT_DIR + 'submission.csv'
test_df[[ID_COL, Y_COL]].to_csv(submission_path, index=False)