# Library

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

# Config

In [2]:
DATA_DIR = '../data/'
MODEL_INPUT_DIR = DATA_DIR + '05_model_input/'
MODEL_OUTPUT_DIR = DATA_DIR + '07_model_output/'
DATASET_DIR = 'titanic/'

INPUT_DIR = MODEL_INPUT_DIR + DATASET_DIR
OUTPUT_DIR = MODEL_OUTPUT_DIR + DATASET_DIR

In [3]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
ID_COL = 'PassengerId'
Y_COL = 'Survived'

# Load data

In [5]:
base_df = pd.read_pickle(INPUT_DIR + 'train_test.pkl')
base_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,


# Prepare modeling

In [6]:
for c in base_df.columns:
    print("  '" + c + "',")

  'PassengerId',
  'Survived',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked',
  'flg',
  'fix_Name',
  'len_Name',
  'fix_Ticket',
  'len_Ticket',
  'first_chars_Ticket',
  'fix_Cabin',
  'len_Cabin',
  'first_chars_Cabin',


In [17]:
feature_cols = [
  # 'PassengerId',
  # 'Survived',
  'Pclass',
  # 'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  # 'Ticket',
  'Fare',
  # 'Cabin',
  'Embarked',
  # 'flg',
  # 'fix_Name',
  # 'len_Name',
  # 'fix_Ticket',
  'len_Ticket',
  'first_chars_Ticket',
  # 'fix_Cabin',
  'len_Cabin',
  'first_chars_Cabin',
]

train_df = base_df[base_df['flg'] == 'train']
test_df = base_df[base_df['flg'] == 'test']

In [29]:
train_df[feature_cols].isnull().sum()

Pclass                  0
Sex                     0
Age                   177
SibSp                   0
Parch                   0
Fare                    0
Embarked                2
len_Ticket              0
first_chars_Ticket      0
len_Cabin               0
first_chars_Cabin       0
dtype: int64

# Modeling

In [38]:
s = setup(
    train_df.loc[:, feature_cols + [Y_COL]], 
    target = Y_COL,
    log_experiment = True,
    experiment_name = 'test1',
    log_plots = True,
    # log_profile = True,
    # log_data = True
)

Unnamed: 0,Description,Value
0,session_id,5624
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(891, 12)"
5,Missing Values,True
6,Numeric Features,2
7,Categorical Features,8
8,Ordinal Features,False
9,High Cardinality Features,False


In [39]:
top_3 = compare_models(n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8202,0.8652,0.7272,0.803,0.7587,0.6162,0.6222,0.195
lightgbm,Light Gradient Boosting Machine,0.8185,0.8708,0.7185,0.7962,0.7518,0.6099,0.6152,0.018
gbc,Gradient Boosting Classifier,0.8089,0.8581,0.6897,0.791,0.7345,0.5869,0.5923,0.044
dt,Decision Tree Classifier,0.8056,0.792,0.7268,0.7647,0.7416,0.5865,0.5901,0.007
ridge,Ridge Classifier,0.8041,0.0,0.7068,0.7712,0.7361,0.5812,0.5839,0.005
lr,Logistic Regression,0.8009,0.8445,0.6985,0.7704,0.7301,0.5735,0.5774,0.345
et,Extra Trees Classifier,0.8009,0.8509,0.723,0.7607,0.7384,0.5781,0.5815,0.194
lda,Linear Discriminant Analysis,0.7977,0.8373,0.711,0.7551,0.731,0.5695,0.5714,0.009
ada,Ada Boost Classifier,0.7961,0.8264,0.7232,0.7463,0.7332,0.5684,0.57,0.039
knn,K Neighbors Classifier,0.6758,0.7304,0.5298,0.5921,0.5558,0.3033,0.3063,0.014


In [40]:
blend_soft = blend_models(top_3, method='soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7778,0.8232,0.64,0.7619,0.6957,0.5227,0.5277
1,0.8889,0.9563,0.88,0.8462,0.8627,0.7695,0.7699
2,0.873,0.9103,0.875,0.8077,0.84,0.735,0.7366
3,0.7903,0.8783,0.6667,0.7619,0.7111,0.5477,0.5507
4,0.871,0.9057,0.75,0.9,0.8182,0.7195,0.7266
5,0.871,0.8871,0.8333,0.8333,0.8333,0.7281,0.7281
6,0.8387,0.8421,0.6667,0.8889,0.7619,0.6437,0.6589
7,0.7742,0.795,0.625,0.75,0.6818,0.509,0.5141
8,0.8548,0.8728,0.75,0.8571,0.8,0.6869,0.6906
9,0.871,0.8805,0.7083,0.9444,0.8095,0.7149,0.7319


In [41]:
finalize_model(blend_soft)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
        

# Experiment logging

In [43]:
# !mlflow ui --host 0.0.0.0

# Submission

In [44]:
# blend_soft

In [45]:
predictions = predict_model(blend_soft, test_df.loc[:, feature_cols])
test_df[Y_COL] = predictions['Label']
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,...,Q,test,"Kelly, Mr. James",16,330911,6,3,-1,2,
1,893,0,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,...,S,test,"Wilkes, Mrs. James (Ellen Needs)",32,363272,6,3,-1,2,
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,...,Q,test,"Myles, Mr. Thomas Francis",25,240276,6,2,-1,2,
3,895,1,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,...,S,test,"Wirz, Mr. Albert",16,315154,6,3,-1,2,
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,...,S,test,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",44,3101298,7,3,-1,2,


In [46]:
test_df.loc[:, [ID_COL, Y_COL]].to_csv(OUTPUT_DIR + 'submission.csv', index=False)