# Library

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

# Config

In [2]:
DATA_DIR = '../data/'
MODEL_INPUT_DIR = DATA_DIR + '05_model_input/'
MODEL_OUTPUT_DIR = DATA_DIR + '07_model_output/'
DATASET_DIR = 'titanic/'

INPUT_DIR = MODEL_INPUT_DIR + DATASET_DIR
OUTPUT_DIR = MODEL_OUTPUT_DIR + DATASET_DIR

In [3]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
ID_COL = 'PassengerId'
Y_COL = 'Survived'

# Load data

In [5]:
base_df = pd.read_pickle(INPUT_DIR + 'train_test.pkl')
base_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,"Braund, Mr. Owen Harris",23,A/5 21171,9,A,-1,2,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51,PC 17599,8,P,C85,3,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,"Heikkinen, Miss. Laina",22,STON/O2. 3101282,16,S,-1,2,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44,113803,6,1,C123,4,
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,"Allen, Mr. William Henry",24,373450,6,3,-1,2,


# Prepare modeling

In [6]:
for c in base_df.columns:
    print("  '" + c + "',")

  'PassengerId',
  'Survived',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked',
  'flg',
  'fix_Name',
  'len_Name',
  'fix_Ticket',
  'len_Ticket',
  'first_chars_Ticket',
  'fix_Cabin',
  'len_Cabin',
  'first_chars_Cabin',


In [7]:
feature_cols = [
  # 'PassengerId',
  # 'Survived',
  'Pclass',
  # 'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  # 'Ticket',
  'Fare',
  # 'Cabin',
  'Embarked',
  # 'flg',
  # 'fix_Name',
  # 'len_Name',
  # 'fix_Ticket',
  'len_Ticket',
  'first_chars_Ticket',
  # 'fix_Cabin',
  'len_Cabin',
  'first_chars_Cabin',
]

train_df = base_df[base_df['flg'] == 'train']
test_df = base_df[base_df['flg'] == 'test']

In [8]:
train_df[feature_cols].isnull().sum()

Pclass                  0
Sex                     0
Age                   177
SibSp                   0
Parch                   0
Fare                    0
Embarked                2
len_Ticket              0
first_chars_Ticket      0
len_Cabin               0
first_chars_Cabin       0
dtype: int64

# Modeling

In [10]:
s = setup(
    train_df.loc[:, feature_cols + [Y_COL]], 
    target = Y_COL,
    log_experiment = True,
    experiment_name = 'test1',
    log_plots = True,
    # log_profile = True,
    # log_data = True
)

Unnamed: 0,Description,Value
0,session_id,452
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(891, 12)"
5,Missing Values,True
6,Numeric Features,2
7,Categorical Features,8
8,Ordinal Features,False
9,High Cardinality Features,False


In [11]:
top_3 = compare_models(n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8283,0.891,0.739,0.8148,0.7733,0.6359,0.6396,0.023
lightgbm,Light Gradient Boosting Machine,0.8137,0.876,0.743,0.7807,0.759,0.6079,0.6107,0.01
lr,Logistic Regression,0.8092,0.8724,0.731,0.7831,0.7523,0.5979,0.6024,0.264
ridge,Ridge Classifier,0.8011,0.0,0.7308,0.7656,0.7443,0.5822,0.5859,0.003
rf,Random Forest Classifier,0.7979,0.8669,0.735,0.7578,0.7437,0.5772,0.5801,0.076
lda,Linear Discriminant Analysis,0.7962,0.8727,0.7388,0.753,0.7419,0.5741,0.5782,0.004
ada,Ada Boost Classifier,0.7931,0.8622,0.7555,0.7348,0.7424,0.57,0.5731,0.019
et,Extra Trees Classifier,0.7835,0.843,0.719,0.7397,0.7262,0.5476,0.5507,0.066
dt,Decision Tree Classifier,0.7802,0.772,0.723,0.7332,0.7259,0.5428,0.5451,0.003
knn,K Neighbors Classifier,0.7144,0.7318,0.5822,0.6645,0.6137,0.3908,0.3975,0.008


In [12]:
blend_soft = blend_models(top_3, method='soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8095,0.9011,0.72,0.7826,0.75,0.5966,0.5979
1,0.8254,0.9168,0.8,0.7692,0.7843,0.6377,0.6381
2,0.8571,0.8884,0.72,0.9,0.8,0.691,0.7014
3,0.8387,0.8618,0.75,0.8182,0.7826,0.6548,0.6564
4,0.7742,0.8627,0.64,0.7619,0.6957,0.5183,0.5233
5,0.9194,0.9535,0.88,0.9167,0.898,0.8313,0.8318
6,0.8065,0.8097,0.68,0.8095,0.7391,0.5871,0.5928
7,0.7903,0.8605,0.72,0.75,0.7347,0.5615,0.5618
8,0.871,0.9557,0.88,0.8148,0.8462,0.7353,0.7369
9,0.8548,0.8973,0.76,0.8636,0.8085,0.6924,0.6961


In [13]:
finalize_model(blend_soft)

VotingClassifier(estimators=[('gbc',
                              GradientBoostingClassifier(ccp_alpha=0.0,
                                                         criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.1,
                                                         loss='deviance',
                                                         max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                             

# Experiment logging

In [17]:
# !mlflow ui --host 0.0.0.0

# Submission

In [18]:
# blend_soft

In [19]:
predictions = predict_model(blend_soft, test_df.loc[:, feature_cols])
test_df[Y_COL] = predictions['Label']
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,flg,fix_Name,len_Name,fix_Ticket,len_Ticket,first_chars_Ticket,fix_Cabin,len_Cabin,first_chars_Cabin
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,...,Q,test,"Kelly, Mr. James",16,330911,6,3,-1,2,
1,893,0,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,...,S,test,"Wilkes, Mrs. James (Ellen Needs)",32,363272,6,3,-1,2,
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,...,Q,test,"Myles, Mr. Thomas Francis",25,240276,6,2,-1,2,
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,...,S,test,"Wirz, Mr. Albert",16,315154,6,3,-1,2,
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,...,S,test,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",44,3101298,7,3,-1,2,


In [20]:
test_df.loc[:, [ID_COL, Y_COL]].to_csv(OUTPUT_DIR + 'submission.csv', index=False)