In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Location of the raw dataset, split into base directory, dataset
# sub-directory and file name (joined together when the CSV is loaded).
path_raw, dirname, filename = '../data/raw/', 'titanic/', 'titanic.csv'

## Load data

In [3]:
# Assemble the CSV location from the configured pieces, then read it.
raw_csv_path = os.path.join(path_raw, dirname, filename)
raw_df = pd.read_csv(raw_csv_path)

In [4]:
# Quick sanity check of the load: (rows, columns) first, then the first
# three rows as the cell's displayed value.
print(raw_df.shape)
raw_df.head(n=3)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
# Names of the X (feature) columns, grouped by kind.
features_type = dict(
    qualitative=[
        'PassengerId',
        'Pclass',
        'Name',
        'Sex',
        'Ticket',
        'Cabin',
        'Embarked',
    ],
    quantitative=[
        'Age',
        'SibSp',
        'Parch',
        'Fare',
    ],
)

# Name of the target (y) column.
y_name = 'Survived'

## Execution

In [6]:
# Output directory passed as `path_output` to the process and evaluate steps below.
path_output = '../data/output/titanic/'

In [7]:
# Single random seed shared by the prepare, train and evaluate steps below.
SEED = 42

In [8]:
# NOTE(review): notebook convention is to keep every import in the first cell;
# consider moving this cell's contents up next to the pandas/numpy imports.
import sys
# Extend the import search path with the project sources (relative to the
# notebook's working directory); presumably this is where the packages
# imported below live, so it has to run before them.
sys.path.append('../src/')

# Entry points for the four pipeline stages used in this notebook:
# process -> prepare -> train -> evaluate.
from process.process import octopus_process
from process.prepare import octopus_prepare
from modeling.train  import octopus_train
from evaluate.evaluate import octopus_evaluate

In [9]:
# Processing / data-cleaning stage.
# NOTE(review): `features_type` is both passed to `run` and rebound from its
# return value, so re-running this cell feeds the previously returned mapping
# back in instead of the one defined in the configuration cell.
octo_process = octopus_process(
    method_missing_quanti='median',
    outliers_method='lof',
    alpha_sta=0.05,
)

X, y, features_type = octo_process.run(
    data=raw_df,
    y_name=y_name,
    features_type=features_type,
    path_output=path_output,
)

2021-05-19 22:53:11,284 INFO: Started to check the features consistency
2021-05-19 22:53:11,325 INFO: Features: ['PassengerId', 'Name', 'Ticket', 'Cabin'] were removed because its distribution
2021-05-19 22:53:11,327 INFO: Consistency values finished!
2021-05-19 22:53:11,334 INFO: Feature Age was imputer with the method median value = 28.0
2021-05-19 22:53:11,336 INFO: Feature Embarked was imputer with "other"
2021-05-19 22:53:11,338 INFO: None feature were removed because the missing values
2021-05-19 22:53:11,339 INFO: Handle missing values finished!
2021-05-19 22:53:11,341 INFO: Detect outliers started
2021-05-19 22:53:11,343 INFO: Local Outlier Factor method selected
2021-05-19 22:53:11,372 INFO: Detected 146 outliers
2021-05-19 22:53:11,374 INFO: Detect outliers finished


<Figure size 432x288 with 0 Axes>

In [10]:
# Preparation stage: turn the processed X / y into the train and test
# splits consumed by the modeling and evaluation cells.
octo_prepare = octopus_prepare(seed=SEED, method_scale='standard')

X_train, X_test, y_train, y_test = octo_prepare.run(
    X=X,
    y=y,
    features_type=features_type,
)

In [11]:
%%time
# modeling
octo_train = octopus_train(seed = SEED,
                           metric = 'accuracy',
                           njobs = -1)

models_trained = octo_train.run(X_train, y_train)

CPU times: user 7min 43s, sys: 13.3 s, total: 7min 56s
Wall time: 2min 38s


In [12]:
# Evaluation stage. The call to `run` is deliberately the last expression so
# that its return value is rendered as the cell output.
octo_eval = octopus_evaluate(metric='recall', seed=SEED)

octo_eval.run(X_train, y_train, X_test, y_test, models_trained, path_output)

LR model: recall = 0.762015 (0.074992)
LRR model: recall = 0.705910 (0.070878)
RF model: recall = 0.799083 (0.050747)
XGB model: recall = 0.830560 (0.059204)


(('XGB',
  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=0.7439419453116151, gamma=0,
                gpu_id=-1, importance_type='gain', interaction_constraints='',
                learning_rate=0.021939320923203304, max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=129, n_jobs=8, num_parallel_tree=1, random_state=42,
                reg_alpha=0, reg_lambda=1, scale_pos_weight=2.0, subsample=1,
                tree_method='exact', validate_parameters=1, verbosity=0)),
        metric variable     value
 0      recall       LR  0.740741
 1   precision       LR  0.571429
 2          f1       LR  0.645161
 3    accuracy       LR  0.764706
 4         auc       LR  0.814397
 5      recall      LRR  0.685185
 6   precision      LRR  0.627119
 7          f1      LRR  0.654867
 8    accuracy      LRR  0.791444
 9         auc      LRR  

<Figure size 432x288 with 0 Axes>

<Figure size 720x288 with 0 Axes>