In [1]:
import pandas as pd
import numpy as np
import os
import sys
# Make the local '../src/' directory importable (presumably where the `octopus`
# package imported further down lives — confirm against the repo layout).
# The membership check keeps this cell idempotent: re-running it does not add
# the same entry to sys.path a second time.
if '../src/' not in sys.path:
    sys.path.append('../src/')

In [2]:
# Pieces of the raw CSV location; the load cell joins them with os.path.join
# in this order: path_raw / dirname / filename.
path_raw = '../data/raw/'
dirname = 'titanic/'
filename = 'titanic.csv'

## Load data

In [3]:
# Read the CSV found at <path_raw>/<dirname>/<filename> into a DataFrame.
# NOTE(review): the saved preview of this frame shows an "Unnamed: 0" column,
# which suggests the file was written with its index — consider index_col=0
# if that column is not wanted.
raw_df = pd.read_csv(
    os.path.join(path_raw, dirname, filename)
)

In [4]:
# Sanity check of the load: print the (rows, columns) shape, then display
# the first three rows as the cell output.
print(raw_df.shape)
raw_df.head(n=3)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Execution

In [5]:
# Input feature names grouped by kind. This mapping is what gets passed as
# `features_type` to the octopus calls further down.
features_type = {
    'qualitative': [
        'PassengerId', 'Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked',
    ],
    'quantitative': [
        'Age', 'SibSp', 'Parch', 'Fare',
    ],
}

# Name of the target column.
y_name = 'Survived'

In [6]:
# Output location handed to the octopus steps as `path_output`.
path_output = '../data/output/titanic/'

# Settings passed (positionally) to OctopusProcess.
test_size = 0.25
min_missing = 0.25
outliers_method = 'lof'  # the run log reports "Local Outlier Factor method selected"
alpha_sta = 0.05

# Settings passed to OctopusPrepare.
strategy_missing = 'median'
method_scale = 'standard'

# Metric name given to both OctopusTrain and OctopusEvaluate.
metric_train = 'accuracy'

# Seed shared by OctopusProcess, OctopusTrain and OctopusEvaluate.
seed = 42

# Passed to OctopusTrain as `njobs`.
njobs = -1

In [7]:
# Hand the processing step its own deep copy, so raw_df stays as loaded.
data = raw_df.copy(deep=True)

In [9]:
from datetime import datetime
from octopus import OctopusProcess
from octopus import OctopusPrepare
from octopus import OctopusTrain
from octopus import OctopusEvaluate

In [11]:
# Configure the processing step. Arguments are positional, so their order
# must stay exactly as written.
octoProcess = OctopusProcess(
    test_size,
    min_missing,
    outliers_method,
    alpha_sta,
    seed,
)

# process_data returns four values: the train split, the test split, a
# features_type mapping and an `html` object.
# NOTE(review): `features_type` is rebound to the returned mapping here, so
# re-running this cell, or running any later cell, uses the returned value
# rather than the mapping defined in the features cell — confirm this is intended.
train, test, features_type, html = octoProcess.process_data(
    data,
    y_name,
    features_type,
    path_output,
)

# Each split unpacks into a (features, target) pair.
X_train, y_train = train
X_test, y_test = test

2021-05-23 18:24:23,381 INFO: Started to check the features consistency
2021-05-23 18:24:23,410 INFO: Features: ['PassengerId', 'Name', 'Ticket', 'Cabin'] were removed because its distribution
2021-05-23 18:24:23,411 INFO: Consistency values finished!
2021-05-23 18:24:23,416 INFO: Feature Embarked was imputer with "other"
2021-05-23 18:24:23,418 INFO: None feature were removed because the missing values
2021-05-23 18:24:23,419 INFO: Check the missing values finished!
2021-05-23 18:24:23,421 INFO: Detect outliers started
2021-05-23 18:24:23,422 INFO: Local Outlier Factor method selected
2021-05-23 18:24:23,455 INFO: Detected 80 outliers
2021-05-23 18:24:23,456 INFO: Detect outliers finished


<Figure size 432x288 with 0 Axes>

In [12]:
# OctopusPrepare receives strategy_missing and method_scale (positionally);
# prepare_pipeline(features_type) returns the object that is later handed to
# OctopusTrain.train as its third argument.
octoPrepare = OctopusPrepare(strategy_missing, method_scale)
preparessor = octoPrepare.prepare_pipeline(features_type)

In [13]:
# Wall-clock timing around the whole training step.
start = datetime.now()

# Train the models on the training split, using the prepared `preparessor`.
octoTrain = OctopusTrain(metric=metric_train, seed=seed, njobs=njobs)
models_trained = octoTrain.train(X_train, y_train, preparessor)

finish = datetime.now()
print('Time execution training models:', finish - start)

Logistic Regression is going to be fitted
Logistic Regression fitted
Regularized Logistic Regression is going to be fitted
Regularized Logistic Regression fitted
Random Forest is going to be fitted
Random Forest fitted
XGBoost is going to be fitted
XGBoost fitted
Time execution training models: 0:02:50.798944


In [17]:
# Evaluate the trained models with the same metric and seed used for training.
OctoEval = OctopusEvaluate(metric=metric_train, seed=seed)

# evaluate() returns two values, bound here to best_model and metrics_df.
best_model, metrics_df = OctoEval.evaluate(
    X_train, y_train,
    X_test, y_test,
    models_trained,
    path_output,
)

LR model: accuracy = 0.795956 (0.034484)
LRR model: accuracy = 0.795956 (0.034484)
RF model: accuracy = 0.827077 (0.048432)
XGB model: accuracy = 0.810820 (0.035628)


<Figure size 432x288 with 0 Axes>

<Figure size 720x288 with 0 Axes>

In [None]:
# Collect the artefacts of the step-by-step run in a single dict.
results = {
    'data_train': (X_train, y_train),
    'data_test': (X_test, y_test),
    'models_trained': models_trained,
    'best_model': best_model,
    'metrics': metrics_df,
}

In [7]:
from octopus import octopus_ml

In [8]:
# Configure the all-in-one octopus_ml runner from the configuration cell.
# The values for `method_missing_quanti` and `metric_train` are taken from
# `strategy_missing` and `metric_train`: the names previously used here
# (`method_missing_quanti`, `metric`) are not defined anywhere in this
# notebook and raise NameError on Restart & Run All.
octo = octopus_ml(
    method_missing_quanti=strategy_missing,
    outliers_method=outliers_method,
    alpha_sta=alpha_sta,
    seed=seed,
    method_scale=method_scale,
    metric_train=metric_train,
    njobs=njobs,
)

In [9]:
# Run the octopus_ml runner on the raw DataFrame.
# NOTE(review): this rebinds `results`, replacing the dict assembled in the
# step-by-step section above.
results = octo.run(
    data=raw_df,
    y_name=y_name,
    features_type=features_type,
    path_output=path_output,
)

2021-05-20 22:35:34,228 INFO: Started to check the features consistency
2021-05-20 22:35:34,260 INFO: Features: ['PassengerId', 'Name', 'Ticket', 'Cabin'] were removed because its distribution
2021-05-20 22:35:34,261 INFO: Consistency values finished!
2021-05-20 22:35:34,266 INFO: Feature Age was imputer with the method median value = 28.0
2021-05-20 22:35:34,267 INFO: Feature Embarked was imputer with "other"
2021-05-20 22:35:34,268 INFO: None feature were removed because the missing values
2021-05-20 22:35:34,269 INFO: Handle missing values finished!
2021-05-20 22:35:34,270 INFO: Detect outliers started
2021-05-20 22:35:34,271 INFO: Local Outlier Factor method selected
2021-05-20 22:35:34,302 INFO: Detected 146 outliers
2021-05-20 22:35:34,303 INFO: Detect outliers finished


LR model: accuracy = 0.801006 (0.041433)
LRR model: accuracy = 0.799253 (0.036570)
RF model: accuracy = 0.826039 (0.034669)
XGB model: accuracy = 0.824318 (0.048278)


<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 720x288 with 0 Axes>

In [12]:
# Display the entry stored under 'best_model'; in the saved output it is a
# (model name, fitted estimator) tuple.
results['best_model']

('RF',
 RandomForestClassifier(max_depth=8, max_features=0.37384777848686024,
                        min_samples_leaf=9, min_samples_split=73,
                        n_estimators=248, random_state=42))

## What is missing?

- Improve the documentation.
- Add more options for handling missing values (imputation strategies).