In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Pieces of the raw CSV location: base folder, dataset sub-folder, file name.
# They are joined into a single path when the data is loaded.
path_raw, dirname, filename = '../data/raw/', 'titanic/', 'titanic.csv'

## Load data

In [3]:
# Read the raw CSV from <path_raw>/<dirname>/<filename> into a DataFrame.
raw_df = pd.read_csv(
    filepath_or_buffer=os.path.join(path_raw, dirname, filename)
)

In [4]:
#raw_df.info()

In [5]:
# Sanity check on the loaded frame: (rows, columns) first, then the first 3 rows.
print(raw_df.shape)
raw_df.head(n=3)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [6]:
# Column names grouped by measurement type.
# 'Survived' is intentionally not listed in either group.
features_type = dict(
    qualitative=[
        'PassengerId', 'Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked',
    ],
    quantitative=[
        'Age', 'SibSp', 'Parch', 'Fare',
    ],
)

In [7]:
# Folder that receives the artefacts written by the pipeline (HTML report, log file).
path_output = "../data/output/titanic/"

In [14]:
import sys
import os

sys.path.append('../src/')

from utils import log

from preprocessing import preprocess_data
from outlier_detection import detect_outliers
from statistical_analysis import statistical_analysis

class octopus:
    """Run preprocessing, outlier removal and statistical analysis, and write an HTML report.

    The pipeline steps themselves live in the project modules `preprocessing`,
    `outlier_detection` and `statistical_analysis`; this class wires them
    together and accumulates the report markup in `self.html`.

    Parameters
    ----------
    data : object with a ``copy()`` method
        Input table (a pandas DataFrame in the notebook). A copy is stored.
    y_name : str
        Name of the target column, forwarded to the pipeline steps.
    features_type : dict
        Maps a feature kind (e.g. 'qualitative', 'quantitative') to an
        iterable of column names. Both the dict and its inner collections are
        copied, so the caller's object is never modified by the pipeline.
    outliers_method : str
        Outlier detection method, forwarded to `detect_outliers`. The values
        'adjbox', 'lof' and 'isolation_forest' have a descriptive label in the
        report; any other value is reported under its raw name.
    alpha_sta : float
        Forwarded to `statistical_analysis` as `alpha`.
    path_output : str
        Folder where 'report.html' is written; also given to the logger and
        to `statistical_analysis`.
    """

    # Report labels for the outlier-detection methods known to this class.
    _OUTLIER_METHOD_LABELS = {
        'adjbox': 'Adjusted Boxplot for skewed distribution',
        'lof': 'Local Outlier Factor (LOF)',
        'isolation_forest': 'Isolation Forest',
    }

    def __init__(self,
                 data,
                 y_name,
                 features_type,
                 outliers_method,
                 alpha_sta,
                 path_output
                ):

        self.data = data.copy()
        self.y_name = y_name
        # dict.copy() alone is shallow: the inner lists would still be shared
        # with the caller, so copy each of them as well.
        self.features_type = {kind: list(names)
                              for kind, names in features_type.items()}
        self.outliers_method = outliers_method
        self.alpha = alpha_sta
        self.path_output = path_output

        # Filled in by run().
        self.X = None
        self.y = None

        # Only the start of the document is emitted here; <head> is not closed
        # and <body> is not opened in this class.
        # NOTE(review): presumably one of the pipeline steps adds those tags - confirm.
        self.html = """<html><head>"""
        self.html += """<link rel = "stylesheet" href = "style.css"/>"""
        self.path_html = os.path.join(self.path_output, 'report.html')

    def _outliers_method_label(self):
        """Return the report label for `self.outliers_method`.

        Falls back to the raw method name for methods without a dedicated
        label, so the report can still be produced.
        """
        return self._OUTLIER_METHOD_LABELS.get(self.outliers_method,
                                               str(self.outliers_method))

    def renderize_html(self):
        """Append the closing tags to `self.html` and write it to `self.path_html`.

        The closing tags are appended to the stored markup itself, so calling
        this method more than once appends them more than once.
        """
        self.html += "<br></body></html>"

        with open(self.path_html, 'w') as out:
            out.write(self.html)

    def run(self):
        """Execute the whole pipeline and write the HTML report.

        Returns
        -------
        tuple
            ``(X, y, features_type)`` as returned by the preprocessing step,
            with the rows flagged as outliers removed from ``X`` and ``y``.
        """
        logger = log(self.path_output, 'logs.txt')

        # Preprocess data
        preprocess = preprocess_data(data           = self.data,
                                     y_name         = self.y_name,
                                     features_type  = self.features_type,
                                     html           = self.html,
                                     logger         = logger)

        self.X, self.y, self.features_type, self.html = preprocess.run()

        # =================
        # Outlier detection
        detect_out = detect_outliers(X             = self.X,
                                     features_type = self.features_type,
                                     method        = self.outliers_method,
                                     logger        = logger)

        # `outliers` is used as a mask: negated to keep the remaining rows and
        # summed to count the flagged ones.
        outliers = detect_out.run()
        self.X = self.X[~outliers]
        self.y = self.y[~outliers]

        # HTML report about outliers
        self.html += "<h2><center>Outlier detection:</center></h2>"
        self.html += (self._outliers_method_label()
                      + " method used<br>Total outliers found: "
                      + str(outliers.sum()))

        # =================
        # Statistical analysis
        self.html += "<h2><center>Statistical Analysis:</center></h2>"

        sta = statistical_analysis(
                           X      = self.X,
                           y      = self.y,
                           y_name = self.y_name,
                           features_type = self.features_type,
                           alpha  = self.alpha,
                           html   = self.html,
                           path_output = self.path_output,
                           logger = logger
                          )

        # The analysis step returns the report markup it was given, extended.
        self.html = sta.run()
        # =================

        # Make the HTML file
        self.renderize_html()

        return self.X, self.y, self.features_type

In [15]:
# Build the pipeline on the raw frame: 'Survived' is the target, outliers are
# detected with the 'lof' method and 0.05 is passed as alpha_sta.
octo = octopus(
    data=raw_df,
    y_name='Survived',
    features_type=features_type,
    outliers_method='lof',
    alpha_sta=0.05,
    path_output=path_output,
)

In [16]:
# Run the full pipeline; it returns the feature data, the target and the
# feature-type mapping it ended up with.
(X, y, ft) = octo.run()

2021-05-15 23:23:36,141 INFO: Started to check the features consistency
2021-05-15 23:23:36,183 INFO: Features: ['PassengerId', 'Name', 'Ticket', 'Cabin'] were removed because its distribution
2021-05-15 23:23:36,185 INFO: Consistency values finished!
2021-05-15 23:23:36,189 INFO: Feature Age was imputer with the method median value = 28.0
2021-05-15 23:23:36,192 INFO: Feature Embarked was imputer with "other"
2021-05-15 23:23:36,193 INFO: None feature were removed because the missing values
2021-05-15 23:23:36,194 INFO: Handle missing values finished!
2021-05-15 23:23:36,197 INFO: Detect outliers started
2021-05-15 23:23:36,200 INFO: Local Outlier Factor method selected
2021-05-15 23:23:36,228 INFO: Detected 146 outliers
2021-05-15 23:23:36,230 INFO: Detect outliers finished


<Figure size 432x288 with 0 Axes>

In [None]:
class modeling:
    
    def __init__(self)