In [1]:
import os
import sys

import pandas as pd

from ydata_profiling import ProfileReport

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
# from pandas.testing import assert_frame_equal

sys.path.append(os.path.abspath(".."))
from src.preprocessing import TitanicPreprocessor

import mlflow

import joblib

os.makedirs("pickle_files", exist_ok=True)


## Overview
### Este notebook contém as seguintes seções:
- Seção 1: EDA
- Seção 2: Feature Engineering
- Seção 3: Modelagem
- Seção 4: Preparação de pipeline de pre-processing para uso na api

### Seção 1 - EDA

Primeiramente, analisa-se o dataset que apresenta Ground Truth para entendermos a característica geral dos dados.

In [2]:
gt_dataset = pd.read_csv('./data/train.csv')
gt_dataset.columns = gt_dataset.columns.str.lower()
gt_dataset

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
profile = ProfileReport(gt_dataset,title='Titanic Dataset', explorative=True)
profile.to_file('./profile.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:00<00:00, 254.21it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Com o auxílio do relatório, podemos já fazer algumas observações:
- 'cabin' tem um número excessivo de missing values, então será descartado
- há significativas correlações de 'survived' com: 'fare', 'pclass' e 'sex'
- várias colunas têm grande desbalanceamento, o que poderia ser tratado com, por exemplo, resampling numa análise mais aprofundada
- 'ticket' não tem qualquer missing value e tem valores repetidos, podendo assim ser utilizado com um proxy para famílias e ter maior sucesso nessa \
função de proxy quando comparado a se extrair o sobrenome da coluna 'name', já que pode haver sobrenomes populares, comuns a pessoas desconhecidas

### Seção 2 - Feature Engineering

Além de 'cabin', eliminamos aqui também 'passengerid' que é só um identificador (poderíamos fazer uma análise mais aprofundada para entender se a coluna relaciona de alguma forma com posição no barco, etc..). Também removemos 'ticket' para fins de simplificação.

In [4]:
gt_dataset.drop(columns=['cabin','passengerid','ticket'],inplace=True)

E agora podemos excluir as linhas que possuem campos nulos sem eliminar praticamente todo o dataset, que seria o que aconteceria se não deletássemos a coluna 'cabin' antes.

In [5]:
gt_dataset.dropna(inplace=True)

Dado o contexto do dado, sabemos que pode ser uma boa ideia tratar a idade por grupos, pois a evacuação priorizava crianças. Assim, faremos um encoding por grupo de idade, com uma resolução de 10 anos.

In [6]:
age_bins = [age_bin for age_bin in range(0, int(gt_dataset['age'].max())+10, 10)]

gt_dataset["age_group"] = pd.cut(gt_dataset["age"], bins=age_bins, right=False).cat.codes
gt_dataset.drop(columns='age',inplace=True)
joblib.dump(gt_dataset.index.to_series().tolist(),'../tests/valid_idx.pkl')
gt_dataset

Unnamed: 0,survived,pclass,name,sex,sibsp,parch,fare,embarked,age_group
0,0,3,"Braund, Mr. Owen Harris",male,1,0,7.2500,S,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,0,0,7.9250,S,2
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,0,0,8.0500,S,3
...,...,...,...,...,...,...,...,...,...
885,0,3,"Rice, Mrs. William (Margaret Norton)",female,0,5,29.1250,Q,3
886,0,2,"Montvila, Rev. Juozas",male,0,0,13.0000,S,2
887,1,1,"Graham, Miss. Margaret Edith",female,0,0,30.0000,S,1
889,1,1,"Behr, Mr. Karl Howell",male,0,0,30.0000,C,2


Finalizando o feature engineering para esta análise simplificada, vamos aplicar mais alguns scalings, encodings e remover colunas que não interessam.

In [7]:
cols_to_scale = ["sibsp", "parch", "fare"]

scaler = MinMaxScaler()
gt_dataset[cols_to_scale] = scaler.fit_transform(gt_dataset[cols_to_scale])

gt_dataset = pd.get_dummies(gt_dataset, columns=["sex", "embarked"], drop_first=True)
gt_dataset.drop(columns='name',inplace=True)
gt_dataset

Unnamed: 0,survived,pclass,sibsp,parch,fare,age_group,sex_male,embarked_Q,embarked_S
0,0,3,0.2,0.000000,0.014151,2,True,False,True
1,1,1,0.2,0.000000,0.139136,3,False,False,False
2,1,3,0.0,0.000000,0.015469,2,False,False,True
3,1,1,0.2,0.000000,0.103644,3,False,False,True
4,0,3,0.0,0.000000,0.015713,3,True,False,True
...,...,...,...,...,...,...,...,...,...
885,0,3,0.0,0.833333,0.056848,3,False,True,False
886,0,2,0.0,0.000000,0.025374,2,True,False,True
887,1,1,0.0,0.000000,0.058556,1,False,False,True
889,1,1,0.0,0.000000,0.058556,2,True,False,False


### Seção 3 - Modelagem
Como se trata de um dataset Kaggle de competição, o test dataset não possui Ground Truth e não tem muita serventia para nós. Dessa forma, para tentar mitigar overfitting, vamos prosseguir com uma análise K-Fold no gt_dataset, para que os modelos sejam avaliados para diversos splits.

In [8]:
X = gt_dataset.drop(columns=["survived"])
y = gt_dataset["survived"]

In [9]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel="rbf", probability=True),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Bagging": BaggingClassifier(random_state=42),
}

In [10]:
# K-Fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

Agora instanciaremos o MLFlow para logar os modelos com suas métricas e metadados, e com uma automação simples salvaremos o que tiver melhor pontuação.

In [11]:
mlflow.set_experiment("titanic-models")

2025/06/20 23:12:01 INFO mlflow.tracking.fluent: Experiment with name 'titanic-models' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Pericles/Projects/insider/notebooks/mlruns/488789429135153128', creation_time=1750471921255, experiment_id='488789429135153128', last_update_time=1750471921255, lifecycle_stage='active', name='titanic-models', tags={}>

In [12]:
best_score = 0
best_model = None
best_model_name = ""


for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Avaliação por f1-score e K-Fold
        scores = cross_val_score(model, X, y, cv=cv, scoring="f1")
        mean_f1 = scores.mean()
        std_f1 = scores.std()

        # Treinamento
        model.fit(X, y)

        # Logging no MLFlow
        mlflow.log_params(model.get_params())

        mlflow.log_metric("f1_mean", mean_f1)
        mlflow.log_metric("f1_std", std_f1)

        mlflow.sklearn.log_model(
            sk_model=model,
            name=name,
            registered_model_name=f"{name.replace(' ', '_')}_Titanic"
        )

        print(f"{name} → f1-score médio: {mean_f1:.4f} (± {std_f1:.4f})")
        print("")

        if mean_f1 > best_score:
            best_score = mean_f1
            best_model = model
            best_model_name = name        

if best_model:
    filename = f"pickle_files/selected_model.pkl"
    joblib.dump(best_model, filename)
    print(f"\nMelhor modelo salvo em: {filename} (f1-score = {best_score:.4f})")            

Successfully registered model 'Random_Forest_Titanic'.
Created version '1' of model 'Random_Forest_Titanic'.


Random Forest → f1-score médio: 0.7411 (± 0.0507)



Successfully registered model 'Decision_Tree_Titanic'.
Created version '1' of model 'Decision_Tree_Titanic'.


Decision Tree → f1-score médio: 0.7455 (± 0.0547)



Successfully registered model 'SVM_Titanic'.
Created version '1' of model 'SVM_Titanic'.


SVM → f1-score médio: 0.7204 (± 0.0500)



Successfully registered model 'Gradient_Boosting_Titanic'.
Created version '1' of model 'Gradient_Boosting_Titanic'.


Gradient Boosting → f1-score médio: 0.7621 (± 0.0682)



Successfully registered model 'KNN_Titanic'.
Created version '1' of model 'KNN_Titanic'.


KNN → f1-score médio: 0.7459 (± 0.0696)



Successfully registered model 'Naive_Bayes_Titanic'.
Created version '1' of model 'Naive_Bayes_Titanic'.


Naive Bayes → f1-score médio: 0.7207 (± 0.0455)





Bagging → f1-score médio: 0.7468 (± 0.0711)


Melhor modelo salvo em: pickle_files/selected_model.pkl (f1-score = 0.7621)


Successfully registered model 'Bagging_Titanic'.
Created version '1' of model 'Bagging_Titanic'.


### Seção 4: Pipeline de pré-processamento

Aqui com o intuito de padronizar o pré-processamento, exportamos a pipeline de pré-processamento pra o pickle file `preprocessor.pkl` que replica todos os passos feitos neste notebook, a ser utilizada pela API.

In [13]:
pipeline_preproc = TitanicPreprocessor(scaler,age_bins)

In [14]:
gt_dataset_pipeline = pd.read_csv('./data/train.csv')
gt_dataset_pipeline.columns = gt_dataset_pipeline.columns.str.lower()
gt_dataset_pipeline.drop(columns=['cabin','passengerid'],inplace=True)
gt_dataset_pipeline.dropna(inplace=True)
gt_dataset_pipeline = pipeline_preproc.transform(gt_dataset_pipeline)
gt_dataset_pipeline

Unnamed: 0,pclass,sibsp,parch,fare,age_group,sex_male,embarked_Q,embarked_S
0,3,0.2,0.000000,0.014151,2,True,False,True
1,1,0.2,0.000000,0.139136,3,False,False,False
2,3,0.0,0.000000,0.015469,2,False,False,True
3,1,0.2,0.000000,0.103644,3,False,False,True
4,3,0.0,0.000000,0.015713,3,True,False,True
...,...,...,...,...,...,...,...,...
885,3,0.0,0.833333,0.056848,3,False,True,False
886,2,0.0,0.000000,0.025374,2,True,False,True
887,1,0.0,0.000000,0.058556,1,False,False,True
889,1,0.0,0.000000,0.058556,2,True,False,False


Confirmando que o dataset pré-processado utilizando pipeline é igual ao dataset pré-processado sem usar pipeline.

In [15]:
if gt_dataset_pipeline.equals(gt_dataset.drop(columns='survived')):
    print("## Pipeline de pré-processamento validada! ##")
    joblib.dump(pipeline_preproc, "pickle_files/preprocessor.pkl")

## Pipeline de pré-processamento validada! ##


Salvando predições para testar se o output do API (ambiente de produção) está exatamente igual ao do notebook (ambiente de desenvovimento)

In [16]:
gt_dataset['predictions'] = best_model.predict(gt_dataset_pipeline)

In [17]:
gt_dataset['predictions'].to_pickle('../tests/valid_idx_predictions.pkl')