TITANIC DATASET - ML - NAIVE BAYES
==================================

In [6]:
%load_ext autoreload
%autoreload 2

import mlflow
import pickle
import pandas as pd 
import numpy as np 
from datetime import datetime
from functions import get_metrics

from sklearn.naive_bayes import GaussianNB

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configs

In [7]:
# Name of the MLflow experiment that the runs of this notebook are filed under.
EXPERIMENT_TITLE = "Titanic Dataset Analyzes"

# Markdown note attached to the run through the "mlflow.note.content" tag.
run_description = """
### Descrição

Implementação usando [Naive Bayes](https://scikit-learn.org/stable/modules/naive_bayes.html)
"""

# Tags attached to the run. The two path entries are also read by later cells
# to locate the source table and the pickled train/test split.
tags = {
    "date": datetime.now(),
    "author": "Pablo Veinberg",
    "version": 1.0,
    # Key was previously misspelled as "envoronment", which made the tag
    # impossible to find when filtering runs by "environment".
    "environment": "local",
    "mlflow.note.content": run_description,
    "mlflow.runName": "Naive Bayes",
    "data_source": "./../datasets/silver/train-encoded-not-normalize.parquet",
    "train_test_dataset": "./../datasets/silver/titanic-train-test-data.pkl"
}

# Parameters logged with the run. "token" is a random integer drawn once per
# execution of this cell; a later cell uses it to build the confusion-matrix
# image file name.
params = {
    "token": np.random.randint(10_000, high=99_000)
}


In [8]:
# Restore the pickled train/test split referenced in the run tags.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on files produced by this project.
with open(tags['train_test_dataset'], 'rb') as split_file:
    split_parts = pickle.load(split_file)

X_train, X_test, y_train, y_test = split_parts

# Display the shape of each part as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 8), (179, 8), (712,), (179,))

## Process

In [9]:
# Make the notebook's experiment the active one for subsequent runs.
mlflow.set_experiment(EXPERIMENT_TITLE)

# Read the source table and wrap it as an MLflow dataset so that it can be
# logged as the run's input later on.
source_path = tags['data_source']
dataset = pd.read_parquet(source_path)
mlflow_dataset = mlflow.data.from_pandas(
    dataset,
    source=source_path,
    name="Titanic Dataset",
)

  return _dataset_source_registry.resolve(


In [10]:

# Train and evaluate a Gaussian Naive Bayes model inside a single MLflow run.
# The `with` block ends the run when it exits (including on an exception), so
# no explicit mlflow.end_run() call is needed after it.
with mlflow.start_run():

    # Record the run's context: input dataset, tags and parameters.
    mlflow.log_input(mlflow_dataset)
    mlflow.set_tags(tags)
    mlflow.log_params(params)

    # Fit on the training split and predict on the held-out split.
    model = GaussianNB()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # The per-execution token from `params` is part of the image file name.
    plt_matrix_confusion_image_path = f"./../results/naive_bayes_m_confusion_{params['token']}.png"

    # get_metrics is a project helper; presumably it writes the confusion
    # matrix image to the given path and returns a dict of metric values --
    # TODO confirm against functions.py.
    metrics = get_metrics(
        y_test,
        y_pred,
        plt_matrix_confusion_image_path,
        "Titanic - Naive Bayes Confusion Matrix",
    )

    mlflow.log_metrics(metrics)
    mlflow.log_artifact(plt_matrix_confusion_image_path)


  return _infer_schema(self._df)
