TITANIC DATASET - ML - KNN
==========================

In [10]:
%load_ext autoreload
%autoreload 2

import mlflow
import pickle
import pandas as pd 
import numpy as np 

from datetime import datetime
from functions import get_metrics

from matplotlib import pyplot as plt 
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Configs

In [11]:
# Name of the MLflow experiment that receives the runs of this notebook.
# NOTE(review): "Analyzes" looks like a typo for "Analysis"; it is kept unchanged
# because mlflow.set_experiment() looks experiments up by name, so renaming it
# would start a new, empty experiment.
EXPERIMENT_TITLE = "Titanic Dataset Analyzes"

# Markdown note attached to the run (shown in the MLflow UI).
run_description = """
### Descrição

Implementação usando [KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
"""

# Tags attached to the run; the two path entries are also the files read below.
tags = {
    "date": datetime.now(),
    "author": "Pablo Veinberg",
    "version": 1.0,
    # NOTE(review): key is misspelled ("environment"); kept as-is so existing
    # runs and tag filters keep matching. Rename deliberately if desired.
    "envoronment": "local",
    "mlflow.note.content": run_description,
    "mlflow.runName": "KNeighborsClassifier",
    "data_source": "./../datasets/silver/train-encoded-not-normalize.parquet",
    "train_test_dataset": "./../datasets/silver/titanic-train-test-data.pkl"
}

params = {
    # Random integer used to make the artifact file names of this run unique.
    "token": np.random.randint(10_000, high=99_000),
    # Hyper-parameter grid handed to GridSearchCV as `param_grid`.
    "model_params": {
        "weights": ["uniform", "distance"],
        # Odd k values only: 1, 3, ..., 29.
        "n_neighbors": range(1, 31, 2),
        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
        # Minkowski power parameter; only meaningful for metric="minkowski".
        "p": [1, 2, 3],
        # Metric names must match scikit-learn's spelling exactly; a misspelled
        # name makes every candidate that uses it fail during the search.
        "metric": ["minkowski", "euclidean", "canberra"]
    }
}

# Use seaborn's "darkgrid" style for the plots produced in this notebook.
sns.set_style("darkgrid")


## Load

In [12]:
# Read the parquet file named in the run tags, then wrap the frame as an MLflow
# dataset so it can be attached to the run with mlflow.log_input().
dataset = pd.read_parquet(tags["data_source"])
mlflow_dataset = mlflow.data.from_pandas(
    dataset,
    source=tags["data_source"],
    name="Titanic Dataset",
)

  return _dataset_source_registry.resolve(


In [13]:
# Load the stored train/test split from the path declared in `tags`, so the file
# that is read is always the one recorded on the MLflow run.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open(tags["train_test_dataset"], "rb") as file:
    X_train, X_test, y_train, y_test = pickle.load(file)

# Features and labels must line up row by row.
assert len(X_train) == len(y_train), "train features/labels differ in length"
assert len(X_test) == len(y_test), "test features/labels differ in length"

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 8), (179, 8), (712,), (179,))

## Rodar experimentos

In [14]:
# NOTE: disabled code. This is a manual sweep over k and p that fits one
# KNeighborsClassifier per combination and scores it on the test split.
# It reads params['min_n_neighbors'], params['max_n_neighbors'], params['min_p'],
# params['max_p'] and params['metric'], none of which exist in the current
# `params` dict, so it cannot be re-enabled as-is. Also note that the inner loop
# visits only the two values min_p and max_p, not the range between them.
# # run all then select best and log

# accuracies = []
# for k in range(params['min_n_neighbors'], params['max_n_neighbors'], 2):
#     for p in (params['min_p'], params['max_p']):
#         model = KNeighborsClassifier(
#             n_neighbors=k, \
#             p=p,
#             metric=params['metric']
#         )

#         model.fit(X_train, y_train)

#         y_pred = model.predict(X_test)
#         accuracies.append(
#             (k, p, accuracy_score(y_test, y_pred)))



### Analisar melhores parâmetros 

In [15]:
# NOTE: disabled code. It picks the (k, p) pair with the highest accuracy from an
# `accuracies` list of (k, p, accuracy) tuples; no active cell builds that list.
# sorted_by_accuracy = sorted(accuracies, key=lambda tup: tup[2], reverse=True)

# params['n_neighbors'] = sorted_by_accuracy[0][0]
# params['p'] = sorted_by_accuracy[0][1]
# tags['accuracies'] = accuracies

In [16]:
# NOTE: disabled code. It plots accuracy per "k, p" combination from an
# `accuracies` list and saves the figure under ./../results/; no active cell
# builds that list.
# plt_best_k_figpath = f"./../results/knn_best_k_{params['token']}.png"
# sns.lineplot(x=[f"k={row[0]}, p={row[1]}" for row in accuracies], y=[row[2] for row in accuracies])
# plt.xticks(rotation=90)
# plt.title("Melhor k para este conjunto de dados.")
# plt.savefig(plt_best_k_figpath);

## Processar melhor resultado

In [17]:
mlflow.set_experiment(EXPERIMENT_TITLE)

# The context manager ends the run when the block exits (also on an exception),
# so no explicit mlflow.end_run() call is needed afterwards.
with mlflow.start_run():

    # Record the run inputs: dataset, descriptive tags and the search space.
    mlflow.log_input(mlflow_dataset)
    mlflow.set_tags(tags)
    mlflow.log_params(params)

    # Exhaustive cross-validated search over the KNN hyper-parameter grid.
    grid = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=params["model_params"],
    )
    grid.fit(X_train, y_train)

    # With the default refit=True, GridSearchCV has already refitted the best
    # configuration on the full training set; reuse it instead of fitting again.
    model = grid.best_estimator_

    # Predict on the held-out test split.
    y_pred = model.predict(X_test)

    plot_confusion_matrix_path = f"./../results/conf_matrix_knn_{params['token']}.png"
    # get_metrics is imported from the project's `functions` module; presumably it
    # saves the confusion-matrix plot to the given path and returns a dict of
    # metric name -> value (TODO confirm against functions.py).
    metrics = get_metrics(
        y_test,
        y_pred,
        plot_confusion_matrix_path,
        "Matrix Confusion - Titanic Dataset",
    )

    mlflow.log_param("best_params", grid.best_params_)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(plot_confusion_matrix_path)

Traceback (most recent call last):
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 302, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 395, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 1303, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/mlflow/store/tracking/file_store.py", line 1296, in _read_helper
    result = read_yaml(root, file_name)
  File "/home/pablo_veinberg/.local/lib/python3.10/site-packages/mlflow/utils/file_utils.py", line 303, in read_yaml
    raise MissingConfigException(f"Yaml file '{file_path}' does not exist.")
mlflow.e