# URLs para obtenção dos datasets (Obtido dia 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do hospital Sirio Libanes: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 (2020-06-30)
- ### Dataset do hospital Beneficencia Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do hospital das clinicas da faculdade de medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [37]:
import json
from operator import itemgetter
import os
import sys

current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from plot import plot_confusion_matrix, plot_table

# 2. Defining Constants and Globals

In [38]:
COMPLETE_DATASET_REPORT_PATH = "../reports/complete-dataset/"
EACH_DATASET_REPORT_PATH = "../reports/each-dataset/"
INTERIM_REPORT_PATH = "../reports/interim/"
PLOTS_FEATURE_SELECTION_PATH = "../plots/feature_selection/"
REPORT_PATH = "../reports/"
DATASETS = [
    "albert-einstein",
    "beneficencia-portuguesa",
    "hospital-de-clinicas",
    "sirio-libanes",
    "grupo-fleury"
    ]

FEATURE_SELECTION_MODELS = [
    "select-k-best"
]
N_FEATURES = 15

report_interim = {}

# 3. Auxiliar Functions

# 4. Reading Complete Dataset Feature Selection Report

In [39]:
with open(INTERIM_REPORT_PATH+'feature-selection-results.json', 'r') as fp:
    feature_selection_report = json.load(fp)

In [40]:
models_names = [
    "Decision Tree",
    "Random Forest",
    "KNN" ,
    # "Logistic Regression",
    # "SVM"
]

In [48]:
def get_row(ml_results):
    n_hospitals = len(ml_results)
    accuracy = 0
    precision = 0
    recall = 0
    f1_score = 0
    tp = 0
    tn = 0
    for hospital, metrics in ml_results.items():
        accuracy += float(metrics["accuracy"])
        precision += float(metrics["precision"])
        recall += float(metrics["recall"])
        f1_score += float(metrics["f1-score"])
        tp += float(metrics["confusion matrix"]["tp"])
        tn += float(metrics["confusion matrix"]["tn"])
    
    avg_acc = round(accuracy/n_hospitals,3)
    avg_prec = round(precision/n_hospitals,3)
    avg_rec =round(recall/n_hospitals,3)
    avg_f1 =round(f1_score/n_hospitals,3)

    acc=0
    prec=0
    rec = 0
    f1 = 0

    for hospital, metrics in ml_results.items():
        acc += pow(abs(float(metrics["accuracy"])-avg_acc),2)
        prec += pow(abs(float(metrics["precision"])-avg_prec),2)
        rec += pow(abs(float(metrics["recall"])-avg_rec),2)
        f1 += pow(abs(float(metrics["f1-score"])-avg_f1),2)

    std_var_acc = round(pow((acc/(len(ml_results))),1/2),3)
    std_var_prec = round(pow((prec/(len(ml_results))),1/2),3)
    std_var_rec = round(pow((rec/(len(ml_results))),1/2),3)
    std_var_f1 = round(pow((f1/(len(ml_results))),1/2),3)
    
    tp = round(tp/n_hospitals,3)
    tn = round(tn/n_hospitals,3)
    fp = round(1 - tp/n_hospitals,3)
    fn = round(1 - tn/n_hospitals,3)

    return [
        str(f"{avg_acc} +- {std_var_acc}"),
        str(f"{avg_prec} +- {std_var_prec}"),
        str(f"{avg_rec} +- {std_var_rec}"),
        str(f"{avg_f1} +- {std_var_f1}"),
        tp,
        tn,
        fp,
        fn,
    ]


In [49]:
feature_selection_results = {}
for model,results in feature_selection_report.items():
    data = []
    columns = [
        "accuracy","precision","recall","f1-score",
        "tp","tn","fp","fn"]
    feature_selection_results[model] = {}
    for ml_model, ml_results in results.items(): 
        

        if ml_model in models_names:
            data.append(get_row(ml_results))



    path = f"{PLOTS_FEATURE_SELECTION_PATH}result/"
    if not os.path.exists(path):
        os.makedirs(path)
    print(data)
    plot_table(
        data,
        columns=columns, 
        rows=models_names,
        path=path+"metrics",
        title = (model+" Results"))
            


[['0.86 +- 0.0', '0.71 +- 0.0', '0.884 +- 0.005', '0.79 +- 0.0', 0.884, 0.85, 0.823, 0.83], ['0.87 +- 0.0', '0.73 +- 0.0', '0.9 +- 0.0', '0.806 +- 0.005', 0.9, 0.86, 0.82, 0.828], ['0.772 +- 0.007', '0.59 +- 0.014', '0.782 +- 0.007', '0.67 +- 0.006', 0.782, 0.77, 0.844, 0.846]]
