# URLs para obtenção dos datasets (obtidos em 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do Hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do Hospital Sírio-Libanês: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 - (2020-06-30)
- ### Dataset do hospital Beneficência Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 - (2021-04-28)
- ### Dataset do Grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do Hospital das Clínicas da Faculdade de Medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [1]:
import os
import sys
import json
import pandas as pd

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

# Extend the import search path with the "libs" directory that sits next to the
# current working directory, so that the project-local modules imported below
# (dataframe, report, preprocess, ...) can be resolved - presumably they live there.
# NOTE(review): this depends on the directory the notebook is launched from.
current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best_features
from cross_validation import get_folds
from config import load_config

# 2. Reading Configs and Defining Globals

In [2]:
# Load the run configuration from the relative path ../config.yaml.
# Later cells read DATASETS, N_FOLDS, N_FEATURES, MODELS and REPORT_PATH from it.
config = load_config("../config.yaml")

In [3]:
# Report accumulators filled by the following cells and dumped to JSON at the end:
# selected features, per-model results, and dataset summaries (three distinct dicts).
report_feature_selection, report_models, report_dataset_info = {}, {}, {}

# 3. Reading Normalized Datasets

In [4]:
# Read every dataset named in config.DATASETS (same order) and drop its
# "ID_PACIENTE" column before keeping it.
df_list = [
    read_dataset(dataset_name).drop("ID_PACIENTE", axis=1)
    for dataset_name in config.DATASETS
]

In [5]:
# Store a dataset_info_report summary of each unprocessed frame under the
# "raw dataset" section of the info report, keyed by dataset name.
for raw_df, dataset_name in zip(df_list, config.DATASETS):
    # The section is created lazily, on the first dataset only.
    raw_section = report_dataset_info.setdefault("raw dataset", {})
    raw_section[dataset_name] = dataset_info_report(raw_df)

# 4. Preprocessing Data

In [9]:
# For each dataset: apply remove_sparse_columns and then remove_sparse_rows,
# record a summary of the result under "preprocessed_dataset", and keep the
# one_hot_encoded version of it for the modelling step.
report_dataset_info["preprocessed_dataset"] = {}
preprocessed_df_list = []
for raw_df, dataset_name in zip(df_list, config.DATASETS):
    cleaned_df = remove_sparse_rows(remove_sparse_columns(raw_df))
    # The summary describes the frame *before* one-hot encoding.
    report_dataset_info["preprocessed_dataset"][dataset_name] = dataset_info_report(cleaned_df)
    preprocessed_df_list.append(one_hot_encoded(cleaned_df))

# 5. Applying Feature Selection and Machine Learning Models With Cross Validation

In [10]:
# Candidate classifiers, keyed by the display name that config.MODELS selects
# and that is used as the key in the model report.
MODELS = {
    # Seeded like the random forest: the tree permutes features at every split,
    # so an unseeded tree can give different results from one run to the next.
    "Decision Tree": tree.DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True, class_weight="balanced"),
}

In [11]:

# Run feature selection and every configured classifier on each cross-validation
# fold of each preprocessed dataset, filling report_feature_selection and
# report_models.

# Picking the classifiers enabled in the configuration does not depend on the
# dataset or fold, so it is done once; an unknown model name fails here,
# before any training starts.
models = {key: MODELS[key] for key in config.MODELS}

report_feature_selection['select-k-best'] = {}

for df, dataset in zip(preprocessed_df_list, config.DATASETS):
    report_feature_selection['select-k-best'][dataset] = {}
    report_models[dataset] = {}

    for index, (train_df, test_df) in enumerate(get_folds(df, config.N_FOLDS)):
        fold_key = f"fold {index}"

        # Both splits of the fold go through under_sampling before use.
        train_df = under_sampling(train_df, False)
        test_df = under_sampling(test_df, False)

        X_train = train_df.drop(["LABEL"], axis=1)
        y_train = train_df["LABEL"].values.ravel()
        X_test = test_df.drop(["LABEL"], axis=1)
        y_test = test_df["LABEL"]

        # Feature selection is computed on the training split only.
        X_k_best = get_k_best_features(X_train, y_train, config.N_FEATURES)
        report_feature_selection['select-k-best'][dataset][fold_key] = feature_selection_report(
            X_k_best.columns.to_list()
        )

        # NOTE(review): the selected features are only reported; the classifiers
        # below are still fitted on every column of X_train. Confirm whether they
        # should be restricted to X_k_best.columns.
        for model_name, estimator in models.items():
            # The same estimator object is refitted for every fold and dataset.
            estimator.fit(X_train, y_train)
            y_predicted = estimator.predict(X_test)
            model_results = report_models[dataset].setdefault(model_name, {})
            model_results[fold_key] = model_report(y_test, y_predicted)


# 7. Save Report

In [13]:
# Write the three report dictionaries as JSON files inside config.REPORT_PATH.
path = config.REPORT_PATH
# exist_ok replaces the separate os.path.exists check and its check-then-create race.
os.makedirs(path, exist_ok=True)

# os.path.join keeps the files inside the report directory even when
# REPORT_PATH is configured without a trailing separator.
for file_name, report in (
    ("models.json", report_models),
    ("features.json", report_feature_selection),
    ("info.json", report_dataset_info),
):
    with open(os.path.join(path, file_name), 'w') as f:
        json.dump(report, f)