### URLs para obtenção dos datasets (obtidos em 07/06/2021)
- Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- Dataset do hospital Sírio-Libanês: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 - (2020-06-30)
- Dataset do hospital Beneficência Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 - (2021-04-28)
- Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- Dataset do Hospital das Clínicas da Faculdade de Medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

In [11]:
# Import dependencies
import os
import sys
import json
import pandas as pd
from tqdm.notebook import tqdm
from pandas_profiling import ProfileReport

# Append "<working directory>/../libs" to the module search path so that the
# project-local imports below can be resolved (presumably that is where
# dataframe, report, preprocess, etc. live — verify against the repo layout).
# The path is derived from the current working directory, not from this
# notebook's location.
current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from dataframe import read_dataset, split_train_test
from report import dataset_info_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best, get_extra_trees_classifier, get_rfe
from cross_validation import get_folds
from config import load_config

### 1. Configs:

In [12]:
# Read config and define globals
config = load_config("../config.yaml")

# Report accumulators: filled in by the cells below and dumped to JSON in the
# "Save" step at the end of the notebook.
report_feature_selection = {}
report_dataset_info = {}

### 2. Dataset:

In [13]:
# Read normalized datasets
# Load every dataset named in config.DATASETS, drop the "ID_PACIENTE" column,
# and record the shape of each raw frame under report_dataset_info["raw dataset"].
df_list = []

# Create the report section once (kept if the cell is re-run) instead of
# testing for the key on every iteration.
report_dataset_info.setdefault("raw dataset", {})

for dataset in config.DATASETS:
    df = read_dataset(dataset).drop("ID_PACIENTE", axis=1)
    df_list.append(df)
    report_dataset_info["raw dataset"][dataset] = {
        "shape": {"instances": df.shape[0], "features": df.shape[1]},
    }

### 3. Preprocess:

In [14]:
# Apply preprocess to datasets
# For each loaded dataset: remove sparse columns, then sparse rows, record the
# resulting shape in the report, and finally one-hot encode the frame.
preprocessed_df_list = []
report_dataset_info["preprocessed_dataset"] = {}

for df, dataset in zip(df_list, config.DATASETS):
    preprocessed_df = remove_sparse_columns(df)
    preprocessed_df = remove_sparse_rows(preprocessed_df)

    # The shape is recorded before one-hot encoding, so "features" counts the
    # columns that survived sparsity filtering, not the encoded columns.
    report_dataset_info["preprocessed_dataset"][dataset] = {
        "shape": {
            "instances": preprocessed_df.shape[0],
            "features": preprocessed_df.shape[1],
        },
    }

    one_hot_encoded_df = one_hot_encoded(preprocessed_df)
    preprocessed_df_list.append(one_hot_encoded_df)

### 4. Feature Selection

In [18]:
# Apply feature selection in a cross validation
# For every preprocessed dataset, build config.N_FOLDS folds and run each
# configured feature-selection model on each fold's training split. Results are
# stored as report_feature_selection[fs_model][dataset]["fold <index>"].
for fs_model in config.FS_MODELS:
    report_feature_selection[fs_model] = {}

for df, dataset in tqdm(
    zip(preprocessed_df_list, config.DATASETS),
    total=len(config.DATASETS),
    # The fold count comes from the config, so the label reports it instead of
    # a hard-coded "10".
    desc=f"Analyzing best features for each datasets with {config.N_FOLDS} folds cross validation",
):
    for fs_model in config.FS_MODELS:
        report_feature_selection[fs_model][dataset] = {}

    # Only the dataset named "concatenated-dataset" gets an HTML profiling
    # report; the label and group columns are excluded from it.
    if dataset == "concatenated-dataset":
        prof = ProfileReport(df.drop(["LABEL", "grupo"], axis=1))
        prof.to_file(output_file=f"{config.ANALYSIS_PATH}-features-analysis.html")

    folds = get_folds(df, config.N_FOLDS)
    for index, fold in tqdm(
        enumerate(folds),
        total=len(folds),
        desc="- - Applying feature selection models to each fold",
    ):
        train_df, test_df = fold
        # Only the training split is used below; X_test / y_test are unpacked
        # but not consumed in this cell.
        X_train, y_train, X_test, y_test = split_train_test(train_df, test_df)

        for fs_model in tqdm(
            config.FS_MODELS,
            total=len(config.FS_MODELS),
            desc="- - - - Applying feature selection models",
        ):
            # NOTE(review): the whole `df` (the frame the folds were built from)
            # is passed together with the training split. Confirm get_k_best
            # only uses it for metadata such as column names; otherwise the
            # selection would see held-out rows.
            k_best = get_k_best(df, X_train, y_train.values.ravel(), fs_model, None)
            report_feature_selection[fs_model][dataset][f"fold {index}"] = k_best

Analyzing best features for each datasets with 10 folds cross validation:   0%|          | 0/6 [00:00<?, ?it/s…

- - Applying feature selection models to each fold:   0%|          | 0/10 [00:00<?, ?it/s]

- - - - Applying feature selection models:   0%|          | 0/8 [00:00<?, ?it/s]

- - - - Applying feature selection models:   0%|          | 0/8 [00:00<?, ?it/s]

### 5. Save

In [None]:
# Save results
# Dump both report dictionaries as JSON files inside config.REPORT_PATH,
# creating that directory first when it does not exist yet.
path = config.REPORT_PATH

# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(path, exist_ok=True)

# os.path.join keeps the files inside `path` whether or not REPORT_PATH ends
# with a path separator (plain string concatenation only worked when it did).
with open(os.path.join(path, "best-features.json"), 'w') as f:
    json.dump(report_feature_selection, f)

with open(os.path.join(path, "info-dataset.json"), 'w') as f:
    json.dump(report_dataset_info, f)