# URLs para obtenção dos datasets (obtidos em 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do hospital Sirio Libanes: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 (2020-06-30)
- ### Dataset do hospital Beneficencia Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do hospital das clinicas da faculdade de medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [13]:
import json
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from plot import plot_confusion_matrix, plot_table
from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best_features
from cross_validation import get_folds

# 2. Defining Constants and Globals

In [14]:
# --- Filesystem layout (all paths are relative to the notebook directory) ---
DATASETS_PATH = "../dataset/"
PLOTS_PATH = "../plots/"
REPORT_PATH = "../reports/"
COMPLETE_DATASET_REPORT_PATH = "../reports/complete-dataset/"
EACH_DATASET_REPORT_PATH = "../reports/each-dataset/"
INTERIM_REPORT_PATH = "../reports/interim/"

# --- Dataset identifiers; each one is loaded and used as a "grupo" value ---
DATASETS = [
    "albert-einstein",
    "beneficencia-portuguesa",
    "hospital-de-clinicas",
    "sirio-libanes",
    "grupo-fleury",
]

# --- Experiment parameters ---
# NOTE(review): K_FOLDS and FS_THRESHOLD are only referenced by the
# commented-out variant of fs_results_filter in this notebook.
K_FOLDS = 5
N_FEATURES = 10  # number of top-ranked features kept by fs_results_filter
FS_THRESHOLD = 0.25

# Accumulates the results per feature-selection method; dumped to JSON at the end.
fs_report = {}

# 3. Auxiliary Functions

In [15]:
# def fs_results_filter(fs_results):
#     total = K_FOLDS*len(DATASETS)
#     fs_filtered_results = []
#     for feature,times in fs_results.items():
#         if times > total * FS_THRESHOLD:
#             fs_filtered_results.append[feature]
#         else:
#             return fs_filtered_results

def fs_results_filter(fs_results, n_features=None):
    """Return the leading features of a ranking, keeping ties with the last one.

    The first ``n_features`` keys of ``fs_results`` are kept in iteration
    order. Any immediately following keys whose value equals the value of the
    last kept key are kept as well, so a tie at the cut-off is never split.

    Args:
        fs_results: Mapping of feature name to a score/count. The mapping is
            consumed in iteration order.
            NOTE(review): presumably already sorted from best to worst --
            confirm against the code that writes ``features.json``.
        n_features: How many leading features to keep. Defaults to the
            module-level ``N_FEATURES`` when omitted.

    Returns:
        A list of the selected feature names. The list is always returned,
        including when ``fs_results`` is empty, has fewer than ``n_features``
        entries, or ends in a run of ties (previously these cases fell off
        the end of the loop and returned ``None``).
    """
    if n_features is None:
        n_features = N_FEATURES

    selected = []
    cutoff_value = None  # value of the n-th feature, set once the quota is filled
    for feature, times in fs_results.items():
        if len(selected) < n_features:
            selected.append(feature)
            if len(selected) == n_features:
                cutoff_value = times
        elif times == cutoff_value:
            # Quota already filled, but this feature ties with the last kept one.
            selected.append(feature)
        else:
            break
    return selected


# 4. Reading Normalized Datasets

In [16]:
# Load each dataset listed in DATASETS, drop its ID_PACIENTE column and tag
# every row with the dataset name in a "grupo" column, then stack them all.
complete_data = [
    read_dataset(name).drop("ID_PACIENTE", axis=1).assign(grupo=name)
    for name in DATASETS
]
raw_df = pd.concat(complete_data)


# 5. Reading Interim

In [17]:
# Load the "results" entry of the interim feature-selection JSON report.
features_path = INTERIM_REPORT_PATH+'features.json'
with open(features_path, 'r') as features_file:
    features_report = json.load(features_file)["results"]

# 6. Preprocess Data

In [18]:
# Clean-up pass: columns first, then rows.
# NOTE(review): presumably these helpers drop mostly-empty columns/rows --
# confirm in libs/preprocess.
preprocessed_df = remove_sparse_rows(remove_sparse_columns(raw_df))

# One-hot encode everything except the "grupo" tag, which must stay as-is.
data_df = preprocessed_df.drop(columns=["grupo"])
one_hot_encoded_df = one_hot_encoded(data_df)

# `df` is an alias of `one_hot_encoded_df`, so re-attaching "grupo" below
# adds the column to both names.
df = one_hot_encoded_df
df["grupo"] = preprocessed_df["grupo"]


# 7. Apply models

In [19]:
# Classifiers fitted for every feature-selection method and dataset,
# keyed by the display name used in the report.
models = {
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    # Currently disabled:
    # "Logistic Regression": LogisticRegression(max_iter=1000),
    # "SVM": SVC(probability=True, class_weight='balanced'),
}

In [20]:
# Leave-one-dataset-out evaluation: for each feature-selection method, each
# dataset is held out in turn, every model is trained on the remaining
# datasets and scored on the held-out one.
#
# Fixes over the previous version of this cell:
#   * the held-out dataset is now really excluded from the training data
#     (the old `df_train.equals(df)` check compared against the whole frame
#     and therefore never excluded anything);
#   * the models are scored on the under-sampled held-out dataset instead of
#     on the complete frame;
#   * the selected feature columns are actually used to build X_train/X_test
#     (they were computed but never applied).

# The per-dataset split does not depend on the feature-selection method.
grouped = df.groupby(df.grupo)
dfs = [grouped.get_group(dataset) for dataset in DATASETS]

for fs_model, fs_results in features_report.items():
    method_report = fs_report.setdefault(fs_model, {})

    # All features listed for this method are used; fs_results_filter(fs_results)
    # can be swapped in here to keep only the top-ranked ones.
    fs_filtered_results = list(fs_results.keys())
    fs_filtered_results.append("LABEL")
    # NOTE(review): both report entries below include the "LABEL" target column.
    method_report["N Features"] = len(fs_filtered_results)
    method_report["Features"] = fs_filtered_results

    # Predictor columns: selected features that exist in the encoded frame,
    # in frame order, without the target.
    selected = set(fs_filtered_results) - {"LABEL"}
    feature_columns = [column for column in df.columns if column in selected]

    for fold_df, dataset in zip(dfs, DATASETS):
        # Train on every dataset except the one being held out.
        train = [
            group_df
            for group_df, group_name in zip(dfs, DATASETS)
            if group_name != dataset
        ]

        # Sample once per held-out dataset so every model is fitted and
        # scored on exactly the same rows.
        train_df = under_sampling(pd.concat(train), False)
        test_df = under_sampling(fold_df, False)

        X_train = train_df[feature_columns]
        y_train = train_df["LABEL"]
        X_test = test_df[feature_columns]
        y_test = test_df["LABEL"]

        for model_name, model in models.items():
            model.fit(X_train, y_train.values.ravel())
            y_predicted = model.predict(X_test)

            model_results = method_report.setdefault(model_name, {})
            model_results[dataset] = model_report(y_test, y_predicted)


In [23]:
fs_report

{'select-k-best': {'N Features': 35,
  'Features': ['hemograma eosinofilos_NORMAL',
   'hemograma eosinofilos_BAIXO',
   'd dimero_NORMAL',
   'd dimero_ALTO',
   'tgp_ALTO',
   'hemograma linfocitos_BAIXO',
   'hemograma leucocitos_BAIXO',
   'hemograma leucocitos_ALTO',
   'hemograma eosinofilos_ALTO',
   'desidrogenase latica_ALTO',
   'bilirrubina direta_NORMAL',
   'desidrogenase latica_NORMAL',
   'tgo_ALTO',
   'bilirrubina total_NORMAL',
   'hemograma neutrofilos_NORMAL',
   'hemograma neutrofilos_BAIXO',
   'hemograma neutrofilos_ALTO',
   'hemograma monocitos_ALTO',
   'hemograma linfocitos_ALTO',
   'hemograma hemoglobina_NORMAL',
   'hemograma hemoglobina_BAIXO',
   'hemograma eritrocitos_BAIXO',
   'magnesio_BAIXO',
   'hemograma vcm_BAIXO',
   'hemograma linfocitos_NORMAL',
   'calcio ionizavel_BAIXO',
   'bilirrubina indireta_NORMAL',
   'ureia_NORMAL',
   'creatinina_NORMAL',
   'hemograma rdw_ALTO',
   'potassio_NORMAL',
   'hemograma vcm_NORMAL',
   'hemograma monocit

# 8. Save Feature Selection Report

In [22]:
# Persist the collected results next to the other interim reports.
path = f"{INTERIM_REPORT_PATH}"
output_path = path+'feature-selection-results.json'

with open(output_path, 'w') as report_file:
    json.dump(fs_report, report_file)