# URLs para obtenção dos datasets (obtidos em 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do Hospital Sírio-Libanês: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 - (2020-06-30)
- ### Dataset do hospital Beneficência Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 - (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do Hospital das Clínicas da Faculdade de Medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [1]:
import json
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

# Extend the module search path with "<working directory>/../libs" before the
# project-local imports that follow (plot, dataframe, report, ...).
# NOTE(review): presumably that directory holds those helper modules, and the
# notebook is expected to be started from its own folder — confirm the layout.
current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from plot import plot_confusion_matrix, plot_table
from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best_features
from cross_validation import get_folds
from config import load_config

# 2. Defining Constants and Globals

In [2]:
# Load the experiment settings. The cells below read DATASETS, MODELS,
# REPORT_PATH and N_FEATURES from this object.
# NOTE(review): the path is relative — presumably resolved against the current
# working directory; confirm in load_config.
config = load_config("../config.yaml")

In [3]:
# Results accumulator: filled per feature-selection method by the evaluation
# cell and dumped to JSON at the end of the notebook.
fs_report = dict()

# 3. Auxiliary Functions

In [4]:
# def fs_results_filter(fs_results):
#     total = K_FOLDS*len(DATASETS)
#     fs_filtered_results = []
#     for feature,times in fs_results.items():
#         if times > total * FS_THRESHOLD:
#             fs_filtered_results.append[feature]
#         else:
#             return fs_filtered_results

def fs_results_filter(fs_results, n_features=None):
    """Return the first ``n_features`` features plus any ties with the last one.

    Features are taken in the iteration order of ``fs_results``. Once
    ``n_features`` of them have been collected, the features that immediately
    follow are kept as well for as long as their count equals the count of the
    last feature collected; the scan stops at the first different count.

    NOTE(review): this only yields a "top N" selection if ``fs_results`` is
    already ordered by descending count — confirm against the code that
    writes the features report.

    Args:
        fs_results: Mapping of feature name to the number of times the
            feature was selected.
        n_features: How many features to keep before ties are considered.
            Defaults to ``config.N_FEATURES`` when omitted.

    Returns:
        A list of feature names. The list is always returned, including when
        the mapping is exhausted before a non-tied feature is found, and it
        is empty when ``n_features`` is zero or negative.
    """
    if n_features is None:
        n_features = config.N_FEATURES

    selected = []
    # Count of the n-th feature; None until the quota is reached so that it
    # can never be confused with a genuine count of 0.
    cutoff = None
    for feature, times in fs_results.items():
        if len(selected) < n_features:
            selected.append(feature)
            if len(selected) == n_features:
                cutoff = times
        elif times == cutoff:
            # Tie with the last feature that made the cut: keep it too.
            selected.append(feature)
        else:
            break
    return selected


# 4. Reading Normalized Datasets

In [5]:
# Load every configured dataset, discard the patient identifier and tag each
# row with the name of the dataset it came from, then stack everything into a
# single frame.
complete_data = []
for dataset in config.DATASETS:
    df = read_dataset(dataset).drop(columns="ID_PACIENTE").assign(grupo=dataset)
    complete_data.append(df)
raw_df = pd.concat(complete_data)


# 5. Reading Features Results

In [6]:
# Read the feature-selection results from the JSON report in the report folder.
with open(config.REPORT_PATH + 'features-result.json') as fp:
    features_report = json.loads(fp.read())

# 6. Preprocess Data

In [7]:
# Drop sparse columns first, then sparse rows.
preprocessed_df = remove_sparse_rows(remove_sparse_columns(raw_df))

# One-hot encode everything except the dataset tag, then attach the tag again
# so the rows can still be grouped by their source dataset.
data_df = preprocessed_df.drop(columns=["grupo"])
one_hot_encoded_df = one_hot_encoded(data_df)
one_hot_encoded_df["grupo"] = preprocessed_df["grupo"]
df = one_hot_encoded_df


# 7. Apply models

In [8]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which the evaluation cell iterates over them.
models = {}
models["Decision Tree"] = tree.DecisionTreeClassifier()
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["KNN"] = KNeighborsClassifier()
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["SVM"] = SVC(probability=True, class_weight='balanced')

In [9]:
# Leave-one-dataset-out evaluation, repeated for every feature-selection method
# in the features report: each dataset in turn is held out for testing while
# the models are trained on all the other datasets, using only the features
# chosen by that method.

# The split by source dataset does not depend on the feature-selection method,
# so it is computed once.
grouped = df.groupby(df.grupo)
dfs = [grouped.get_group(dataset) for dataset in config.DATASETS]
if len(dfs) < 2:
    raise ValueError(
        "Leave-one-dataset-out evaluation needs at least two datasets in config.DATASETS"
    )
# NOTE(review): the recorded output of this cell lists a "concatenated-dataset"
# group. If that group contains rows that also belong to the other groups,
# holding one group out does not fully separate train and test — confirm.

for fs_model, fs_results in features_report.items():
    fs_model_report = fs_report.setdefault(fs_model, {})

    # All reported features are used; fs_results_filter could be applied here
    # instead to keep only the top config.N_FEATURES.
    fs_filtered_results = list(fs_results["features-result"].keys())
    fs_filtered_results.append("LABEL")
    fs_model_report["N Features"] = len(fs_filtered_results)
    fs_model_report["Features"] = fs_filtered_results

    # Predictor columns actually fed to the models: the selected features that
    # exist in the preprocessed frame, in frame order, without the target.
    selected = set(fs_filtered_results)
    feature_columns = [x for x in df.columns if x in selected and x != "LABEL"]
    if not feature_columns:
        raise ValueError(
            f"None of the features selected by {fs_model!r} are columns of the preprocessed data"
        )

    for fold_index, (fold_df, dataset) in enumerate(zip(dfs, config.DATASETS)):
        # Train on every dataset except the one held out for this fold.
        # Excluding by position avoids comparing whole frames.
        train = [df_train for i, df_train in enumerate(dfs) if i != fold_index]

        # Sampled once per fold so that every model is fitted and scored on
        # exactly the same rows.
        train_df = under_sampling(pd.concat(train), False)
        test_df = under_sampling(fold_df, False)

        X_train = train_df[feature_columns]
        y_train = train_df["LABEL"]
        X_test = test_df[feature_columns]
        y_test = test_df["LABEL"]

        for model_name, model in models.items():
            if model_name not in config.MODELS:
                continue
            print(model_name)
            print("testing:", dataset)

            model.fit(X_train, y_train.values.ravel())
            y_predicted = model.predict(X_test)

            model_results = fs_model_report.setdefault(model_name, {})
            model_results[dataset] = model_report(y_test, y_predicted)


Decision Tree
testing: concatenated-dataset
Random Forest
testing: concatenated-dataset
KNN
testing: concatenated-dataset
Decision Tree
testing: albert-einstein
Random Forest
testing: albert-einstein
KNN
testing: albert-einstein
Decision Tree
testing: beneficencia-portuguesa
Random Forest
testing: beneficencia-portuguesa
KNN
testing: beneficencia-portuguesa
Decision Tree
testing: hospital-de-clinicas
Random Forest
testing: hospital-de-clinicas
KNN
testing: hospital-de-clinicas
Decision Tree
testing: sirio-libanes
Random Forest
testing: sirio-libanes
KNN
testing: sirio-libanes
Decision Tree
testing: grupo-fleury
Random Forest
testing: grupo-fleury
KNN
testing: grupo-fleury


# 8. Save Feature Selection Report

In [11]:
# Persist the accumulated results as JSON inside the report folder.
path = str(config.REPORT_PATH)

with open(path + "final-results.json", 'w') as f:
    f.write(json.dumps(fs_report))