# URLs para obtenção dos datasets (obtidos em 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do hospital Sírio-Libanês: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 - (2020-06-30)
- ### Dataset do hospital Beneficência Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 - (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do Hospital das Clínicas da Faculdade de Medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [8]:
import os
import sys
import json
import pandas as pd

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

# Resolve the working directory and register its sibling "libs" folder on the
# import path, so the project-local modules imported right after this
# (dataframe, report, preprocess, feature_selection, cross_validation) resolve.
current_path = os.path.abspath(os.getcwd())
sys.path.append("/".join((current_path, "..", "libs")))

from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best_features
from cross_validation import get_folds

# 2. Defining Constants and Globals

In [9]:
# Relative locations of the input data and of the generated reports.
# NOTE(review): DATASETS_PATH is not referenced in the visible cells.
DATASETS_PATH = "../dataset/"
REPORT_PATH = "../reports/complete-dataset/"

# Names of the datasets that are read one by one and concatenated.
DATASETS = [
    "albert-einstein",
    "beneficencia-portuguesa",
    "hospital-de-clinicas",
    "sirio-libanes",
    "grupo-fleury",
]

# Sparsity settings.
# NOTE(review): neither value is referenced in the visible cells -- presumably
# percentage thresholds meant for the sparse row/column helpers; confirm.
SPARSE_COLUMN = SPARSE_ROW = 75

N_FOLDS = 5  # passed to get_folds
N_FEATURES = 10  # passed to get_k_best_features

# Report accumulators; each one is dumped to its own JSON file in the last cell.
report_feature_selection, report_models, report_dataset_info = {}, {}, {}

# 3. Reading Normalized Datasets

In [10]:
# Load every dataset listed in DATASETS and stack them into a single frame.
complete_data = [read_dataset(dataset_name) for dataset_name in DATASETS]
# NOTE(review): pd.concat keeps each frame's own index labels by default, so
# the combined index may contain duplicates -- confirm the downstream helpers
# do not rely on unique labels.
raw_df = pd.concat(complete_data)

# The patient identifier column is removed before any reporting or modelling.
df = raw_df.drop(columns="ID_PACIENTE")
report_dataset_info["raw_dataset"] = dataset_info_report(df)

# 4. Preprocessing Data

In [11]:
# Remove sparse columns first, then sparse rows; both helpers are called with
# the frame only, so they run with whatever defaults they define.
preprocessed_df = remove_sparse_rows(remove_sparse_columns(df))
# Whole-dataset under-sampling is disabled at this stage; under_sampling is
# called per fold in the cross-validation cell instead.
report_dataset_info["preprocessed_dataset"] = dataset_info_report(preprocessed_df)

# Encoded copy of the cleaned frame; this is the input to cross-validation.
one_hot_encoded_df = one_hot_encoded(preprocessed_df)

# 5. Defining Machine Learning Models

In [12]:
# Candidate classifiers, keyed by the display name used in the model report.
# Both tree-based estimators get a fixed random_state: a decision tree permutes
# the candidate features at every split, so without a seed two runs of the
# notebook can build different trees and report different metrics.
models = {
    "Decision Tree": tree.DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    # Disabled candidates, kept for reference:
    # "Logistic Regression" : LogisticRegression(max_iter=1000),
    # "SVM" : SVC(probability=True, class_weight='balanced')
}

# 6. Applying Feature Selection and Machine Learning Models With Cross Validation

In [6]:
# Create the nested report containers that the loop below fills in per fold.
report_feature_selection["feature selection"] = {"select-k-best": {}}
report_models["models"] = {}
k_best_report = report_feature_selection["feature selection"]["select-k-best"]
models_report = report_models["models"]

folds = get_folds(one_hot_encoded_df, N_FOLDS)
for index, (train_df, test_df) in enumerate(folds):
    fold_key = f"fold {index}"

    # Both partitions of the fold go through under_sampling.
    # NOTE(review): the meaning of the second argument (False) is defined by
    # the helper and not visible here; also confirm that under-sampling the
    # test partition is intended.
    train_df = under_sampling(train_df, False)
    test_df = under_sampling(test_df, False)

    # Split each partition into the feature matrix and the "LABEL" target.
    X_train, y_train = train_df.drop(columns=["LABEL"]), train_df["LABEL"]
    X_test, y_test = test_df.drop(columns=["LABEL"]), test_df["LABEL"]
    train_labels = y_train.values.ravel()

    # Record which columns the k-best selection keeps for this fold.
    # NOTE(review): X_k_best is only used for this report; the estimators
    # below are fitted on every column of X_train -- confirm this is intended.
    X_k_best = get_k_best_features(X_train, train_labels, N_FEATURES)
    k_best_report[fold_key] = feature_selection_report(X_k_best.columns.to_list())

    for model_name, estimator in models.items():
        estimator.fit(X_train, train_labels)
        y_predicted = estimator.predict(X_test)
        # One sub-dictionary per model, holding one report entry per fold.
        per_fold_report = models_report.setdefault(model_name, {})
        per_fold_report[fold_key] = model_report(y_test, y_predicted)

# 7. Save Report

In [13]:
# Persist the three report dictionaries as JSON files under REPORT_PATH.
path = REPORT_PATH
# exist_ok=True creates the directory tree when it is missing and does nothing
# when it already exists, without a separate, race-prone existence check.
os.makedirs(path, exist_ok=True)

# Output file name -> report dictionary written to it.
report_files = {
    'models.json': report_models,
    'feature-selection.json': report_feature_selection,
    'info.json': report_dataset_info,
}
for file_name, report in report_files.items():
    # os.path.join builds the correct path whether or not REPORT_PATH ends
    # with a separator; the encoding is pinned so the output does not depend
    # on the platform default.
    with open(os.path.join(path, file_name), 'w', encoding='utf-8') as f:
        json.dump(report, f)