# URLs para obtenção dos datasets (obtidos em 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do hospital Sírio-Libanês: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 - (2020-06-30)
- ### Dataset do hospital Beneficência Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 - (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do Hospital das Clínicas da Faculdade de Medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [11]:
import json
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

from pandas_profiling import ProfileReport

current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from plot import plot_confusion_matrix, plot_table
from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best, fs_results_filter
from cross_validation import get_folds
from config import load_config

# 2. Defining Constants and Globals

In [12]:
# Run configuration shared by every cell below. The code in this notebook reads
# DATASETS, MODELS, N_FEATURES_BEST_THRESHOLD, ANALYSIS_PATH and REPORT_PATH
# from it. The path is relative (presumably to the notebook's working
# directory -- confirm against load_config).
config = load_config("../config.yaml")

In [13]:
# Accumulator for the final results, keyed by feature-selection method name.
# Each entry holds "N Features", "Features" and, per model name and dataset
# name, the value returned by model_report. It is dumped to
# final-results.json at the end of the notebook.
fs_report: dict = {}

# 3. Auxiliary Functions

In [14]:
def get_features_analysis(df, fs_model_name, columns):
    """Write a pandas-profiling HTML report for the selected feature columns.

    Only the rows whose ``grupo`` value is ``"concatenated-dataset"`` are
    profiled. The ``LABEL`` and ``grupo`` columns are dropped right before
    profiling, so both must be listed in ``columns``.

    Args:
        df: DataFrame containing a ``grupo`` column and every label in
            ``columns``.
        fs_model_name: Name of the feature-selection method; used as the
            prefix of the generated file name.
        columns: Column labels to keep (must include ``"LABEL"`` and
            ``"grupo"``).

    Raises:
        KeyError: If ``"LABEL"`` or ``"grupo"`` is missing from ``columns``.

    The report is written to
    ``<config.ANALYSIS_PATH>/<fs_model_name>-features-analysis.html``.
    """
    path = f"{config.ANALYSIS_PATH}"
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail if
    # the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)

    is_concatenated = df["grupo"] == "concatenated-dataset"
    concatenated_dataset_df = df.loc[is_concatenated, columns].reset_index(drop=True)

    prof = ProfileReport(concatenated_dataset_df.drop(["LABEL", "grupo"], axis=1))
    # os.path.join keeps the file inside the directory created above even when
    # ANALYSIS_PATH has no trailing separator; with a trailing separator the
    # resulting path is the same as plain concatenation.
    output_file = os.path.join(path, f"{fs_model_name}-features-analysis.html")
    prof.to_file(output_file=output_file)

# 4. Reading Normalized Datasets

In [15]:
# Load every configured dataset without the patient identifier and tag each
# row with the name of the dataset it came from.
complete_data = [
    read_dataset(dataset_name)
    .drop(columns="ID_PACIENTE")
    .assign(grupo=dataset_name)
    for dataset_name in config.DATASETS
]

# NOTE(review): pd.concat keeps each frame's own index labels here. If
# read_dataset returns frames with overlapping indexes, raw_df ends up with
# duplicate labels -- confirm that the later steps do not need unique labels.
raw_df = pd.concat(complete_data)


# 5. Reading Features Results

In [16]:
# Load the feature-selection results. The mapping is keyed by feature-selection
# method name; each value is read below through its "features-result" entry.
# The encoding is explicit because JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open(config.REPORT_PATH+'features-result.json', 'r', encoding='utf-8') as fp:
    features_report = json.load(fp)

# 6. Preprocess Data

In [17]:
# Apply remove_sparse_columns and then remove_sparse_rows to the raw data.
preprocessed_df = remove_sparse_rows(remove_sparse_columns(raw_df))

# The dataset tag is left out of the encoding step and re-attached afterwards.
data_df = preprocessed_df.drop(columns=["grupo"])
one_hot_encoded_df = one_hot_encoded(data_df)

# df is an alias of one_hot_encoded_df (not a copy), so the "grupo" column
# added here is visible through both names.
df = one_hot_encoded_df
df["grupo"] = preprocessed_df["grupo"]


# 7. Apply models

In [18]:
# Candidate classifiers, keyed by the display name that config.MODELS selects
# and that is used as a key in the final report.
# Every estimator that draws random numbers gets the same fixed seed so that
# repeated runs produce the same models: the decision tree permutes features
# at each split, and SVC shuffles the data for its probability estimates.
models = {
    "Decision Tree" : tree.DecisionTreeClassifier(random_state=42),
    "Random Forest" : RandomForestClassifier(random_state=42),
    "KNN" : KNeighborsClassifier(),
    "Logistic Regression" : LogisticRegression(max_iter=1000),
    "SVM" : SVC(probability=True, class_weight='balanced', random_state=42)
}

In [19]:
# Leave-one-dataset-out evaluation, repeated for each feature-selection method:
# every dataset in config.DATASETS is used once as the test fold while the
# remaining datasets form the training data.

# The per-dataset split depends only on df, so it is computed once instead of
# once per feature-selection method.
grouped = df.groupby(df.grupo)
dfs = [grouped.get_group(dataset) for dataset in config.DATASETS]

# NOTE(review): get_features_analysis profiles the rows whose grupo is
# "concatenated-dataset", which suggests that name is one of config.DATASETS.
# If that dataset is the union of the others, it stays in the training folds
# while each individual dataset is held out, leaking test rows into training
# -- confirm.
for fs_model, fs_results in features_report.items():
    fs_model_report = fs_report.setdefault(fs_model, {})

    fs_filtered_results = list(fs_results_filter(fs_results["features-result"],config.N_FEATURES_BEST_THRESHOLD))
    fs_filtered_results.extend(["LABEL","grupo"])
    # NOTE(review): both report entries below include the "LABEL" and "grupo"
    # bookkeeping columns, so "N Features" is two higher than the number of
    # selected features. Left unchanged because the consumers of
    # final-results.json are not visible here -- confirm whether intended.
    fs_model_report["N Features"] = len(fs_filtered_results)
    fs_model_report["Features"] = fs_filtered_results

    # Keep df's column order; selected names missing from df are skipped.
    columns = [x for x in df.columns if x in fs_filtered_results]
    get_features_analysis(df,fs_model,columns)

    fs_dfs = [dataset_df[columns] for dataset_df in dfs]
    for fold_idx, (fold_df, dataset) in enumerate(zip(fs_dfs, config.DATASETS)):
        # Exclude the test fold by position rather than with DataFrame.equals,
        # which compared every value of every fold on each iteration.
        train = [df_train for idx, df_train in enumerate(fs_dfs) if idx != fold_idx]

        # The split is built once per fold so that every model is trained and
        # evaluated on exactly the same under-sampled data.
        train_df = under_sampling(pd.concat(train),False)
        test_df = under_sampling(fold_df,False)

        X_train = train_df.drop(["LABEL","grupo"],axis=1)
        y_train = train_df["LABEL"]
        X_test = test_df.drop(["LABEL","grupo"],axis=1)
        y_test = test_df["LABEL"]

        for model_name, model in models.items():
            if model_name not in config.MODELS:
                continue

            model.fit(X_train, y_train.values.ravel())
            y_predicted = model.predict(X_test)

            fs_model_report.setdefault(model_name, {})[dataset] = model_report(y_test,y_predicted)

Summarize dataset: 100%|██████████| 48/48 [00:30<00:00,  1.58it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.26s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 108.15it/s]
Summarize dataset: 100%|██████████| 43/43 [00:23<00:00,  1.80it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.94s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 103.16it/s]
Summarize dataset: 100%|██████████| 43/43 [00:24<00:00,  1.79it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.90s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 114.98it/s]


# 8. Save Feature Selection Report

In [20]:
# Persist the accumulated report under config.REPORT_PATH.
path = f"{config.REPORT_PATH}"

# Serialize before opening the file: mode 'w' truncates it immediately, so a
# serialization error raised by json.dump on the open file would leave the
# previous results destroyed or half-written.
serialized_report = json.dumps(fs_report)
with open(f"{path}final-results.json", 'w', encoding='utf-8') as f:
    f.write(serialized_report)