# URLs para obtenção dos datasets (Obtido dia 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do hospital Sirio Libanes: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 (2020-06-30)
- ### Dataset do hospital Beneficencia Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do hospital das clinicas da faculdade de medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [13]:
import json
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from plot import plot_confusion_matrix, plot_table
from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best_features
from cross_validation import get_folds

# 2. Defining Constants and Globals

In [14]:
DATASETS_PATH = "../dataset/"
COMPLETE_DATASET_REPORT_PATH = "../reports/complete-dataset/"
EACH_DATASET_REPORT_PATH = "../reports/each-dataset/"
INTERIM_REPORT_PATH = "../reports/interim/"
PLOTS_PATH = "../plots/"
REPORT_PATH = "../reports/"
DATASETS = [
    "albert-einstein",
    "beneficencia-portuguesa",
    "hospital-de-clinicas",
    "sirio-libanes",
    "grupo-fleury",
    ]

K_FOLDS = 5

N_FEATURES = 10
FS_THRESHOLD = 0.25

fs_report = {}

# 3. Auxiliar Functions

In [15]:
# def fs_results_filter(fs_results):
#     total = K_FOLDS*len(DATASETS)
#     fs_filtered_results = []
#     for feature,times in fs_results.items():
#         if times > total * FS_THRESHOLD:
#             fs_filtered_results.append[feature]
#         else:
#             return fs_filtered_results

def fs_results_filter(fs_results):
    n_features = N_FEATURES
    fs_filtered_results = []
    ultimo = 0
    for feature,times in fs_results.items():
        if  n_features>0 or times==ultimo:
            fs_filtered_results.append(feature)
            n_features-=1
            if n_features == 0:
                ultimo = times
        else:
            return fs_filtered_results


# 4. Reading Normalized Datasets

In [16]:
complete_data = []
for dataset in DATASETS:
    df = read_dataset(dataset).drop("ID_PACIENTE",axis=1)
    df["grupo"] = dataset
    complete_data.append(df)


# 5. Reading Interim

In [17]:
with open(INTERIM_REPORT_PATH+'features.json', 'r') as fp:
    features_report = json.load(fp)["results"]

# 6. Preprocess Data

In [18]:
preprocessed_data=[]
for df in complete_data:
    preprocessed_df = remove_sparse_columns(df)
    preprocessed_df = remove_sparse_rows(preprocessed_df)
    one_hot_encoded_df = one_hot_encoded(preprocessed_df)
    preprocessed_data.append(one_hot_encoded_df)

# 7. Apply models

In [19]:
models = {
    "Decision Tree" : tree.DecisionTreeClassifier(),
    "Random Forest" : RandomForestClassifier(random_state=42),
    "KNN" : KNeighborsClassifier(),
    # "Logistic Regression" : LogisticRegression(max_iter=1000),
    # "SVM" : SVC(probability=True, class_weight='balanced')
}

In [44]:
for fs_model, fs_results in features_report.items():
    if fs_model not in fs_report:
        fs_report[fs_model]={}
    # fs_filtered_results = fs_results_filter(fs_results)
    fs_filtered_results = list(fs_results.keys())
    fs_filtered_results.append("LABEL")
    fs_report[fs_model]["N Features"] = len(fs_filtered_results)
    fs_report[fs_model]["Features"] = fs_filtered_results
    for df,dataset in zip(preprocessed_data,DATASETS):
        columns = [x for x in df.columns if x in fs_filtered_results]
        fs_dfs = df[columns]
        for model_name, model in models.items():
            if model_name not in fs_report[fs_model]:
                fs_report[fs_model][model_name]={}
            if dataset not in fs_report[fs_model][model_name]:
                fs_report[fs_model][model_name][dataset]={}
            train = [df_train for df_train in preprocessed_data if not df_train.equals(df)]
            print(len(train))
            train_df = pd.concat(train)
            train_df.fillna(0, inplace=True)

            print(train_df.shape)
            X_train = train_df.drop(["LABEL"],axis=1)
            y_train = train_df["LABEL"]
            X_test = df.drop(["LABEL"],axis=1)
            y_test = df["LABEL"]
            
            print(X_train)
            # X = df.drop(["LABEL"],axis=1)
            # y = df["LABEL"]
            # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            model.fit(X_train, y_train.values.ravel())
            y_predicted=model.predict(X_test)
            
            fs_report[fs_model][model_name][dataset]=model_report(y_test,y_predicted)                


4
(47225, 86)
(47225, 85)
(47225,)


Feature names unseen at fit time:
- tgp_BAIXO
Feature names seen at fit time, yet now missing:
- bilirrubina total_BAIXO
- fosforo_ALTO
- fosforo_BAIXO
- fosforo_NORMAL
- hemograma hematocrito_ALTO
- ...



ValueError: X has 79 features, but DecisionTreeClassifier is expecting 85 features as input.

# 8. Save Feature Selection Report

In [None]:
path = f"{INTERIM_REPORT_PATH}"

with open(path+'feature-selection-results.json', 'w') as f:
    json.dump(fs_report, f)