# URLs para obtenção dos datasets (obtidos no dia 07/06/2021)
- ### Explicação dos datasets: https://repositoriodatasharingfapesp.uspdigital.usp.br/
- ### Dataset do hospital Albert Einstein: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/98 - (2020-06-30)
- ### Dataset do hospital Sírio-Libanês: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/97 (2020-06-30)
- ### Dataset do hospital Beneficência Portuguesa: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/101 (2021-04-28)
- ### Dataset do grupo Fleury: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/99 - (2020-06-30)
- ### Dataset do Hospital das Clínicas da Faculdade de Medicina da Universidade de São Paulo: https://repositoriodatasharingfapesp.uspdigital.usp.br/handle/item/100 - (2021-02-17)

# 1. Importing Libraries

In [2]:
import json
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

# Extend the import search path with "<working directory>/../libs" so that the
# imports that follow can be resolved (presumably the project's helper modules
# live there — confirm against the repository layout).
current_path = os.path.abspath(os.curdir)
sys.path.append(current_path + "/../libs")

from plot import plot_confusion_matrix, plot_table
from dataframe import read_dataset
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best, fs_results_filter
from cross_validation import get_folds
from config import load_config

# 2. Defining Constants and Globals

In [3]:
# Load the project configuration from the YAML file one directory above the
# working directory. The cells below read config.DATASETS, config.REPORT_PATH,
# config.N_FEATURES_THRESHOLD and config.MODELS from this object.
config = load_config("../config.yaml")

In [4]:
# NOTE(review): fs_report is initialised here but is neither read nor written
# in the visible cells (results are collected in fs_analysis) — confirm it is
# still needed.
fs_report = {}

# 3. Reading Normalized Datasets

In [5]:
# Load every configured dataset, drop the patient identifier and tag each row
# with the name of the dataset it came from ("grupo"), so the rows can be split
# back per dataset after preprocessing.
complete_data = [
    read_dataset(dataset).drop("ID_PACIENTE", axis=1).assign(grupo=dataset)
    for dataset in config.DATASETS
]

# ignore_index=True gives the combined frame a fresh, unique row index.
# Without it each dataset keeps its own row labels, so labels repeat across
# datasets and any later label-aligned operation (such as re-attaching the
# "grupo" column after one-hot encoding) becomes ambiguous.
raw_df = pd.concat(complete_data, ignore_index=True)


# 4. Reading Features Results

In [6]:
# Load the feature-selection results stored as JSON in the report directory.
# The evaluation loop below iterates over its items and reads the
# "features-result" entry of each value.
with open(f"{config.REPORT_PATH}features-result.json", "r") as fp:
    features_report = json.loads(fp.read())

# 5. Preprocess Data

In [7]:
# Prune sparse columns first, then sparse rows.
preprocessed_df = remove_sparse_rows(remove_sparse_columns(raw_df))

# "grupo" only records which dataset a row came from, so it is kept out of the
# one-hot encoding step and re-attached afterwards.
data_df = preprocessed_df.drop(columns=["grupo"])
one_hot_encoded_df = one_hot_encoded(data_df)

# df is an alias of one_hot_encoded_df (not a copy): adding the column below
# modifies both names.
df = one_hot_encoded_df
df["grupo"] = preprocessed_df["grupo"]


# 6. Analyse the Best Threshold for Feature Selection

In [8]:
# Candidate classifiers, keyed by the display name that is matched against
# config.MODELS in the evaluation loop. Insertion order is the order in which
# the models are fitted and reported.
#
# Every estimator that uses randomness is seeded so that reruns of the notebook
# are reproducible: the decision tree (random feature permutation at each
# split) and the SVC (shuffling for its probability estimates) now get the same
# seed the random forest already used.
models = {
    "Decision Tree": tree.DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True, class_weight='balanced', random_state=42),
}

In [9]:
# Evaluate every enabled classifier on the top-k features of each
# feature-selection method, holding out one dataset ("grupo") at a time:
# the model is trained on all other datasets and tested on the held-out one.
#
# Result layout:
#   fs_analysis[fs_model][model_name][dataset]["<k> features"] = model_report(...)
#
# NOTE(review): the recorded output lists "concatenated-dataset" as one of the
# folds. If that dataset contains the rows of the individual hospitals, the
# held-out rows also appear in the training data — confirm this is intended.
fs_analysis = {}

# The per-dataset split depends neither on the feature-selection method nor on
# the number of features, so it is computed once instead of on every iteration.
grouped = df.groupby(df.grupo)
dfs = [grouped.get_group(dataset) for dataset in config.DATASETS]

# Only the classifiers enabled in the configuration are evaluated.
active_models = {
    model_name: model
    for model_name, model in models.items()
    if model_name in config.MODELS
}

# "LABEL" and "grupo" are not features, hence the "- 2".
max_features = min(config.N_FEATURES_THRESHOLD, len(df.columns) - 2)

for fs_model, fs_results in features_report.items():
    print(f"FS Model: {fs_model}")
    fs_model_analysis = fs_analysis.setdefault(fs_model, {})
    for n_features in range(max_features):
        threshold = f"{n_features+1} features"

        # Keep the selected features plus the target and the dataset tag,
        # preserving the column order of df.
        fs_filtered_results = list(
            fs_results_filter(fs_results["features-result"], n_features + 1)
        )
        fs_filtered_results.extend(["LABEL", "grupo"])
        columns = [x for x in df.columns if x in fs_filtered_results]
        fs_dfs = [dataset_df[columns] for dataset_df in dfs]

        for fold_index, (fold_df, dataset) in enumerate(zip(fs_dfs, config.DATASETS)):
            print(f"     Dataset: {dataset}")

            # Hold the current fold out by position. This replaces a
            # DataFrame.equals() content comparison against every fold, which
            # was repeated for every model.
            train = [
                df_train
                for position, df_train in enumerate(fs_dfs)
                if position != fold_index
            ]

            for model_name, model in active_models.items():
                print(f"          Applying model: {model_name} for {n_features+1} features")

                # under_sampling is deliberately still called once per model:
                # its sampling behaviour is not visible here, so hoisting it
                # could change which rows each model sees.
                train_df = under_sampling(pd.concat(train), False)
                test_df = under_sampling(fold_df, False)

                X_train = train_df.drop(["LABEL", "grupo"], axis=1)
                y_train = train_df["LABEL"]
                X_test = test_df.drop(["LABEL", "grupo"], axis=1)
                y_test = test_df["LABEL"]

                # fit() retrains the shared estimator instance from scratch.
                model.fit(X_train, y_train.values.ravel())
                y_predicted = model.predict(X_test)

                dataset_analysis = fs_model_analysis.setdefault(model_name, {}).setdefault(dataset, {})
                dataset_analysis[threshold] = model_report(y_test, y_predicted)
    print("\n")

FS Model: select-k-best-f-classif
     Dataset: concatenated-dataset
          Applying model: Decision Tree for 1 features
          Applying model: Random Forest for 1 features
          Applying model: KNN for 1 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 1 features
          Applying model: SVM for 1 features
     Dataset: albert-einstein
          Applying model: Decision Tree for 1 features
          Applying model: Random Forest for 1 features
          Applying model: KNN for 1 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 1 features
          Applying model: SVM for 1 features
     Dataset: beneficencia-portuguesa
          Applying model: Decision Tree for 1 features
          Applying model: Random Forest for 1 features
          Applying model: KNN for 1 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 1 features
          Applying model: SVM for 1 features
     Dataset: hospital-de-clinicas
          Applying model: Decision Tree for 1 features
          Applying model: Random Forest for 1 features
          Applying model: KNN for 1 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 1 features
          Applying model: SVM for 1 features
     Dataset: sirio-libanes
          Applying model: Decision Tree for 1 features
          Applying model: Random Forest for 1 features
          Applying model: KNN for 1 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 1 features
          Applying model: SVM for 1 features
     Dataset: grupo-fleury
          Applying model: Decision Tree for 1 features
          Applying model: Random Forest for 1 features
          Applying model: KNN for 1 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 1 features
          Applying model: SVM for 1 features
     Dataset: concatenated-dataset
          Applying model: Decision Tree for 2 features
          Applying model: Random Forest for 2 features
          Applying model: KNN for 2 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 2 features
          Applying model: SVM for 2 features
     Dataset: albert-einstein
          Applying model: Decision Tree for 2 features
          Applying model: Random Forest for 2 features
          Applying model: KNN for 2 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 2 features
          Applying model: SVM for 2 features
     Dataset: beneficencia-portuguesa
          Applying model: Decision Tree for 2 features
          Applying model: Random Forest for 2 features
          Applying model: KNN for 2 features


  _warn_prf(average, modifier, msg_start, len(result))


          Applying model: Logistic Regression for 2 features
          Applying model: SVM for 2 features


# 7. Save Feature Selection Report

In [None]:
# Persist the collected evaluation results as JSON in the report directory.
# The file name is appended directly to the configured path, so
# config.REPORT_PATH is expected to end with a path separator.
path = str(config.REPORT_PATH)

with open(path + "feature-analysis.json", "w") as f:
    json.dump(fs_analysis, f)

NameError: name 'config' is not defined