In [1]:
import json
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from tqdm.notebook import tqdm

current_path = os.path.abspath(os.getcwd())
sys.path.append(f"{current_path}/../libs")

from plot import plot_confusion_matrix, plot_table
from dataframe import read_dataset, split_train_test
from report import dataset_info_report, model_report, feature_selection_report
from preprocess import remove_sparse_rows, remove_sparse_columns, under_sampling, one_hot_encoded
from feature_selection import get_k_best, fs_results_filter, get_best_features
from cross_validation import get_folds
from config import load_config

### 1. Configs

In [2]:
# Reading Configs and Defining Constants and Globals
# Project settings are loaded from ../config.yaml (path relative to the notebook's working directory).
config = load_config("../config.yaml")
# NOTE(review): fs_report is not referenced by any other cell visible in this notebook — confirm it is still needed.
fs_report = {}
# Accumulator for the evaluation results: the experiment cells add one top-level
# key per experiment and the final cell dumps the whole dict to JSON.
model_results = {}

### 2. Datasets

In [3]:
# Reading Normalized Datasets
# Every dataset named in config.DATASETS is loaded, its ID_PACIENTE column is
# dropped, and a "grupo" column holding the dataset name is added so the rows can
# be split back per dataset after concatenation.
# The frames are built inside a comprehension on purpose: the previous loop bound
# the global name `df`, which a later cell uses for the final modelling frame, so
# re-running this cell out of order silently replaced the modelling data.
complete_data = [
    read_dataset(dataset).drop("ID_PACIENTE", axis=1).assign(grupo=dataset)
    for dataset in config.DATASETS
]
# NOTE(review): concat keeps each dataset's own row labels, so raw_df's index may
# contain repeated labels — confirm downstream label-based alignment is intended.
raw_df = pd.concat(complete_data)


### 3. Reports

In [4]:
# Reading Features Results
# The report is a JSON file stored under config.REPORT_PATH; REPORT_PATH is used
# as a plain string prefix in front of the file name.
with open(f"{config.REPORT_PATH}best-features-result.json") as report_file:
    features_report = json.load(report_file)

### 4. Preprocess

In [5]:
# Preprocess Datasets
# Apply the column filter first and the row filter second (both helpers come from
# the project's preprocess module).
preprocessed_df = remove_sparse_rows(remove_sparse_columns(raw_df))
# "grupo" is kept out of the one-hot encoding step and re-attached afterwards.
data_df = preprocessed_df.drop(columns=["grupo"])
one_hot_encoded_df = one_hot_encoded(data_df)
# NOTE(review): this assignment aligns on the index — assumes one_hot_encoded
# keeps the row labels of its input; confirm.
one_hot_encoded_df["grupo"] = preprocessed_df["grupo"]
# df is another name for one_hot_encoded_df (same object, not a copy).
df = one_hot_encoded_df


### 5. Machine Learning Models

In [6]:
# Models
# Candidate classifiers keyed by their display name; the experiment cells only
# run the entries whose name is listed in config.MODELS.
# Every estimator with a random component is given the same fixed seed so that a
# Restart & Run All reproduces the saved metrics (previously only the random
# forest was seeded). For SVC the seed only affects the probability estimates.
models = {
    "Decision Tree" : tree.DecisionTreeClassifier(random_state=42),
    "Random Forest" : RandomForestClassifier(random_state=42),
    "KNN" : KNeighborsClassifier(),
    "Logistic Regression" : LogisticRegression(max_iter=1000),
    "SVM" : SVC(probability=True, class_weight='balanced', random_state=42)
}

In [7]:
# Average the per-fold model reports
def get_avg_folds(metrics):
    """Average a list of per-fold evaluation reports into one summary.

    Args:
        metrics: Sized iterable of per-fold reports. Each report is a mapping
            with the keys 'accuracy', 'precision', 'recall', 'f1-score' and
            'confusion matrix'; the latter is a mapping providing at least
            'tp' and 'tn'. Every value is converted with float().

    Returns:
        dict with the mean 'accuracy', 'precision', 'recall' and 'f1-score',
        plus a 'confusion-matrix' dict holding the mean 'tp' and 'tn' and the
        derived 'fp' (1 - mean tp) and 'fn' (1 - mean tn).

    Raises:
        ZeroDivisionError: if ``metrics`` is empty.
    """
    score_keys = ("accuracy", "precision", "recall", "f1-score")
    cell_keys = ("tp", "tn")
    score_totals = dict.fromkeys(score_keys, 0)
    cell_totals = dict.fromkeys(cell_keys, 0)

    # Plain running totals, added in fold order.
    for fold_report in metrics:
        for key in score_keys:
            score_totals[key] += float(fold_report[key])
        for key in cell_keys:
            cell_totals[key] += float(fold_report['confusion matrix'][key])

    fold_count = len(metrics)
    averaged = {key: score_totals[key] / fold_count for key in score_keys}

    mean_tp = cell_totals["tp"] / fold_count
    mean_tn = cell_totals["tn"] / fold_count
    # NOTE(review): fp/fn are taken as the complements of tp/tn, which presumes
    # the per-fold confusion matrix is normalised so that tp + fp == 1 and
    # tn + fn == 1 — confirm against model_report.
    averaged["confusion-matrix"] = {
        "tp": mean_tp,
        "tn": mean_tn,
        "fp": 1 - mean_tp,
        "fn": 1 - mean_tn,
    }
    return averaged
    
    

### 6. Between Folds

In [8]:
#  Train and test models
# model_results["between-folds"]={}
# for fs_model, fs_model_results in tqdm(features_report.items(),total=len(features_report),desc=f"Analyzing feature selection models"):
#     model_results["between-folds"][fs_model]={}
#     total_features = fs_model_results["concatenated-dataset"]
#     n = len(total_features)
#     for ml_model, model in tqdm(models.items(),total=len(models),desc="Aplying machine learning models"):
#         if ml_model in config.MODELS:
#             model_results["between-folds"][fs_model][ml_model]={}
#             for n_features in tqdm(range(n),total=n,desc=f"Variating the number of features"):
#                 if n_features in [0,1,2,3] or (n_features+1) % 5 == 0:
#                     features = total_features[:n_features+1]
#                     features.extend(["LABEL","grupo"])
#                     columns = [x for x in df.columns if x in features]
#                     grouped = df.groupby(df.grupo)
#                     complete_df_list = [grouped.get_group(group) for group in config.DATASETS]
#                     df_list = [df[columns] for df in complete_df_list][1:]
#                     models_metrics = []
#                     for fold,fold_name in zip(df_list,config.DATASETS[1:]):
#                         train = [df_train for df_train in df_list if not df_train.equals(fold)]
#                         train_df = under_sampling(pd.concat(train),False)
#                         test_df = under_sampling(fold,False)
#                         X_train, y_train, X_test, y_test = split_train_test(train_df,test_df)
#                         model.fit(X_train, y_train.values.ravel())
#                         y_predicted=model.predict(X_test)
#                         models_metrics.append(model_report(y_test,y_predicted))
#                     model_results["between-folds"][fs_model][ml_model][f"{n_features+1} features"] = get_avg_folds(models_metrics)

### 7. Between Datasets

In [9]:
#  Train and test models
# For every feature-selection method, the feature lists of the individual
# datasets (everything except the "concatenated-dataset" entry) are merged with
# get_best_features. Each enabled model is then evaluated with a growing prefix
# of that merged list, holding out one dataset at a time as the test fold.
model_results["between-datasets"]={}

# The per-dataset frames do not depend on the feature-selection method, the model
# or the feature count, so `df` is split once here instead of being re-grouped
# inside the innermost loop.
grouped = df.groupby(df.grupo)
complete_df_list = [grouped.get_group(group) for group in config.DATASETS]

for fs_model, fs_model_results in tqdm(features_report.items(),total=len(features_report),desc="Analyzing feature selection models"):
    model_results["between-datasets"][fs_model]={}
    best_features_list = [features for dataset,features in fs_model_results.items() if dataset != "concatenated-dataset"]
    total_features = get_best_features(best_features_list)
    n = len(total_features)
    for ml_model, model in tqdm(models.items(),total=len(models),desc="Aplying machine learning models"):
        if ml_model not in config.MODELS:
            continue
        model_results["between-datasets"][fs_model][ml_model]={}
        for n_features in tqdm(range(n),total=n,desc="Variating the number of features"):
            # Only 1, 2, 3 and 4 features and every multiple of 5 are evaluated.
            if n_features not in [0,1,2,3] and (n_features+1) % 5 != 0:
                continue
            # Keep the first n_features+1 selected features plus the target and
            # the dataset tag, in the column order of `df`.
            wanted = set(total_features[:n_features+1]) | {"LABEL","grupo"}
            columns = [x for x in df.columns if x in wanted]
            # The first dataset in config.DATASETS is never used as a fold.
            df_list = [group_df[columns] for group_df in complete_df_list[1:]]
            models_metrics = []
            for test_idx, fold in enumerate(df_list):
                # Hold the fold out by position rather than comparing whole
                # frames with DataFrame.equals on every iteration.
                train = [df_train for idx, df_train in enumerate(df_list) if idx != test_idx]
                train_df = under_sampling(pd.concat(train),False)
                test_df = under_sampling(fold,False)
                X_train, y_train, X_test, y_test = split_train_test(train_df,test_df)
                model.fit(X_train, y_train.values.ravel())
                y_predicted=model.predict(X_test)
                models_metrics.append(model_report(y_test,y_predicted))
            model_results["between-datasets"][fs_model][ml_model][f"{n_features+1} features"] = get_avg_folds(models_metrics)

Analyzing feature selection models:   0%|          | 0/8 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/5 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/86 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/86 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/86 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### 8. Between Methods

In [None]:
#  Train and test models
# For every dataset key found in the feature report, the feature lists produced
# by all feature-selection methods are merged with get_best_features. Each
# enabled model is then evaluated with a growing prefix of that merged list,
# holding out one dataset at a time as the test fold.
model_results["between-methods"]={}

# Regroup the report from {method: {dataset: features}} to {dataset: [features, ...]}.
features_list = {}
for fs_model, fs_model_results in features_report.items():
    for dataset, dataset_result in fs_model_results.items():
        # if dataset != "concatenated-dataset":
        features_list.setdefault(dataset, []).append(dataset_result)

# The per-dataset frames do not depend on the report entry, the model or the
# feature count, so `df` is split once here instead of being re-grouped inside
# the innermost loop.
grouped = df.groupby(df.grupo)
complete_df_list = [grouped.get_group(group) for group in config.DATASETS]

for dataset, dataset_results in tqdm(features_list.items(),total=len(features_list),desc="Analyzing datasets"):
    model_results["between-methods"][dataset]={}
    total_features = get_best_features(dataset_results)
    n = len(total_features)
    for ml_model, model in tqdm(models.items(),total=len(models),desc="Aplying machine learning models"):
        if ml_model not in config.MODELS:
            continue
        model_results["between-methods"][dataset][ml_model]={}
        for n_features in tqdm(range(n),total=n,desc="Variating the number of features"):
            # Only 1, 2, 3 and 4 features and every multiple of 5 are evaluated.
            if n_features not in [0,1,2,3] and (n_features+1) % 5 != 0:
                continue
            # Keep the first n_features+1 selected features plus the target and
            # the dataset tag, in the column order of `df`.
            wanted = set(total_features[:n_features+1]) | {"LABEL","grupo"}
            columns = [x for x in df.columns if x in wanted]
            # The first dataset in config.DATASETS is never used as a fold.
            df_list = [group_df[columns] for group_df in complete_df_list[1:]]
            models_metrics = []
            for test_idx, fold in enumerate(df_list):
                # Hold the fold out by position rather than comparing whole
                # frames with DataFrame.equals on every iteration.
                train = [df_train for idx, df_train in enumerate(df_list) if idx != test_idx]
                train_df = under_sampling(pd.concat(train),False)
                test_df = under_sampling(fold,False)
                X_train, y_train, X_test, y_test = split_train_test(train_df,test_df)
                model.fit(X_train, y_train.values.ravel())
                y_predicted=model.predict(X_test)
                models_metrics.append(model_report(y_test,y_predicted))
            model_results["between-methods"][dataset][ml_model][f"{n_features+1} features"] = get_avg_folds(models_metrics)

Analyzing datasets:   0%|          | 0/6 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/4 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/80 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/80 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/80 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Variating the number of features:   0%|          | 0/80 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/4 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/4 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Variating the number of features:   0%|          | 0/79 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/4 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/75 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/75 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/75 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Variating the number of features:   0%|          | 0/75 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/4 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/82 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/82 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/82 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Variating the number of features:   0%|          | 0/82 [00:00<?, ?it/s]

Aplying machine learning models:   0%|          | 0/4 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/61 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/61 [00:00<?, ?it/s]

Variating the number of features:   0%|          | 0/61 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Variating the number of features:   0%|          | 0/61 [00:00<?, ?it/s]

### 9. Save

In [None]:
# Save Report
# All collected results are serialised as a single JSON document.
# NOTE(review): REPORT_PATH is used as a raw string prefix, so it presumably ends
# with a path separator — confirm against the config file.
path = "{}".format(config.REPORT_PATH)

# with open(f"{path}models_result.json", 'w') as f:
with open(path + "models-result-new.json", mode='w') as f:
    json.dump(model_results, f)