<a href="https://colab.research.google.com/github/pszemraj/ml4hc-s22-project01/blob/start-ensemble/notebooks/colab/ensemble/Compile_Trained_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyze Trained Results


# setup

In [1]:
!nvidia-smi

Mon Mar 28 02:32:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
#@markdown add auto-Colab formatting with `IPython.display`
from IPython.display import HTML, display
# colab formatting
def set_css():
    display(
        HTML(
            """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
        )
    )

get_ipython().events.register("pre_run_cell", set_css)

In [3]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from tensorflow.keras import optimizers, losses, activations, models
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.layers import Dense, Input, Dropout, Convolution1D, MaxPool1D, GlobalMaxPool1D, GlobalAveragePooling1D,concatenate,Flatten,\
Dense,Dropout,LSTM,Masking,Bidirectional,Dropout,GRU,SimpleRNN,TimeDistributed, BatchNormalization, Activation, MaxPooling1D, GlobalMaxPooling1D, Conv1D
from keras.models import Sequential,Model
import h5py
from sklearn.metrics import f1_score,accuracy_score, roc_auc_score, average_precision_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import collections


In [4]:
#@title mount drive
from pathlib import Path
from google.colab import drive

drive_base_str = '/content/drive'
drive.mount(drive_base_str)


Mounted at /content/drive


In [5]:
#@markdown determine root
import os
from pathlib import Path
peter_base = Path('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/')

if peter_base.exists() and peter_base.is_dir():
    path = str(peter_base.resolve())
else:
    # original
    path = '/content/drive/MyDrive/ETH/'

print(f"base drive dir is:\n{path}")

base drive dir is:
/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1


## define folder for outputs

In [6]:
_out_dir_name = "Trained-Results-Analysis" #@param {type:"string"}

output_path = os.path.join(path, _out_dir_name)
os.makedirs(output_path, exist_ok=True)
print(f"notebook outputs will be stored in:\n{output_path}")

notebook outputs will be stored in:
/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/Trained-Results-Analysis


##load data

In [7]:
def load_mitbih(base_path):
    df_train = pd.read_csv(os.path.join(base_path,"data/mitbih_train.csv"),
                           header=None)
    df_train = df_train.sample(frac=1)
    df_test = pd.read_csv(os.path.join(base_path,"data/mitbih_test.csv"),
                          header=None)

    Y = np.array(df_train[187].values).astype(np.int8)
    X = np.array(df_train[list(range(187))].values)[..., np.newaxis]

    Y_test = np.array(df_test[187].values).astype(np.int8)
    X_test = np.array(df_test[list(range(187))].values)[..., np.newaxis]

    return X, X_test, Y, Y_test

In [8]:
backup_peter = '/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/project-handouts/'

In [9]:
try:
    X, X_test, Y, Y_test = load_mitbih(base_path=path)
except Exception as e:
    print(f"unable to load data from base path in drive folder because:\n\t{e}")
    print(f"\ngoing to try backup:\n\t{backup_peter}")
    X, X_test, Y, Y_test = load_mitbih(base_path=backup_peter)

## load directories with trained models

In [10]:
import pprint as pp
project_root = Path(path)
weight_dirs = [d for d in project_root.iterdir() if d.is_dir() and "weight" in d.name]

pp.pprint(weight_dirs)

[PosixPath('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/MITBIH_weights'),
 PosixPath('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/PTB_weights'),
 PosixPath('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/MITBIH_biGRU_weights')]


# get predictions for all MITBIH models

In [26]:
mit_out_path = Path(output_path) / "MIT_ensemble"
mit_out_path.mkdir(exist_ok=True)

In [12]:
MIT_df = pd.DataFrame(Y_test, columns=['actual_class'])
MIT_df.head()

Unnamed: 0,actual_class
0,0
1,0
2,0
3,0
4,0


In [13]:
mitbih_dirs = [d for d in weight_dirs if "mitbih" in d.name.lower()]
pp.pprint(mitbih_dirs)


[PosixPath('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/MITBIH_weights'),
 PosixPath('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/MITBIH_biGRU_weights')]


In [14]:
mitbih_fitted_models = []

for weight_dir in mitbih_dirs:

    model_paths = [f for f in weight_dir.iterdir() if f.is_file() and f.suffix == ".h5"]
    mitbih_fitted_models.extend(model_paths)

print(f"found {len(mitbih_fitted_models)} total models to compute preds for MIT")


found 18 total models to compute preds for MIT


### loop through all files, store filename as column

In [16]:
import gc
from tqdm.auto import tqdm
from keras.models import load_model

pbar = tqdm(desc="computing model predictions", 
            total=len(mitbih_fitted_models)
        )

for mpath in mitbih_fitted_models:

    try:
        model_name = mpath.stem
        model = load_model(mpath)
        pred_test = model.predict(X_test)
        pred_test = np.argmax(pred_test, axis=-1)

        MIT_df[f"{model_name}_preds"] = pred_test

        del model
        del pred_test
    except Exception as e:
        print(f"\nUnable to generate predictions for {mpath.name}, skipping")
        print(f"Error printout as follows:{e}")
    gc.collect()
    pbar.update(1)
pbar.close()

computing model predictions:   0%|          | 0/18 [00:00<?, ?it/s]

In [17]:

mit_df_base = mit_out_path / "MITBIH_testset_model_predictions"

MIT_df.to_csv(mit_df_base.with_suffix('.csv'), index=False)
MIT_df.to_excel(mit_df_base.with_suffix('.xlsx'), index=False)

In [18]:
MIT_df.head()

Unnamed: 0,actual_class,BIRNN10_mitbih_preds,BILSTM187_mitbih_preds,BILSTM187_ptb_preds,BIRNN187_mitbih_preds,BILSTM_mitbih_preds,CNN_mitbih_preds,LTSM_mitbih_preds,BidirGRU_preds,RNN_mitbih_preds,GRU_mitbih_preds,SIMPLE_RNN_mitbih_preds,BidirGRU_BS=1024_preds,BILSTM10_mitbih_preds
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,4,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,2,0,0,0,0,0,0,0,0,0,0,0,0


# PTB

In [47]:
def load_ptb(base_path):
    df_1 = pd.read_csv(os.path.join(base_path,"data/ptbdb_normal.csv"),
                           header=None)
    df_2 = pd.read_csv(os.path.join(base_path,"data/ptbdb_abnormal.csv"),
                          header=None)
    df = pd.concat([df_1, df_2])

    df_train, df_test = train_test_split(df, test_size=0.2, random_state=1337, stratify=df[187])

    Y = np.array(df_train[187].values).astype(np.int8)
    X = np.array(df_train[list(range(187))].values)[..., np.newaxis]

    Y_test = np.array(df_test[187].values).astype(np.int8)
    X_test = np.array(df_test[list(range(187))].values)[..., np.newaxis]
    

    return X, X_test, Y, Y_test

In [48]:
try:
    X_ptb, X_test_ptb, Y_ptb, Y_test_ptb = load_ptb(base_path=path)
except Exception as e:
    print(f"unable to load data from base path in drive folder because:\n\t{e}")
    print(f"\ngoing to try backup:\n\t{backup_peter}")
    X_ptb, X_test_ptb, Y_ptb, Y_test_ptb = load_ptb(base_path=backup_peter)

In [49]:
X_test_ptb.shape

(2911, 187, 1)

In [50]:
ptb_out_path = Path(output_path) / "PTB_ensemble"
ptb_out_path.mkdir(exist_ok=True)

In [51]:
PTB_df = pd.DataFrame(Y_test_ptb, columns=['actual_class'])
PTB_df.head()

Unnamed: 0,actual_class
0,0
1,1
2,0
3,1
4,1


In [52]:
PTB_df.shape

(2911, 1)

In [53]:
ptb_dirs = [d for d in weight_dirs if "ptb" in d.name.lower()]
pp.pprint(ptb_dirs)


[PosixPath('/content/drive/MyDrive/ETHZ-2022-S/ML-healthcare-projects/project1/PTB_weights')]


In [54]:
ptb_fitted_models = []

for weight_dir in ptb_dirs:

    model_paths = [f for f in weight_dir.iterdir() if f.is_file() and f.suffix == ".h5"]
    ptb_fitted_models.extend(model_paths)

print(f"found {len(ptb_fitted_models)} total models to compute preds for MIT")


found 5 total models to compute preds for MIT


### loop through all files, store filename as column

In [55]:
import gc
from tqdm.auto import tqdm
from keras.models import load_model

pbar = tqdm(desc="computing model predictions", 
            total=len(ptb_fitted_models)
        )

for mpath in ptb_fitted_models:

    try:
        model_name = mpath.stem
        model = load_model(mpath)
        PTB_pred_test= model.predict(X_test_ptb)
        pred_test = np.argmax(PTB_pred_test, axis=-1)

        PTB_df[f"{model_name}_preds"] = pred_test

        del model
        del pred_test
    except Exception as e:
        print(f"\nUnable to generate predictions for {mpath.name}, skipping")
        print(f"Error printout as follows:{e}")
    gc.collect()
    pbar.update(1)
pbar.close()

computing model predictions:   0%|          | 0/5 [00:00<?, ?it/s]

In [56]:

PTB_df_base = ptb_out_path / "ptb_testset_model_predictions"

PTB_df.to_csv(PTB_df_base.with_suffix('.csv'), index=False)
PTB_df.to_excel(PTB_df_base.with_suffix('.xlsx'), index=False)

In [57]:
PTB_df.head()

Unnamed: 0,actual_class,BILSTM187_ptb-2_preds,BidirGRU_ptb_preds,CNN_PTB_preds,GRU_ptb_preds,RNN_PTB_preds
0,0,0,0,0,0,1
1,1,1,1,1,1,1
2,0,0,0,0,0,1
3,1,1,1,1,1,1
4,1,1,1,1,1,1


---

# test basic ensembling with PyCaret

- docs can be found [here](https://pycaret.gitbook.io/docs/get-started/tutorials)


In [62]:
!!pip install -U pycaret[full] -q

[]

In [63]:
from pycaret.utils import enable_colab
enable_colab()

Colab mode enabled.


## try ptb first

In [None]:
from pycaret.classification import setup
ptb_exp = setup(data = PTB_df, target = 'actual_class', session_id=123) 

In [None]:
ptb_best = compare_models()

In [None]:
lr = create_model('lr')

In [None]:
tuned_lr = tune_model(lr)

In [None]:
plot_model(tuned_lr, plot = 'confusion_matrix')

# try lazypredict

In [72]:
!pip install -U lazypredict -q
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[K     |████████████████████████████████| 24.8 MB 2.3 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.1
    Uninstalling scikit-learn-0.23.1:
      Successfully uninstalled scikit-learn-0.23.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pycaret 2.3.9 requires scikit-learn==0.23.2, but you have scikit-learn 1.0.2 which is incompatible.
lazypredict 0.2.9 requires scikit-learn==0.23.1, but you have scikit-learn 1.0.2 which is incompatible.
explainerdashboard 0.3.8 requires pandas>=1.1, but you have pandas 1.0.5 which is incompatible.
evidently 0.1.47.dev1 requires numpy>=1.19.5, but you have numpy 1.19.1 which is incompatible.
evidently 0.1.47

In [73]:
lf_ptb_df = PTB_df.copy()
y = lf_ptb_df.actual_class.values
del lf_ptb_df["actual_class"]
X = lf_ptb_df.values
print(X.shape, y.shape)

(2911, 5) (2911,)


In [74]:
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

ModuleNotFoundError: ignored