<a href="https://colab.research.google.com/github/pszemraj/ml4hc-s22-project01/blob/update-notebooks-2/notebooks/colab/autogluon_tabular_ptbdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <center> autogluon for ptbdb dataset </center>

- see how autoML does on ptbdb dataset
- docs can be found [here](https://auto.gluon.ai/stable/tutorials/tabular_prediction/tabular-quickstart.html)


---



## setup

In [1]:
#@markdown add auto-Colab formatting with `IPython.display`
from IPython.display import HTML, display
# colab formatting
def set_css(*_event_args):
    """
    set_css - inject a CSS rule so that <pre> output wraps instead of overflowing.

    Registered below as an IPython "pre_run_cell" callback, so it is invoked
    before every cell execution. IPython documents this event as passing an
    execution-info object to its callbacks; any positional arguments are
    accepted and ignored so the callback works whether or not the running
    IPython version passes one.
    """
    wrap_rule = """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
    display(HTML(wrap_rule))


# run the CSS injection ahead of every cell so each output area gets the rule
get_ipython().events.register("pre_run_cell", set_css)

In [2]:
#@title print out GPU info
#@markdown this is the Colab-allocated GPU. If the output here says it fails, no
#@markdown GPU is being used. go to runtime at the top of your colab to set runtime to GPU.


!nvidia-smi

Sun Mar 27 20:58:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install autogluon -q 

[K     |████████████████████████████████| 267 kB 13.6 MB/s 
[K     |████████████████████████████████| 188 kB 51.4 MB/s 
[K     |████████████████████████████████| 59 kB 3.7 MB/s 
[K     |████████████████████████████████| 48 kB 5.0 MB/s 
[K     |████████████████████████████████| 133 kB 55.6 MB/s 
[K     |████████████████████████████████| 38.1 MB 63.9 MB/s 
[K     |████████████████████████████████| 802 kB 41.4 MB/s 
[K     |████████████████████████████████| 1.0 MB 49.3 MB/s 
[K     |████████████████████████████████| 132 kB 57.5 MB/s 
[K     |████████████████████████████████| 54.7 MB 179 kB/s 
[K     |████████████████████████████████| 296 kB 71.2 MB/s 
[K     |████████████████████████████████| 187 kB 69.7 MB/s 
[K     |████████████████████████████████| 2.0 MB 55.0 MB/s 
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
[K     |████████████████████████████████| 166.7 MB 18 kB/s 
[K     |████████████████████████████████| 248 kB 76.7 MB/s 
[?25h  Installing build de

In [4]:
#@title define source data parameters

#@markdown - these can also be loaded from gdrive, but I am lazy and `wget` does not require login

# remote location and local filename for the training CSV (fetched with wget in the data section)
ptbdb_train_url = "https://www.dropbox.com/s/nxwz0rlyckcfq0v/torchfmt_ptbdb_train.csv?dl=1" #@param {type:"string"}
ptbdb_train_filename = "ptbdb_train.csv" #@param {type:"string"}
# remote location and local filename for the test CSV
ptbdb_test_url = "https://www.dropbox.com/s/ysey3qdpvgqvoz2/torchfmt_ptbdb_test.csv?dl=1" #@param {type:"string"}
ptbdb_test_filename = "ptbdb_test.csv" #@param {type:"string"}

In [5]:
#@title autogluon parameters 

#@markdown - `max_time_fitting` is in seconds

# passed to fit() as time_limit; value is in seconds (10800 s = 3 h)
max_time_fitting =  10800#@param {type:"integer"}
# passed to fit() as presets
fit_preset = "best_quality" #@param ["best_quality", "high_quality"]
# passed to TabularPredictor as eval_metric
metric = 'accuracy'  #@param ["roc_auc", "accuracy", "f1"]



### util functions

In [6]:
from pathlib import Path
import os
def scour_for_file(
    file_name: str,
    search_root=None,
    return_first=True,
    verbose=False,
):
    """
    scour_for_file - recursively search a directory tree for files named file_name.

    Args:
        file_name: exact file name to look for (compared against the file names
            in each visited directory).
        search_root: directory to start from; defaults to the current working directory.
        return_first: if True, stop at the first match and return that single path;
            if False, walk the whole tree and return every match.
        verbose: if True, also print the matched path(s).

    Returns:
        A resolved Path (return_first=True), a list of resolved Paths
        (return_first=False), or None when nothing matches.
    """
    search_root = Path.cwd() if search_root is None else Path(search_root)
    matches = []
    for root, dirs, files in os.walk(search_root.resolve()):
        # visit sub-directories in sorted order so the "first" match does not
        # depend on the order in which the filesystem lists entries
        dirs.sort()
        if file_name not in files:
            continue
        found = (Path(root) / file_name).resolve()
        if return_first:
            if verbose:
                print(f"first match for {file_name} in {search_root}: {found}")
            return found
        matches.append(found)

    # reaching this point with return_first=True means nothing matched
    if not matches:
        print(f"NOTE: zero matches found for {file_name} in {search_root}")
        return None

    print(f"{len(matches)} matches found for {file_name} in {search_root}")
    if verbose:
        print(f"matched paths are as follows:\n{matches}")
    return matches


In [14]:
from google.colab import files
from pathlib import Path
import pathlib
import time
import warnings
from tqdm.auto import tqdm

def download_colab(my_files):
    """
    download_colab - push one or more files from the Colab runtime to the browser.

    my_files may be a list of paths, a single str, or a single Path().
    None triggers a warning and nothing is downloaded. List entries that are
    not existing regular files are skipped.
    """
    if my_files is None:
        warnings.warn(message=f"received {my_files}, nothing to DL")
        return None

    # anything that is not a list must be exactly one path-like value
    if not isinstance(my_files, list):
        assert isinstance(my_files, (str, pathlib.Path)), "non-list needs to be str or Path()"
        files.download(Path(my_files))
        return None

    # keep only entries that point at an existing regular file
    downloadable = [entry for entry in my_files if Path(entry).is_file()]
    progress = tqdm(total=len(downloadable), desc="downloading files..")
    for entry in downloadable:
        files.download(entry)
        progress.update(1)
        time.sleep(3) # prevent browsers auto-blocking downloads
    progress.close()




### data

In [8]:

!wget $ptbdb_train_url -O $ptbdb_train_filename -q
!wget $ptbdb_test_url -O $ptbdb_test_filename -q

In [9]:
import pandas as pd
# read the training CSV to discover the column layout
example_df = pd.read_csv(ptbdb_train_filename)
data_cols = example_df.columns.tolist()
# the label is the last column; popping it leaves only the remaining columns in data_cols
_target = data_cols.pop()
_predictors = data_cols # everything except the label is treated as a predictor

print(f"the target colname is {_target} and\nthe predictor colnames 5 of {len(_predictors)} are {_predictors[:5]}")

the target colname is class_label and
the predictor colnames 5 of 187 are ['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4']


## fit model

In [10]:
import torch
from autogluon.tabular import TabularDataset, TabularPredictor
# load both CSVs as AutoGluon tabular datasets
train_data = TabularDataset(ptbdb_train_filename)
test_data = TabularDataset(ptbdb_test_filename)

# build the predictor for the label column and fit it on the training data only;
# the time budget, preset and metric come from the parameter cells above,
# and num_gpus is whatever torch reports as available on this runtime
predictor = TabularPredictor(label=_target, eval_metric=metric).fit(
    train_data,
    time_limit=max_time_fitting,
    presets=fit_preset,
    ag_args_fit={'num_gpus': torch.cuda.device_count()},
)


No path specified. Models will be saved in: "AutogluonModels/ag-20220327_210049/"
Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 10800s
AutoGluon will save models to "AutogluonModels/ag-20220327_210049/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    11642
Train Data Columns: 187
Label Column: class_label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['abnormal', 'normal']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = normal, class 0 = abnormal
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (normal) vs negative (abnormal) class.
	To explicit

In [11]:
# summarise the fit and keep the returned summary in `results`
# NOTE(review): the next cell searches for SummaryOfModels.html - presumably written by show_plot=True; confirm
results = predictor.fit_summary(show_plot=True)


*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.991153      55.951196  7574.346129                0.017853           4.621803            3       True         26
1           LightGBM_BAG_L2   0.990981      54.051606  6467.998650                1.016706         452.150008            2       True         16
2    NeuralNetFastAI_BAG_L2   0.990981      60.666399  6712.757208                7.631498         696.908567            2       True         22
3      LightGBMLarge_BAG_L2   0.990637      54.916638  7117.574319                1.881737        1101.725677            2       True         25
4       WeightedEnsemble_L2   0.989950      19.853544  2162.411832                0.017502           5.212323            2       True         14
5           CatBoost_BAG_L2   0.989864      53.675841  6305.821097  

In [15]:
# look for the summary report under the current working directory and download it;
# scour_for_file returns None when nothing is found, in which case download_colab only warns
summary_report = scour_for_file('SummaryOfModels.html')
download_colab(summary_report)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# NOTE(review): extra_metrics is defined here but the argument below is commented out, so it is unused
extra_metrics=['accuracy', 'roc_auc', 'log_loss', 'f1']

# score the trained models on the test data
# NOTE(review): the saved output shows score_test == 1.0 for every listed model while score_val
# is roughly 0.94-0.99 - verify that the test CSV does not overlap the training CSV
leaderboard = predictor.leaderboard(test_data, 
                                    # extra_metrics=extra_metrics
                                    )

leaderboard

                      model  score_test  score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   RandomForestEntr_BAG_L1    1.000000   0.976636        0.278267       0.778335    26.411969                 0.278267                0.778335          26.411969            1       True          6
1     ExtraTreesGini_BAG_L1    1.000000   0.974833        0.314414       0.776313     3.463508                 0.314414                0.776313           3.463508            1       True          8
2     ExtraTreesEntr_BAG_L1    1.000000   0.976550        0.344286       0.778710     3.289644                 0.344286                0.778710           3.289644            1       True          9
3   RandomForestGini_BAG_L1    1.000000   0.972513        0.363081       0.788137    15.465272                 0.363081                0.788137          15.465272            1       True          5
4     KNei

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr_BAG_L1,1.0,0.976636,0.278267,0.778335,26.411969,0.278267,0.778335,26.411969,1,True,6
1,ExtraTreesGini_BAG_L1,1.0,0.974833,0.314414,0.776313,3.463508,0.314414,0.776313,3.463508,1,True,8
2,ExtraTreesEntr_BAG_L1,1.0,0.97655,0.344286,0.77871,3.289644,0.344286,0.77871,3.289644,1,True,9
3,RandomForestGini_BAG_L1,1.0,0.972513,0.363081,0.788137,15.465272,0.363081,0.788137,15.465272,1,True,5
4,KNeighborsDist_BAG_L1,1.0,0.935922,1.927542,5.650252,0.360595,1.927542,5.650252,0.360595,1,True,2
5,XGBoost_BAG_L1,1.0,0.981876,3.178915,2.664357,177.153115,3.178915,2.664357,177.153115,1,True,11
6,NeuralNetFastAI_BAG_L1,1.0,0.987373,5.470562,5.528364,578.66454,5.470562,5.528364,578.66454,1,True,10
7,LightGBM_BAG_L1,1.0,0.985999,7.89779,5.753955,924.60516,7.89779,5.753955,924.60516,1,True,4
8,LightGBMXT_BAG_L1,1.0,0.985484,7.911883,6.210564,631.711384,7.911883,6.210564,631.711384,1,True,3
9,LightGBMLarge_BAG_L1,1.0,0.983937,8.944115,5.788616,1608.346876,8.944115,5.788616,1608.346876,1,True,13


In [None]:
# compute feature importance against the test data, capped at 600 seconds via time_limit
feat_importance = predictor.feature_importance(test_data, time_limit=600)
feat_importance

## export results

In [17]:
!pip install openpyxl -q

In [18]:
from google.colab import files
import pandas as pd
from pathlib import Path

# both result files are written into the current working directory
_cwd = Path.cwd()
csv_path = _cwd.joinpath("ptbdb_autogluon_results.csv")
xlsx_path = _cwd.joinpath("ptbdb_autogluon_results.xlsx")

# same leaderboard table, once as CSV and once as an Excel workbook
leaderboard.to_csv(csv_path)
leaderboard.to_excel(xlsx_path)


In [19]:
# download the exported leaderboard files; download_colab skips list entries that do not exist
download_colab([csv_path, xlsx_path])


downloading files..:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
def export_model_files():
    """
    export_model_files - download the learner and predictor pickles under a "ptbdb_" prefix.

    Finds learner.pkl and predictor.pkl with scour_for_file (searching from the
    current working directory), writes a "ptbdb_"-prefixed copy next to each one
    and passes the copies to download_colab.

    The files are copied rather than renamed: renaming removed the exact names
    this function searches for, so a second call could not find them.
    NOTE(review): AutoGluon presumably also expects the original file names when
    the model directory is reloaded - confirm.

    Raises:
        FileNotFoundError: if either pickle cannot be located.
    """
    import shutil  # only needed for this export step

    # locate both pickles before touching the filesystem
    source_paths = []
    for pkl_name in ("learner.pkl", "predictor.pkl"):
        located = scour_for_file(pkl_name, verbose=True)
        if located is None:
            raise FileNotFoundError(f"could not find {pkl_name} under {Path.cwd()}")
        source_paths.append(Path(located))

    export_paths = [src.parent / f"ptbdb_{src.name}" for src in source_paths]
    for src, dst in zip(source_paths, export_paths):
        shutil.copy2(src, dst)

    download_colab(export_paths)


In [21]:
# locate the learner/predictor pickles and download them with a "ptbdb_" prefix
export_model_files()

downloading files..:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>