<a href="https://colab.research.google.com/github/pszemraj/ml4hc-s22-project01/blob/update-notebooks-2/notebooks/colab/autogluon_tabular_mitbih.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <center> AutoGluon for MIT-BIH dataset </center>

- see how AutoML (AutoGluon) performs on the MIT-BIH dataset
- docs can be found [here](https://auto.gluon.ai/stable/tutorials/tabular_prediction/tabular-quickstart.html)


---



## setup

In [1]:
#@markdown add auto-Colab formatting with `IPython.display`
from IPython.display import HTML, display
# colab formatting
def set_css():
    """Display a <style> snippet that switches <pre> elements to `white-space: pre-wrap`.

    Takes no arguments and returns None; the only effect is a single
    `display(HTML(...))` call with the stylesheet below.
    """
    wrap_pre_css = """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
    style_block = HTML(wrap_pre_css)
    display(style_block)

# hook set_css into IPython's "pre_run_cell" event so it is called before each cell runs
get_ipython().events.register("pre_run_cell", set_css)

In [2]:
#@title print out GPU info
#@markdown this is the Colab-allocated GPU. If the output here says it fails, no
#@markdown GPU is being used. go to runtime at the top of your colab to set runtime to GPU.


!nvidia-smi

Sun Mar 27 20:46:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install autogluon -q 

[K     |████████████████████████████████| 188 kB 14.5 MB/s 
[K     |████████████████████████████████| 133 kB 71.8 MB/s 
[K     |████████████████████████████████| 48 kB 6.4 MB/s 
[K     |████████████████████████████████| 59 kB 6.2 MB/s 
[K     |████████████████████████████████| 267 kB 52.5 MB/s 
[K     |████████████████████████████████| 802 kB 47.5 MB/s 
[K     |████████████████████████████████| 38.1 MB 1.7 MB/s 
[K     |████████████████████████████████| 132 kB 53.5 MB/s 
[K     |████████████████████████████████| 1.0 MB 44.5 MB/s 
[K     |████████████████████████████████| 54.7 MB 1.2 MB/s 
[K     |████████████████████████████████| 296 kB 70.6 MB/s 
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
[K     |████████████████████████████████| 187 kB 73.6 MB/s 
[K     |████████████████████████████████| 166.7 MB 18 kB/s 
[K     |████████████████████████████████| 2.0 MB 58.2 MB/s 
[K     |████████████████████████████████| 4.3 MB 51.2 MB/s 
[K     |███████████████████

In [4]:
#@title define source data parameters

#@markdown - these can also be loaded from gdrive, but I am lazy and `wget` does not require login

mitbih_train_url = "https://www.dropbox.com/s/2ks8s82tm7jvhse/torchfmt_mitbih_train.csv?dl=1" #@param {type:"string"}
mitbih_train_filename = "mitbih_train.csv" #@param {type:"string"}
mitbih_test_url = "https://www.dropbox.com/s/nbaxenoehvqmqnm/torchfmt_mitbih_test.csv?dl=1" #@param {type:"string"}
mitbih_test_filename = "mitbih_test.csv" #@param {type:"string"}

In [5]:
#@title autogluon parameters 

#@markdown - `max_time_fitting` is in seconds

# 10800 s = 3 hours of total fitting budget
max_time_fitting =  10800#@param {type:"integer"}
fit_preset = "best_quality" #@param ["best_quality", "high_quality"]
# NOTE(review): the recorded fit log reports a 5-class 'multiclass' problem; `roc_auc`
# and `f1` look like binary-classification metrics in AutoGluon, so confirm they are
# accepted for this label before selecting them (multiclass variants such as
# `f1_macro` may be the right choice) - TODO confirm against the AutoGluon metric docs.
metric = 'accuracy'  #@param ["roc_auc", "accuracy", "f1"]



### util functions

In [6]:
from pathlib import Path
import os
def scour_for_file(
    file_name: str,
    search_root=None,
    return_first=True,
    verbose=False,
):
    """
    scour_for_file - recursively search a directory tree for an exact file name.

    :param str file_name: file name to look for (exact match, not a pattern)
    :param search_root: directory to start from (str or Path); defaults to the
        current working directory
    :param bool return_first: stop at the first match and return only that path
    :param bool verbose: also print the matched path(s)
    :return: the resolved Path of the first match when return_first is True,
        otherwise a list with every resolved match; None when nothing is found
    """
    search_root = Path.cwd() if search_root is None else Path(search_root)
    matches = []
    for root, dirs, files in os.walk(search_root.resolve()):
        # os.walk lists entries in arbitrary filesystem order. Sorting the
        # sub-directories in place fixes the traversal order, so which match
        # counts as "first" is reproducible from run to run.
        dirs.sort()
        if file_name not in files:
            continue
        found = (Path(root) / file_name).resolve()
        if return_first:
            # honour `verbose` here too: the early return used to skip all reporting
            if verbose:
                print(f"first match for {file_name} in {search_root}:\n{found}")
            return found
        matches.append(found)

    if not matches:
        print(f"NOTE: zero matches found for {file_name} in {search_root}")
        return None

    # only reachable when return_first is False, so the full list is returned
    print(f"{len(matches)} matches found for {file_name} in {search_root}")
    if verbose:
        print(f"matched paths are as follows:\n{matches}")
    return matches


In [13]:
from google.colab import files
from pathlib import Path
import pathlib
import time
import warnings
from tqdm.auto import tqdm

def download_colab(my_files):
    """
    download_colab - helper function to download things out of colab.

    :param my_files: a single str / Path, or a list (or tuple) of them, naming
        the file(s) to hand to `files.download`. None is accepted and only
        triggers a warning.
    :return: None
    :raises AssertionError: if a non-sequence input is neither str nor Path

    For a sequence, entries that are not existing regular files are skipped
    with a warning instead of being dropped silently.
    """
    if my_files is None:
        warnings.warn(message=f"received {my_files}, nothing to DL")
        return None

    if isinstance(my_files, (list, tuple)):
        candidates = [Path(f) for f in my_files]
        filepaths = [p for p in candidates if p.is_file()]
        skipped = [p for p in candidates if not p.is_file()]
        if skipped:
            warnings.warn(
                message=f"skipping {len(skipped)} path(s) that are not existing files: {skipped}"
            )
        # iterating the tqdm wrapper closes the bar even if a download raises
        for position, filepath in enumerate(tqdm(filepaths, desc="downloading files..")):
            if position:
                time.sleep(3)  # prevent browsers auto-blocking downloads
            files.download(str(filepath))
    else:
        assert isinstance(my_files, (str, Path)), "non-list needs to be str or Path()"
        files.download(str(Path(my_files)))




### data

In [8]:

!wget $mitbih_train_url -O $mitbih_train_filename -q
!wget $mitbih_test_url -O $mitbih_test_filename -q

In [9]:
import pandas as pd
example_df = pd.read_csv(mitbih_train_filename)

# the last column holds the label; every column before it is a numerical predictor
*data_cols, _target = example_df.columns
_predictors = data_cols

print(f"the target colname is {_target} and\nthe predictor colnames 5 of {len(_predictors)} are {_predictors[:5]}")

the target colname is class_label and
the predictor colnames 5 of 187 are ['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4']


## fit model

In [10]:
import torch
from autogluon.tabular import TabularDataset, TabularPredictor
# load both splits from the CSV files fetched earlier
train_data = TabularDataset(mitbih_train_filename)
test_data = TabularDataset(mitbih_test_filename)

# per-model fit arguments: pass the number of CUDA devices torch can see
_gpu_fit_args = {'num_gpus': torch.cuda.device_count()}

# build the predictor for the label column, then fit it on the training split
# within the configured time budget and preset
_unfitted_predictor = TabularPredictor(label=_target, eval_metric=metric)
predictor = _unfitted_predictor.fit(
    train_data,
    time_limit=max_time_fitting,
    presets=fit_preset,
    ag_args_fit=_gpu_fit_args,
)


No path specified. Models will be saved in: "AutogluonModels/ag-20220327_204858/"
Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 10800s
AutoGluon will save models to "AutogluonModels/ag-20220327_204858/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    87554
Train Data Columns: 187
Label Column: class_label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	5 unique label values:  ['N', 'S', 'V', 'F', 'Q']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 5
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11927.32 MB
	Train Data (Original)  Memory Usage: 130.98 MB (1.1% of 

In [11]:
results = predictor.fit_summary(show_plot=True)


*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.987756     790.527368  8732.493713                0.007183          12.085104            3       True         20
1           LightGBM_BAG_L2   0.987505     736.025195  6830.926788                1.786115         259.455806            2       True         11
2            XGBoost_BAG_L2   0.987493     736.120318  6614.919820                1.881239          43.448839            2       True         17
3    NeuralNetFastAI_BAG_L2   0.987471     738.069256  7185.616762                3.830176         614.145781            2       True          9
4   RandomForestEntr_BAG_L2   0.987402     746.033134  6843.372016               11.794054         271.901035            2       True         13
5   RandomForestGini_BAG_L2   0.987254     746.547559  6863.764176  

In [14]:
summary_report = scour_for_file('SummaryOfModels.html')
download_colab(summary_report)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:

leaderboard = predictor.leaderboard(test_data,)

leaderboard

                      model  score_test  score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   RandomForestGini_BAG_L2    0.987347   0.987254      364.362037     746.547559  6863.764176                 1.035578               12.308479         292.293195            2       True         12
1   RandomForestEntr_BAG_L2    0.987256   0.987402      364.258731     746.033134  6843.372016                 0.932272               11.794054         271.901035            2       True         13
2       WeightedEnsemble_L3    0.987164   0.987756      389.968791     790.527368  8732.493713                 0.012632                0.007183          12.085104            3       True         20
3           LightGBM_BAG_L2    0.987073   0.987505      365.506774     736.025195  6830.926788                 2.180315                1.786115         259.455806            2       True         11
4    Neura

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestGini_BAG_L2,0.987347,0.987254,364.362037,746.547559,6863.764176,1.035578,12.308479,292.293195,2,True,12
1,RandomForestEntr_BAG_L2,0.987256,0.987402,364.258731,746.033134,6843.372016,0.932272,11.794054,271.901035,2,True,13
2,WeightedEnsemble_L3,0.987164,0.987756,389.968791,790.527368,8732.493713,0.012632,0.007183,12.085104,3,True,20
3,LightGBM_BAG_L2,0.987073,0.987505,365.506774,736.025195,6830.926788,2.180315,1.786115,259.455806,2,True,11
4,NeuralNetFastAI_BAG_L2,0.987027,0.987471,368.599604,738.069256,7185.616762,5.273145,3.830176,614.145781,2,True,9
5,ExtraTreesEntr_BAG_L2,0.98689,0.987025,364.553864,746.413669,6608.139637,1.227406,12.17459,36.668655,2,True,16
6,ExtraTreesGini_BAG_L2,0.986845,0.987094,364.638156,746.634367,6615.154789,1.311697,12.395287,43.683808,2,True,15
7,XGBoost_BAG_L2,0.986479,0.987493,365.984499,736.120318,6614.91982,2.65804,1.881239,43.448839,2,True,17
8,WeightedEnsemble_L2,0.986342,0.986705,109.982093,100.285594,4187.7309,0.005897,0.006827,9.26678,2,True,8
9,CatBoost_BAG_L2,0.986342,0.987025,363.798291,734.654867,6622.765036,0.471832,0.415787,51.294055,2,True,14


In [31]:
# Reload the saved predictor from disk. The fit log shows that AutoGluon saves each
# run to "AutogluonModels/ag-<YYYYMMDD_HHMMSS>/", so a hard-coded timestamp only
# exists for the run that created it. Pick the newest run directory instead
# (the timestamped names sort chronologically).
_ag_run_dirs = sorted(p for p in Path("AutogluonModels").glob("ag-*") if p.is_dir())
if not _ag_run_dirs:
    raise FileNotFoundError(
        "no saved AutoGluon run found under AutogluonModels/ - run the fit cell first"
    )
best_model = TabularPredictor.load(str(_ag_run_dirs[-1]))

In [None]:
# Feature importance of the reloaded predictor on the test split, with a 600 s limit.
# NOTE(review): the recorded output estimates 47064.84 s for 10 shuffle sets over
# 187 features - far beyond this limit - and the cell carries no execution count.
# Verify that it completes and returns usable values before relying on it.
feat_importance = best_model.feature_importance(
                                                test_data,
                                               time_limit=600,
                    )
feat_importance

Computing feature importance via permutation shuffling for 187 features using 1000 rows with 10 shuffle sets... Time limit: 600s...
	47064.84s	= Expected runtime (4706.48s per shuffle set)


In [None]:
type(feat_importance)

## export results

In [16]:
!pip install openpyxl -q

In [17]:
from google.colab import files
import pandas as pd
from pathlib import Path

# export targets live in the current working directory
_cwd = Path.cwd()
csv_path = _cwd.joinpath("mitbih_autogluon_results.csv")
xlsx_path = _cwd.joinpath("mitbih_autogluon_results.xlsx")

# same leaderboard frame, written once per format
leaderboard.to_csv(csv_path)
leaderboard.to_excel(xlsx_path)


In [18]:
download_colab([csv_path, xlsx_path])


downloading files..:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
def export_model_files():
    """
    export_model_files - download the saved learner / predictor pickles under
    "mitbih_"-prefixed names.

    Each pickle is located with `scour_for_file`, COPIED next to the original
    as "mitbih_<name>", and the copies are passed to `download_colab`.
    Copying (rather than renaming) leaves `learner.pkl` / `predictor.pkl` in
    place, so the saved model directory stays intact and the function can be
    run more than once.

    :return: None
    :raises FileNotFoundError: if either pickle cannot be located
    """
    import shutil  # local import: the notebook's import cells do not provide it

    exported = []
    for pkl_name in ("learner.pkl", "predictor.pkl"):
        located = scour_for_file(pkl_name, verbose=True)
        if located is None:
            # previously surfaced as an opaque TypeError from Path(None)
            raise FileNotFoundError(f"could not locate {pkl_name} under {Path.cwd()}")
        source_path = Path(located)
        prefixed_path = source_path.parent / f"mitbih_{source_path.name}"
        shutil.copy2(source_path, prefixed_path)
        exported.append(prefixed_path)

    download_colab(exported)


In [21]:
export_model_files()

downloading files..:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>