In [1]:
import os, pickle, datetime, functools
import pathos.multiprocessing as mp
import multiprocessing.dummy as mpd
from multiprocessing import TimeoutError
import sklearn.metrics as skm
import numpy as np
import chocolate as choco

In [2]:
from constants import PROCESSED_PATH, RAW_PATH, DATA_PATH
from massageData import runPipeline, readData
from sklearn.preprocessing import normalize, LabelBinarizer

In [3]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [4]:
def prepareData(datafn, ids_fn, one_hot=False):
    """Read the raw table, split it, and return train/test feature and label arrays.

    Args:
        datafn: Path handed to ``readData`` to load the raw table.
        ids_fn: Path handed to ``runPipeline`` together with the feature columns.
        one_hot: When true, labels are binarized with a ``LabelBinarizer``
            fitted on the training labels.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``. Features are every column
        but the last two of the trimmed split; labels are the last column. The
        second-to-last column is used for neither.
    """
    df = readData(datafn)
    # Move the column at position 4 up to position 2; the rest keep their order.
    column_order = [0, 1, 4, 2, 3] + list(range(5, len(df.columns)))
    df = df[df.columns[column_order]]

    featuresCols = df.columns[3:-2]
    split_df = runPipeline(df, ids_fn, featuresCols)

    # Columns removed from both splits before building the arrays.
    drop_cols = ['SUBJECT_ID', 'HADM_ID', 'ETHNICITY', 'TSTAGE', 'P TSTAGE', 'P STAGE']

    def _features_and_labels(frame):
        """Return (column-normalized features, labels) for one split."""
        values = frame.drop(drop_cols, axis=1).values
        # NOTE(review): each split is normalized with its own column norms, so
        # train and test features are not on a shared scale - confirm intended.
        return normalize(values[:, :-2], axis=0), values[:, -1]

    x_train, y_train = _features_and_labels(split_df['devel'])
    x_test, y_test = _features_and_labels(split_df['test'])

    if one_hot:
        ohe = LabelBinarizer()
        ohe.fit(y_train.reshape(-1, 1))
        y_train = ohe.transform(y_train.reshape(-1, 1))
        y_test = ohe.transform(y_test.reshape(-1, 1))

    return x_train, y_train, x_test, y_test

In [5]:
def f1_score_model(trn_x, trn_y, tst_x, tst_y, model, **params):
    """Fit one candidate model and return its negated macro-averaged F1 score.

    Args:
        trn_x, trn_y: Data the estimator is fitted on.
        tst_x, tst_y: Data the fitted estimator is scored on.
        model: Key into the module-level ``models`` registry.
        **params: Keyword arguments forwarded to the estimator constructor.

    Returns:
        ``-f1_score(tst_y, predictions, average='macro')``; the sign is flipped,
        presumably so that a loss-minimizing search maximizes F1.
    """
    estimator_cls = models[model]
    estimator = estimator_cls(**params)
    estimator.fit(trn_x, trn_y)
    predictions = estimator.predict(tst_x)
    macro_f1 = skm.f1_score(tst_y, predictions, average='macro')
    return -macro_f1

In [6]:
def load_or_gen_data(datafn, ids_fn):
    """Return the prepared dataset for ``datafn``, using a pickle cache beside it.

    The cache lives next to ``datafn`` with the final extension replaced by
    ``.pickle``. When it exists it is loaded; otherwise ``prepareData`` is run
    and its result is written to the cache.

    Args:
        datafn: Path to the raw data file.
        ids_fn: Path forwarded to ``prepareData``.

    Returns:
        Whatever ``prepareData(datafn, ids_fn)`` returned when the cache was built.
    """
    # splitext strips only the last extension, so "a.b.csv" -> "a.b.pickle".
    cache_path = os.path.splitext(datafn)[0] + '.pickle'

    if os.path.isfile(cache_path):
        # NOTE: pickle can execute arbitrary code; only load caches this
        # notebook produced itself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    # Build the data BEFORE opening the cache for writing: if preparation
    # fails, no empty cache file is left behind to break the next load.
    d = prepareData(datafn, ids_fn)
    with open(cache_path, 'wb') as f:
        pickle.dump(d, f)
    return d

In [7]:
# Input locations: the data file under DATA_PATH/hour and the ids file under
# RAW_PATH; both are handed to load_or_gen_data.
datafn = os.path.join(DATA_PATH, 'hour', 'HOUR_00003.csv')
ids_fn = os.path.join(RAW_PATH, 'd_ids_split.pickle')

In [8]:
# Load the prepared arrays (from the pickle cache when one exists). These names
# are module-level globals that the trial function built by getProcFunc reads.
trn_x, trn_y, tst_x, tst_y = load_or_gen_data(datafn, ids_fn)

In [9]:
# Conditional search space: one sub-space per model. The 'model' value is a key
# into the `models` registry; every other entry is passed to that estimator's
# constructor as a keyword argument.
# NOTE(review): a results database created with a different space definition may
# be rejected by the search library - use a fresh database if that happens.
space = [
    {'model': 'SVC',
        "gamma": 'auto',
        "C": choco.log(-3, 3, 10),
        # 'precomputed' is deliberately excluded: it requires X to be a square
        # kernel matrix, so fitting on the raw feature matrix fails.
        "kernel": choco.choice(['linear', 'poly', 'rbf', 'sigmoid']),
        "tol": choco.log(-5, -2, 10),},
    {'model': 'XGBClassifier',
        "learning_rate"   : choco.uniform(0.001, 0.1),
        "max_depth"       : choco.quantized_uniform(2, 16, 2),
        "min_child_weight": choco.quantized_uniform(2, 10, 2),
        "subsample"       : choco.quantized_uniform(0.7, 1.05, 0.05),
        "n_estimators"    : choco.quantized_uniform(25, 525, 25),},
    {'model': 'RandomForestClassifier',
        "max_depth"       : choco.quantized_uniform(2, 10, 2),
        "min_samples_leaf": choco.quantized_uniform(2, 10, 2),
        "n_estimators"    : choco.quantized_uniform(25, 525, 25),},
    {'model': 'GaussianNB',
        "var_smoothing"   : choco.log(-12, -6, 10)},
    {'model': 'KNeighborsClassifier',
        "n_neighbors"     : choco.quantized_uniform(1, 10, 1),
        "weights"         : choco.choice(['uniform', 'distance']),
        "leaf_size"       : choco.quantized_uniform(15, 315, 20),
        "p"               : choco.choice([1,2,3]),},
    # MLPClassifier and GaussianProcessClassifier are not searched; the latter
    # was giving an out of memory error.
]

In [10]:
# Registry mapping each 'model' value used in the search space to the estimator
# class it names; keys are the class names themselves.
models = {
    estimator_cls.__name__: estimator_cls
    for estimator_cls in (
        SVC,
        XGBClassifier,
        RandomForestClassifier,
        GaussianNB,
        KNeighborsClassifier,
    )
}

In [11]:
# Identifier embedded in the results database / CSV file names.
# With FRESH_DB_PER_RUN = False the id is fixed at 1, so every execution reuses
# the same files; set it to True to get a new timestamped id per execution.
FRESH_DB_PER_RUN = False
dbid = datetime.datetime.now().strftime('%m%d%y%H%M%S') if FRESH_DB_PER_RUN else 1

In [12]:
# The database lives under ./hpo; create the directory up front so a fresh
# checkout does not fail when the SQLite file is opened.
os.makedirs("hpo", exist_ok=True)
conn = choco.SQLiteConnection(url="sqlite:///hpo/hpo_%s.db" % dbid)
# Random sampler over the search space, recording its trials through `conn`.
sampler = choco.Random(conn, space)

In [13]:
def timeout(seconds):
    """Build a decorator that stops waiting for the wrapped call after ``seconds``.

    The wrapped function runs on a single-thread pool from
    ``multiprocessing.dummy``. If it finishes in time its return value (or
    exception) is passed through. On timeout a message is printed and ``None``
    is returned; the worker thread is not forcibly killed and may keep running
    in the background.

    Args:
        seconds: Maximum time to wait for the result.

    Returns:
        A decorator preserving the wrapped function's metadata.
    """
    def decorator_timeout(func):
        @functools.wraps(func)
        def wrapper_timeout(*args, **kwargs):
            # The context manager terminates the pool on every exit path, so a
            # pool is not leaked for each successful call.
            with mpd.Pool(1) as pool:
                # Forward keyword arguments too; they were previously dropped.
                pending = pool.apply_async(func, args=args, kwds=kwargs)
                try:
                    # Wait at most `seconds` for func to complete.
                    return pending.get(seconds)
                except TimeoutError:
                    # Best effort: report and fall through with None.
                    print("Aborting due to timeout")
                    return None
        return wrapper_timeout
    return decorator_timeout

In [14]:
def getProcFunc(conn, sampler):
    """Build the per-trial worker used with ``Pool.map``.

    Args:
        conn: Accepted for interface symmetry; not used by the worker.
        sampler: Object providing ``next()`` -> ``(token, params)`` and
            ``update(token, loss)``.

    Returns:
        A function ``func(i)`` that draws one parameter set, scores it with
        ``f1_score_model`` on the module-level train/test arrays, reports the
        loss back to ``sampler``, and prints START/DONE progress lines.
    """
    # No timeout is applied: a slow trial occupies its worker until it finishes.
    def func(i):
        token, params = sampler.next()
        model_name = params['model']
        print('START % 4d %s' % (i, model_name))
        sampler.update(
            token,
            f1_score_model(trn_x, trn_y, tst_x, tst_y, **params),
        )
        print('DONE  % 4d %s' % (i, model_name))
    return func

In [15]:
# Number of trials handed to Pool.map, and number of worker processes in the pool.
N_RUNS, N_PROC = 16, 8

In [16]:
# Random-search phase: run N_RUNS trials across N_PROC worker processes.
# Each trial draws its parameters from `sampler` and reports its loss back to it.
f = getProcFunc(conn, sampler)
with mp.Pool(processes=N_PROC) as pool:
    # The worker returns nothing; results are recorded through the sampler.
    pool.map(f, range(N_RUNS))

START    0 XGBClassifier
START    6 KNeighborsClassifier
START    1 RandomForestClassifier
START    2 XGBClassifier
START    3 GaussianNB


  'precision', 'predicted', average, warn_for)


START    4 SVC
START    7 SVC
START    5 SVC
DONE     3 GaussianNB
START    8 SVC


  'precision', 'predicted', average, warn_for)


DONE     1 RandomForestClassifier
START    9 GaussianNB


  'precision', 'predicted', average, warn_for)


DONE     9 GaussianNB
START   10 GaussianNB


  'precision', 'predicted', average, warn_for)


DONE    10 GaussianNB
START   11 GaussianNB


  'precision', 'predicted', average, warn_for)


DONE    11 GaussianNB
START   12 XGBClassifier
DONE     6 KNeighborsClassifier
START   13 RandomForestClassifier


  'precision', 'predicted', average, warn_for)


DONE    13 RandomForestClassifier
START   14 GaussianNB


  'precision', 'predicted', average, warn_for)


DONE    14 GaussianNB
START   15 KNeighborsClassifier


  'precision', 'predicted', average, warn_for)


DONE     5 SVC


  'precision', 'predicted', average, warn_for)


DONE     7 SVC


  'precision', 'predicted', average, warn_for)


DONE     4 SVC


  'precision', 'predicted', average, warn_for)


DONE     8 SVC
DONE    15 KNeighborsClassifier
DONE     0 XGBClassifier
DONE     2 XGBClassifier
DONE    12 XGBClassifier


In [17]:
# Bayesian-search phase: 32 more trials on the same connection and space.
N_RUNS = 32
searcher = choco.Bayes(conn, space)
# Pass the Bayes searcher to the worker factory; passing `sampler` here would
# silently repeat the random search and leave `searcher` unused.
f = getProcFunc(conn, searcher)
with mp.Pool(processes=N_PROC) as pool:
    pool.map(f, range(N_RUNS))

START    0 XGBClassifier
START    5 SVC
START    2 XGBClassifier
START    3 SVC
START    4 XGBClassifier
START    6 KNeighborsClassifier
START    7 RandomForestClassifier
START    1 SVC


  'precision', 'predicted', average, warn_for)


DONE     7 RandomForestClassifier
START    8 RandomForestClassifier


  'precision', 'predicted', average, warn_for)


DONE     8 RandomForestClassifier
START    9 XGBClassifier
DONE     6 KNeighborsClassifier
START   10 XGBClassifier


  'precision', 'predicted', average, warn_for)


DONE     5 SVC
START   11 SVC


  'precision', 'predicted', average, warn_for)


DONE     3 SVC
START   12 XGBClassifier


  'precision', 'predicted', average, warn_for)


DONE    12 XGBClassifier
START   13 RandomForestClassifier


  'precision', 'predicted', average, warn_for)


DONE    13 RandomForestClassifier
START   14 XGBClassifier


  'precision', 'predicted', average, warn_for)


DONE     9 XGBClassifier
START   15 RandomForestClassifier


  'precision', 'predicted', average, warn_for)


DONE    11 SVC


  'precision', 'predicted', average, warn_for)


START   16 GaussianNB


  'precision', 'predicted', average, warn_for)


DONE    15 RandomForestClassifier
DONE    16 GaussianNB
START   17 SVC
START   18 KNeighborsClassifier
DONE     2 XGBClassifier
START   19 KNeighborsClassifier


  'precision', 'predicted', average, warn_for)


DONE    17 SVC
START   20 GaussianNB


  'precision', 'predicted', average, warn_for)


DONE    20 GaussianNB
START   21 XGBClassifier
DONE    19 KNeighborsClassifier
START   22 KNeighborsClassifier
DONE     0 XGBClassifier
START   23 KNeighborsClassifier
DONE     4 XGBClassifier
START   24 RandomForestClassifier
DONE    22 KNeighborsClassifier
START   25 RandomForestClassifier
DONE    23 KNeighborsClassifier
START   26 KNeighborsClassifier


  'precision', 'predicted', average, warn_for)


DONE    24 RandomForestClassifier
START   27 KNeighborsClassifier


  'precision', 'predicted', average, warn_for)


DONE    25 RandomForestClassifier
START   28 SVC
DONE    26 KNeighborsClassifier
START   29 RandomForestClassifier


  'precision', 'predicted', average, warn_for)


DONE    29 RandomForestClassifier
START   30 RandomForestClassifier


  'precision', 'predicted', average, warn_for)


DONE    30 RandomForestClassifier
START   31 XGBClassifier


  'precision', 'predicted', average, warn_for)


DONE    28 SVC


  'precision', 'predicted', average, warn_for)


DONE    31 XGBClassifier
DONE    21 XGBClassifier
DONE    10 XGBClassifier
DONE    14 XGBClassifier
DONE    18 KNeighborsClassifier
DONE    27 KNeighborsClassifier


Process ForkPoolWorker-16:
Process ForkPoolWorker-13:
Process ForkPoolWorker-15:
Process ForkPoolWorker-14:
Process ForkPoolWorker-11:
Process ForkPoolWorker-12:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ram/anaconda3/envs/tfenv/lib/python3.6/site-packages/multiprocess/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ram/anaconda3/envs/tfenv/lib/python3.6/site-packages/multiprocess/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ram/anaconda3/envs/tfenv/lib/python3.6/site-packages/multiprocess/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ram/anaconda3/envs/tfenv/lib/python3.6/site-packages/multiprocess/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ram/anaconda3/envs/tfenv/lib/python3.

KeyboardInterrupt: 

In [None]:
# Pull the trials recorded through `conn` into a DataFrame for inspection/export.
df = conn.results_as_dataframe()

In [None]:
# Export the results next to the database, using the same id in the file name.
df.to_csv("hpo/hpo_{}.csv".format(dbid))