# Load Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score,\
                            precision_score, recall_score, accuracy_score,\
                            average_precision_score, precision_recall_curve
from imblearn.over_sampling import RandomOverSampler
from src.helper_functions import load_data, get_model_perfs, init_model_perfs,\
                                 save_model, save_model_perfs, check_is_best,\
                                 read_model, evaluate_model_predictions,\
                                 update_model_perfs, check_and_save,\
                                 adjusted_classes
import pickle
import os
import csv
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from timeit import default_timer as timer
import keras
from keras.callbacks import Callback
from keras.layers import Dense, Activation, Dropout
keras.__version__

'2.2.5'

## References:
* https://hyperopt.github.io/hyperopt/
* https://towardsdatascience.com/an-introductory-example-of-bayesian-optimization-in-python-with-hyperopt-aae40fff4ff0
* https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a
* https://towardsdatascience.com/a-conceptual-explanation-of-bayesian-model-based-hyperparameter-optimization-for-machine-learning-b8172278050f

### Callback Class
No custom callback class is defined in this notebook. For an example, see
https://www.kaggle.com/inspector/keras-hyperopt-example-sketch/log
in case one is needed again.

## Continue Search? (If trials data exists)
Helpful reference: https://github.com/hyperopt/hyperopt/issues/267

In [3]:
# When True, a previously pickled Trials object and an existing tracking file
# are reused so the search resumes; when False, both are started fresh.
Continue_Existing_Trials = True

# Load Data & Balance

In [4]:
x_tr, y_tr, x_te, y_te = load_data()
target = y_tr.columns[0]

# Get previous trial data if it exists and initialize trials object
# to be able to see our results after algorithm is complete
target_path = './models/'+target.replace('.','_')+'/'
trials_file = os.path.join(target_path, 'trials.pkl')
if os.path.exists(trials_file) and Continue_Existing_Trials:
    # NOTE: unpickling can execute arbitrary code; only resume from a trials
    # file that was written by this notebook.
    with open(trials_file, "rb") as trials_in:
        trials = pickle.load(trials_in)
    # The objective pickles `trials` before its own result is recorded, so the
    # newest entry may not carry an 'iteration' key yet (and the list may be
    # empty). Resume counting from the highest iteration actually recorded.
    ITERATION = max((res.get('iteration', 0) for res in trials.results),
                    default=0)
else:
    trials = Trials()
    ITERATION = 0

# Determine rows with available data (finite target values only)
rows_tr = np.isfinite(y_tr[target]).values
rows_te = np.isfinite(y_te[target]).values
x,y = x_tr[rows_tr], y_tr[target][rows_tr]

# Address Class Imbalance
# Hold out a stratified 20% validation split first, then oversample only the
# remaining training portion so the validation data keeps its original balance.
x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, \
                                                  test_size=0.2, random_state=42)
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_sample(x_train,y_train)

# Set up Tracking File
The tracking file makes it possible to monitor progress while the optimization is still running. It is separate from the `trials` object, which keeps the detailed record of the optimization itself.

In [5]:
def set_up_tracking_file(out_file):
    """Create (or truncate) the tracking CSV and write its header row.

    Parameters
    ----------
    out_file : str
        Path of the CSV file to create. Any existing content is overwritten.

    The header columns are 'loss', 'params', 'iteration' and 'train_time'.
    """
    # newline='' is what the csv module expects for files it writes; the
    # context manager guarantees the handle is closed even if writing fails.
    with open(out_file, 'w', newline='') as of_connection:
        writer = csv.writer(of_connection)

        # Write the headers to the file
        writer.writerow(['loss', 'params', 'iteration', 'train_time'])

out_file = os.path.join(target_path,'tracking.csv')
if os.path.exists(out_file) and Continue_Existing_Trials:
    # All ready to continue: keep appending to the existing tracking file.
    print(out_file,' ready for continued tracking.')
else:
    # A fresh target may not have its model directory yet; create it so that
    # opening the tracking file for writing cannot fail on a missing folder.
    os.makedirs(target_path, exist_ok=True)
    set_up_tracking_file(out_file)

./models/NR_AhR/tracking.csv  ready for continued tracking.


# Define Optimization Space

In [6]:
# Hyperopt search space for the dense network built in `objective`.
# Every entry except 'drop_out' picks one value from a fixed list of options
# (hp.choice); 'drop_out' is drawn uniformly between 0 and 0.5 (hp.uniform).
# NOTE(review): the 'L2_reg' options contain 0.0001 twice - possibly a typo
# for 0.00001. Confirm before editing: trials restored from disk presumably
# refer to these options by position, so changing the list alters what
# earlier results mean.
# NOTE(review): for hp.choice parameters, the dict returned by fmin is
# expected to hold the index of the chosen option rather than its value -
# confirm against the hyperopt documentation before reading `best`.
space = {'layers': hp.choice('layers', range(1,5)),
         'drop_out': hp.uniform('drop_out', 0, 0.5),
         'L2_reg': hp.choice('L2_reg', [0.01, 0.001, 0.0001, 0.0001]),
         'act': hp.choice('act', ['sigmoid', 'tanh']),
         'neurons': hp.choice('neurons', [512, 1024, 2048, 4096, 8192]),
         'decay': hp.choice('decay', [0, 10**-6, 10**-5, 10**-4]),
         'learn_rate': hp.choice('learn_rate', [0.001, 0.01, 0.1, 1 ]),
         'batch_size': hp.choice('batch_size', [256, 512, 1024])
        }

# Define Objective Function

In [7]:
def _build_dnn(params):
    """Build and compile the dense binary classifier described by `params`.

    Uses the keys 'neurons', 'act', 'drop_out', 'layers', 'L2_reg',
    'learn_rate' and 'decay'. The network has `params['layers']` hidden
    Dense+Dropout pairs followed by a single sigmoid output unit.
    """
    net = keras.Sequential()
    net.add(Dense(params['neurons'], activation=params['act'],
                  input_shape=x_tr.shape[1:],
                  name='h0_'+params['act']+'_activation'))
    net.add(Dropout(rate=params['drop_out'], name='Dropout0'))
    for i in range(1, params['layers']):
        net.add(Dense(units=params['neurons'], activation=params['act'],
                      name='h'+str(i)+'_'+params['act']+'_activation',
                      kernel_regularizer=keras.regularizers.l2(params['L2_reg'])))
        net.add(Dropout(rate=params['drop_out'], name='Dropout'+str(i)))
    net.add(Dense(units=1, activation='sigmoid'))

    # The optimizer instance must be handed to compile(); passing the string
    # 'adam' would build a default Adam and silently ignore the sampled
    # learning rate and decay.
    optimizer = keras.optimizers.Adam(lr=params['learn_rate'], beta_1=0.9,
                                      beta_2=0.999, decay=params['decay'],
                                      amsgrad=False)
    net.compile(optimizer=optimizer, loss='binary_crossentropy',
                metrics=['accuracy'])
    return net


def _max_f1_threshold(precision, recall, thresholds):
    """Return the probability threshold whose (precision, recall) maximizes F1.

    `precision` and `recall` are expected to be one element longer than
    `thresholds` (as returned by precision_recall_curve), so a threshold of 1
    is appended for the final point. Points where precision and recall are
    both 0 are dropped to avoid a 0/0 division.
    """
    precision = np.asarray(precision)
    recall = np.asarray(recall)
    candidates = np.append(thresholds, 1)
    keep = ~((precision == 0) & (recall == 0))
    p, r, t = precision[keep], recall[keep], candidates[keep]
    f1 = 2*p*r/(p+r)
    # Index into the *filtered* thresholds: positions in the unfiltered array
    # no longer line up once any point has been dropped.
    return t[np.argmax(f1)]


def objective(params):
    """Hyperopt objective: train one DNN and return 1 - F1 as the loss.

    Parameters
    ----------
    params : dict
        One sample from the search space, with keys 'layers', 'drop_out',
        'L2_reg', 'act', 'neurons', 'decay', 'learn_rate' and 'batch_size'.

    Returns
    -------
    dict
        Keys 'loss', 'params', 'iteration', 'train_time' and 'status'.

    Side effects: pickles the global `trials` object, increments the global
    ITERATION counter, records/saves model performance through the project
    helpers, displays the performance table and appends one row to `out_file`.

    NOTE(review): AUC, F1 and the tuned threshold are all computed on the
    test split (x_te / y_te), so the search is optimized against test data.
    """
    global ITERATION

    # Save current state of trials object so we can recover from issues
    with open(os.path.join(target_path,'trials.pkl'), "wb") as trials_out:
        pickle.dump(trials, trials_out)

    # Keep track of evals
    ITERATION += 1

    # Build Dense Neural Network
    DNN = _build_dnn(params)

    # Train the DNN
    start = timer()
    DNN.fit(
        x_resampled, y_resampled, batch_size=params['batch_size'], epochs=200,
        validation_data=(x_val,y_val), verbose=0,
        callbacks=[
            keras.callbacks.EarlyStopping(patience=8,verbose=0,
                                          restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(factor=0.5,patience=3,verbose=0)
        ])
    train_time = timer() - start

    # Get predictions, calculate model performance and save info
    p_te = DNN.predict(x_te[rows_te])[:,0]
    y_testing = y_te[target][rows_te]
    auc_te = roc_auc_score(y_testing, p_te)
    print("%15s: %3.5f" % (target, auc_te))

    y_hat_testing = DNN.predict_classes(x_te[rows_te])
    average_precision = average_precision_score(y_testing,p_te)
    mv = evaluate_model_predictions(target,'DNN',0.5,y_testing,y_hat_testing,
                                    auc_te,average_precision)
    # check_and_save is a project helper; its return value is used below as
    # either None or something to pass on in place of the model - presumably
    # the saved model's filename; confirm against src.helper_functions.
    filename = check_and_save(target,mv,DNN,True)

    # Find max F1 varying probability threshold, calculate modified performance, save
    precision, recall, thresholds = precision_recall_curve(y_testing,p_te)
    m_thresh = _max_f1_threshold(precision, recall, thresholds)
    y_hat_testing_adj = adjusted_classes(p_te,m_thresh)
    mv = evaluate_model_predictions(target,'DNN_modT',m_thresh,y_testing,y_hat_testing_adj,
                                    auc_te,average_precision)
    if filename is None:
        check_and_save(target,mv,DNN,True)
    else:
        check_and_save(target,mv,filename,True)
    display(get_model_perfs(target))

    # Score to be used in evaluation (F1 at the tuned threshold)
    score = mv['f1']

    # Loss must be minimized
    loss = 1 - score

    # Append to the csv file and close it right away so the row is flushed to
    # disk and can be inspected while the optimization is still running.
    with open(out_file, 'a', newline='') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, train_time])

    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'train_time': train_time, 'status': STATUS_OK}

# Execution [possibly continuation] of the Optimization

In [8]:
# Optimize
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 250,
            trials= trials)
best

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
         NR.AhR: 0.85391                             
Model performance not better than that previously recorded.
Model performance not better than that previously recorded.
  0%|          | 0/197 [00:53<?, ?it/s, best loss: ?]

Unnamed: 0,model,threshold,accuracy,precision,recall,f1,auc_roc,avg_precision,confusion_matrix,model_filename
0,RF,0.5,0.911475,0.756757,0.383562,0.509091,0.905028,0.602102,"[[528, 9], [45, 28]]",RF0.joblib
1,RF_modT,0.235,0.842623,0.424837,0.890411,0.575221,0.905028,0.602102,"[[449, 88], [8, 65]]",RF0.joblib
2,DNN,0.5,0.814754,0.364865,0.739726,0.488688,0.864723,0.524576,"[[443, 94], [19, 54]]",DNN0.h5
3,DNN_modT,0.773516,0.901639,0.603175,0.520548,0.558824,0.864723,0.524576,"[[512, 25], [35, 38]]",DNN0.h5
4,DNN,0.5,0.895082,0.561644,0.561644,0.561644,0.860935,0.495738,"[[505, 32], [32, 41]]",DNN2.h5
5,DNN_modT,0.488593,0.893443,0.551282,0.589041,0.569536,0.860935,0.495738,"[[502, 35], [30, 43]]",DNN2.h5
6,DNN,0.5,0.780328,0.323699,0.767123,0.455285,0.86795,0.573394,"[[420, 117], [17, 56]]",DNN3.h5
7,DNN_modT,0.600385,0.885246,0.516854,0.630137,0.567901,0.86795,0.573394,"[[494, 43], [27, 46]]",DNN3.h5
8,DNN,0.5,0.872131,0.474747,0.643836,0.546512,0.871126,0.553833,"[[485, 52], [26, 47]]",DNN4.h5
9,DNN_modT,0.496647,0.87541,0.485714,0.69863,0.573034,0.871126,0.553833,"[[483, 54], [22, 51]]",DNN4.h5


         NR.AhR: 0.72756                                                        
Model performance not better than that previously recorded.                     
Model performance not better than that previously recorded.                     
  1%|          | 1/197 [04:16<2:53:29, 53.11s/it, best loss: 0.4117647058823529]

  'precision', 'predicted', average, warn_for)

  'precision', 'predicted', average, warn_for)



Unnamed: 0,model,threshold,accuracy,precision,recall,f1,auc_roc,avg_precision,confusion_matrix,model_filename
0,RF,0.5,0.911475,0.756757,0.383562,0.509091,0.905028,0.602102,"[[528, 9], [45, 28]]",RF0.joblib
1,RF_modT,0.235,0.842623,0.424837,0.890411,0.575221,0.905028,0.602102,"[[449, 88], [8, 65]]",RF0.joblib
2,DNN,0.5,0.814754,0.364865,0.739726,0.488688,0.864723,0.524576,"[[443, 94], [19, 54]]",DNN0.h5
3,DNN_modT,0.773516,0.901639,0.603175,0.520548,0.558824,0.864723,0.524576,"[[512, 25], [35, 38]]",DNN0.h5
4,DNN,0.5,0.895082,0.561644,0.561644,0.561644,0.860935,0.495738,"[[505, 32], [32, 41]]",DNN2.h5
5,DNN_modT,0.488593,0.893443,0.551282,0.589041,0.569536,0.860935,0.495738,"[[502, 35], [30, 43]]",DNN2.h5
6,DNN,0.5,0.780328,0.323699,0.767123,0.455285,0.86795,0.573394,"[[420, 117], [17, 56]]",DNN3.h5
7,DNN_modT,0.600385,0.885246,0.516854,0.630137,0.567901,0.86795,0.573394,"[[494, 43], [27, 46]]",DNN3.h5
8,DNN,0.5,0.872131,0.474747,0.643836,0.546512,0.871126,0.553833,"[[485, 52], [26, 47]]",DNN4.h5
9,DNN_modT,0.496647,0.87541,0.485714,0.69863,0.573034,0.871126,0.553833,"[[483, 54], [22, 51]]",DNN4.h5


         NR.AhR: 0.87777                                                        
Model performance not better than that previously recorded.                     
Model performance not better than that previously recorded.                     
  1%|          | 2/197 [04:31<5:19:30, 98.31s/it, best loss: 0.4117647058823529]

Unnamed: 0,model,threshold,accuracy,precision,recall,f1,auc_roc,avg_precision,confusion_matrix,model_filename
0,RF,0.5,0.911475,0.756757,0.383562,0.509091,0.905028,0.602102,"[[528, 9], [45, 28]]",RF0.joblib
1,RF_modT,0.235,0.842623,0.424837,0.890411,0.575221,0.905028,0.602102,"[[449, 88], [8, 65]]",RF0.joblib
2,DNN,0.5,0.814754,0.364865,0.739726,0.488688,0.864723,0.524576,"[[443, 94], [19, 54]]",DNN0.h5
3,DNN_modT,0.773516,0.901639,0.603175,0.520548,0.558824,0.864723,0.524576,"[[512, 25], [35, 38]]",DNN0.h5
4,DNN,0.5,0.895082,0.561644,0.561644,0.561644,0.860935,0.495738,"[[505, 32], [32, 41]]",DNN2.h5
5,DNN_modT,0.488593,0.893443,0.551282,0.589041,0.569536,0.860935,0.495738,"[[502, 35], [30, 43]]",DNN2.h5
6,DNN,0.5,0.780328,0.323699,0.767123,0.455285,0.86795,0.573394,"[[420, 117], [17, 56]]",DNN3.h5
7,DNN_modT,0.600385,0.885246,0.516854,0.630137,0.567901,0.86795,0.573394,"[[494, 43], [27, 46]]",DNN3.h5
8,DNN,0.5,0.872131,0.474747,0.643836,0.546512,0.871126,0.553833,"[[485, 52], [26, 47]]",DNN4.h5
9,DNN_modT,0.496647,0.87541,0.485714,0.69863,0.573034,0.871126,0.553833,"[[483, 54], [22, 51]]",DNN4.h5


  2%|▏         | 3/197 [05:25<5:51:00, 108.56s/it, best loss: 0.4117647058823529]


KeyboardInterrupt: 

# Save the final state of the trials object

In [None]:
# Persist the final Trials object so a later run can resume from it.
with open(os.path.join(target_path, 'trials.pkl'), "wb") as trials_out:
    pickle.dump(trials, trials_out)