# ECBM 4040 Fall '21 Project  - BIOM Group

## Initialization

In [None]:
import numpy as np
import tensorflow as tf
import pickle
from matplotlib import pyplot as plt

import shutil
import os

from model.models_cstm import get_embedding_model
from model.train_model import train_siamese_model

!pip3 install keras-tuner
import keras_tuner


In [None]:
# Distance metrics compared in this notebook. Each key is the upper-cased
# metric name and each value is the lower-case identifier handed to the
# training helper and to DistanceLayer.
DISTANCE_METRICS = {
    metric_name.upper(): metric_name
    for metric_name in ("euclidean", "hyperbolic", "manhattan", "square", "cosine")
}

In [None]:
# Seed NumPy's and TensorFlow's global random generators with the same value
# so that repeated runs of the notebook give reproducible results.
RANDOM_SEED = 1
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

## Get Qiita Data

In [None]:
!wget https://www.dropbox.com/s/mv546rx259tgwaz/qiita_numpy.pkl

In [None]:
# Move the downloaded pickle from the working directory into data/qiita/,
# which is where the dataset-loading cell reads it from.
cwd = os.getcwd()
# shutil.move does not create missing parent directories, so make sure the
# destination folder exists before moving the file into it.
os.makedirs(f"{cwd}/data/qiita", exist_ok=True)
shutil.move(f"{cwd}/qiita_numpy.pkl", f"{cwd}/data/qiita/qiita_numpy.pkl")

## Load Qiita Dataset

In [None]:
# Load QIITA dataset. The pickle holds two 3-tuples, unpacked here as
# (X_train, X_test, X_val) and (y_train, y_test, y_val).
# NOTE(review): pickle.load can execute arbitrary code while unpickling and
# this file is downloaded from an external URL -- only load it if the source
# is trusted.
# The `with` block closes the file handle as soon as loading finishes.
with open(f"{cwd}/data/qiita/qiita_numpy.pkl", "rb") as qiita_file:
    ((X_train, X_test, X_val), (y_train, y_test, y_val)) = pickle.load(qiita_file)

## Distance Function Hyperparam Tuning 

In [None]:
# Build the embedding model with the project's get_embedding_model() helper
# and call its summary() method to inspect the architecture.
embedding = get_embedding_model()
embedding.summary()

In [None]:
# Train and test a Siamese model once per distance metric and collect, for
# every metric, the score and the training history returned by
# train_siamese_model.

data = ((X_train, X_test, X_val), (y_train, y_test, y_val))

dist_func_tunning = {}
for key, dist in DISTANCE_METRICS.items():
    # Give every metric its own freshly built embedding model. Reusing one
    # shared instance would (assuming train_siamese_model trains the model it
    # is given in place -- TODO confirm) let each metric start from weights
    # already fitted under the previous metrics, making the scores incomparable.
    metric_embedding = get_embedding_model()
    _, score, history = train_siamese_model(data, metric_embedding, dist, batch_size=256, epochs=1)
    dist_func_tunning[key] = {'score': score, 'history': history.history}

In [None]:
# Persist the per-metric scores and training histories as a pickle.
# Make sure the output folder exists, and write inside a `with` block so the
# file is flushed and closed once the dump completes (the handle was
# previously left open, which can leave the pickle incomplete on disk).
os.makedirs("results/dense", exist_ok=True)
with open("results/dense/dist_func_tunning.pkl", "wb") as file_to_write:
    pickle.dump(dist_func_tunning, file_to_write)

## Visualize Distance Function Tuning Results

In [None]:
# TODO 

# Hyperparameter Tuning with Keras Tuner RandomSearch

In [None]:
from model.layer import DistanceLayer
from model.models_cstm import SiameseModel
from model.generator import SequenceDistDataGenerator

## RandomSearch Hyperparameter Tuning (w/ Hyperbolic)

In [None]:
def model_builder(hp):
    """Build and compile a Siamese model that uses the hyperbolic distance.

    Two inputs of shape (152,) are passed through one shared embedding
    model, and a DistanceLayer combines the two embeddings into the network
    output.

    Args:
        hp: hyperparameter container supplied by the tuner. It is asked for a
            'learning_rate' choice among 1e-2, 1e-3, 1e-4 and 1e-5.

    Returns:
        The SiameseModel wrapping the two-input network, compiled with an
        Adam optimizer at the chosen learning rate.
    """
    first_input = tf.keras.layers.Input(name="sequence1", shape=(152,))
    second_input = tf.keras.layers.Input(name="sequence2", shape=(152,))

    # A single embedding model serves both branches, so they share weights.
    shared_embedding = get_embedding_model()

    distance_layer = DistanceLayer(metric=DISTANCE_METRICS['HYPERBOLIC'], dynamic=True)
    distance_output = distance_layer(
        shared_embedding(first_input),
        shared_embedding(second_input),
    )

    network = tf.keras.models.Model(
        inputs=[first_input, second_input],
        outputs=distance_output,
    )

    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])
    adam = tf.keras.optimizers.Adam(learning_rate)

    # SiameseModel is the project's wrapper class from model.models_cstm.
    siamese = SiameseModel(network)
    siamese.compile(optimizer=adam)
    return siamese

class MyTuner(keras_tuner.tuners.RandomSearch):
    """RandomSearch tuner that also samples batch size and epoch count per trial."""

    def run_trial(self, trial, *args, **kwargs):
        """Run one trial with a tuner-chosen `batch_size` and `epochs`.

        Overriding `run_trial` is how extra hyperparameters for the training
        call are added: both values are sampled from the trial's
        hyperparameters and injected into the keyword arguments forwarded to
        the parent implementation.

        Args:
            trial: the trial object handed over by the tuner.
            *args: positional arguments forwarded unchanged to the parent.
            **kwargs: keyword arguments forwarded to the parent, with
                'batch_size' (32..512, step 32) and 'epochs' (5..30, step 2)
                set here.

        Returns:
            Whatever the parent `run_trial` returns. Recent KerasTuner
            releases use this return value to record the trial's objective,
            so it must be passed back rather than dropped.
        """
        kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', 32, 512, step=32)
        kwargs['epochs'] = trial.hyperparameters.Int('epochs', 5, 30, step=2)
        # NOTE(review): Keras `fit` rejects an explicit `batch_size` when the
        # training data is a generator / `Sequence`. If the data passed to
        # `search` is one, the sampled batch size has to be given to the
        # generator instead -- confirm against SequenceDistDataGenerator.
        return super().run_trial(trial, *args, **kwargs)

# Random-search tuner over model_builder with 'val_loss' as the objective.
# An explicit project_name keeps this search's trial state in its own folder.
# With the default project name, every tuner created in this notebook shares
# one folder and a later tuner reloads the earlier tuner's finished trials
# instead of running its own search.
tuner = MyTuner(model_builder, objective='val_loss', project_name='siamese_hyperbolic')

training_generator = SequenceDistDataGenerator(X_train, y_train)
validation_generator = SequenceDistDataGenerator(X_val, y_val)

# Don't pass epochs or batch_size here, let the Tuner tune them.
tuner.search(training_generator, validation_data=validation_generator)

In [None]:
# Get the optimal hyperparameters: request a single best set from the tuner
# and keep the first (only) entry of the returned list. The bare name on the
# last line shows the object as the notebook cell's output.
best_hps_HYPERBOLIC=tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps_HYPERBOLIC

## RandomSearch Hyperparameter Tuning (w/ Euclidean)

In [None]:
def model_builder(hp):
    """Build and compile a Siamese model that uses the Euclidean distance.

    Two inputs of shape (152,) are passed through one shared embedding
    model, and a DistanceLayer combines the two embeddings into the network
    output.

    Args:
        hp: hyperparameter container supplied by the tuner. It is asked for a
            'learning_rate' choice among 1e-2, 1e-3, 1e-4 and 1e-5.

    Returns:
        The SiameseModel wrapping the two-input network, compiled with an
        Adam optimizer at the chosen learning rate.
    """
    first_input = tf.keras.layers.Input(name="sequence1", shape=(152,))
    second_input = tf.keras.layers.Input(name="sequence2", shape=(152,))

    # A single embedding model serves both branches, so they share weights.
    shared_embedding = get_embedding_model()

    distance_layer = DistanceLayer(metric=DISTANCE_METRICS['EUCLIDEAN'], dynamic=True)
    distance_output = distance_layer(
        shared_embedding(first_input),
        shared_embedding(second_input),
    )

    network = tf.keras.models.Model(
        inputs=[first_input, second_input],
        outputs=distance_output,
    )

    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])
    adam = tf.keras.optimizers.Adam(learning_rate)

    # SiameseModel is the project's wrapper class from model.models_cstm.
    siamese = SiameseModel(network)
    siamese.compile(optimizer=adam)
    return siamese

class MyTuner(keras_tuner.tuners.RandomSearch):
    """RandomSearch tuner that also samples batch size and epoch count per trial."""

    def run_trial(self, trial, *args, **kwargs):
        """Run one trial with a tuner-chosen `batch_size` and `epochs`.

        Overriding `run_trial` is how extra hyperparameters for the training
        call are added: both values are sampled from the trial's
        hyperparameters and injected into the keyword arguments forwarded to
        the parent implementation.

        Args:
            trial: the trial object handed over by the tuner.
            *args: positional arguments forwarded unchanged to the parent.
            **kwargs: keyword arguments forwarded to the parent, with
                'batch_size' (32..512, step 32) and 'epochs' (5..30, step 2)
                set here.

        Returns:
            Whatever the parent `run_trial` returns. Recent KerasTuner
            releases use this return value to record the trial's objective,
            so it must be passed back rather than dropped.
        """
        kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', 32, 512, step=32)
        kwargs['epochs'] = trial.hyperparameters.Int('epochs', 5, 30, step=2)
        # NOTE(review): Keras `fit` rejects an explicit `batch_size` when the
        # training data is a generator / `Sequence`. If the data passed to
        # `search` is one, the sampled batch size has to be given to the
        # generator instead -- confirm against SequenceDistDataGenerator.
        return super().run_trial(trial, *args, **kwargs)

# Random-search tuner over model_builder with 'val_loss' as the objective.
# An explicit project_name keeps this search's trial state in its own folder.
# With the default project name, this tuner would reload the finished trials
# of any earlier tuner in the notebook and report those results instead of
# running a search for this distance metric.
tuner = MyTuner(model_builder, objective='val_loss', project_name='siamese_euclidean')

training_generator = SequenceDistDataGenerator(X_train, y_train)
validation_generator = SequenceDistDataGenerator(X_val, y_val)

# Don't pass epochs or batch_size here, let the Tuner tune them.
tuner.search(training_generator, validation_data=validation_generator)

In [None]:
# Get the optimal hyperparameters: request a single best set from the tuner
# and keep the first (only) entry of the returned list. The bare name on the
# last line shows the object as the notebook cell's output.
best_hps_EUCLIDEAN=tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps_EUCLIDEAN