# ECBM 4040 Fall '21 Project - BIOM Group

## Initialization

In [1]:
import numpy as np
import tensorflow as tf
import pickle
from matplotlib import pyplot as plt

import shutil
import os

from model.models_cstm import get_embedding_model
from model.train_model import train_siamese_model


In [2]:
# Lookup table of supported distance metrics: each upper-case key maps to the
# lower-case identifier string of the same metric.
DISTANCE_METRICS = {
    metric.upper(): metric
    for metric in ('euclidean', 'hyperbolic', 'manhattan', 'square', 'cosine')
}

In [3]:
# Seed both NumPy and TensorFlow with the same fixed value so that repeated
# runs of the notebook draw the same random numbers.
SEED = 1
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Get Qiita Data

In [4]:
# Download the pickled Qiita dataset (qiita_numpy.pkl) into the current
# working directory; a later cell moves it from there into data/qiita/.
!wget https://www.dropbox.com/s/mv546rx259tgwaz/qiita_numpy.pkl

--2021-12-17 17:16:47--  https://www.dropbox.com/s/mv546rx259tgwaz/qiita_numpy.pkl
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/mv546rx259tgwaz/qiita_numpy.pkl [following]
--2021-12-17 17:16:48--  https://www.dropbox.com/s/raw/mv546rx259tgwaz/qiita_numpy.pkl
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf5090aa1e261eb3631a520b44f.dl.dropboxusercontent.com/cd/0/inline/BcBpTvH9uhHoRWJIt70fG1H_a0mjUZ3QTH_qu1hJUcDmCN4jJlHAAdt1_CZzD0RcU30KxhnZu9vadAXC_fP1b6wRfzS57Z5bNn-y5x00umzoCx7fN1iWfHVpoOSjBACuE-0GFmI6ePM50k8XNozWIctK/file# [following]
--2021-12-17 17:16:48--  https://ucf5090aa1e261eb3631a520b44f.dl.dropboxusercontent.com/cd/0/inline/BcBpTvH9uhHoRWJIt70fG1H_a0mjUZ3QTH_qu1hJUcDmCN4jJlHAAdt1_CZzD0RcU30KxhnZu9vadAXC_fP1b6wRfzS57Z5bNn-y5x00umzoC

In [5]:
# Move the downloaded pickle from the working directory into data/qiita/.
# `cwd` is kept as a notebook-level name because the dataset-loading cell
# builds its path from it.
cwd = os.getcwd()
qiita_dir = os.path.join(cwd, "data", "qiita")

# shutil.move does not create missing parent directories, so make sure the
# destination folder exists (a fresh checkout may not contain it).
os.makedirs(qiita_dir, exist_ok=True)

# Last expression of the cell: shutil.move returns the destination path,
# which the notebook displays as the cell output.
shutil.move(os.path.join(cwd, "qiita_numpy.pkl"), os.path.join(qiita_dir, "qiita_numpy.pkl"))

'/Users/rohithravin/Github/ECBM4040-NuroSEED-Proj/data/qiita/qiita_numpy.pkl'

## Load Qiita Dataset

In [6]:
# Load the QIITA dataset. The pickle holds two 3-tuples that are unpacked into
# the train/test/val X arrays and the matching train/test/val y arrays.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it came from the trusted project download.
# The `with` block guarantees the file handle is closed after loading.
with open(f"{cwd}/data/qiita/qiita_numpy.pkl", "rb") as qiita_file:
    ((X_train, X_test, X_val), (y_train, y_test, y_val)) = pickle.load(qiita_file)

## Train Siamese Model

In [7]:
# Build the embedding network from the project's model module; it is passed
# to train_siamese_model in the training cell.
embedding = get_embedding_model()
# Print the layer-by-layer architecture and parameter counts.
embedding.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
one_hot_encoding_layer (OneH (None, 152, 4)            0         
_________________________________________________________________
flatten (Flatten)            (None, 608)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               77952     
Total params: 77,952
Trainable params: 77,952
Non-trainable params: 0
_________________________________________________________________


2021-12-17 17:17:02.398255: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-12-17 17:17:02.398666: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Train and Test Siamese Model

# Re-pack the splits in the nested (features, labels) layout that is handed
# to train_siamese_model.
feature_splits = (X_train, X_test, X_val)
label_splits = (y_train, y_test, y_val)
data = (feature_splits, label_splits)

# Distance metric used for this run.
dist = DISTANCE_METRICS['EUCLIDEAN']

# Returns the trained model, a score, and a history object whose `.history`
# dict is plotted in the loss-visualization cell.
# NOTE(review): presumably `score` is the evaluation result on the test
# split; confirm against model/train_model.py.
model, score, history = train_siamese_model(
    data,
    embedding,
    dist,
    batch_size=256,
    epochs=5,
)

Epoch 1/5
   16/95689 [..............................] - ETA: 11:36 - loss: 152.6012

2021-12-17 17:17:29.937461: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


19679/95689 [=====>........................] - ETA: 9:17 - loss: 42.0217

In [None]:
# Report the score returned by train_siamese_model together with the distance
# metric that was used for the run.
print(f'Score for Siamese Model using {dist} distance: {score}')

## Visualize Loss 

In [None]:
# Plot training vs. validation loss per epoch from the training history.
# The title now says "loss": both curves are loss values, so the previous
# 'model accuracy' title was misleading.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='val')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()