# Embedding distance thresholds

The idea here is to train the siamese network on a series of datasets, each containing all but one tortoise, and then see how far the embedding of the excluded tortoise lies from the embeddings of the images the network was trained on. This should give an idea of what the distance threshold should be for deciding that a tortoise is new, as opposed to one we have encountered before.

In [None]:
# Leave-one-out sweep (outline only: every step below is still a placeholder).
turtles = [None]  # single placeholder entry; presumably replaced by real tortoise ids later
for turtle in turtles:
    # 1. build a dataset that omits `turtle`
    # 2. train the siamese network to embed and tell apart the remaining turtles
    # 3. embed the held-out turtle with the trained network
    # 4. record the distance from that embedding to every other embedding
    ...
# Finally: plot a histogram of the embeddings obtained for each turtle while it
# was excluded from the training dataset.

## Process for one dataset

create model (from base model, MobileNetV2)
generate dataset
configure learning rate, optimizer, loss function
compile model
configure checkpoints, early stopping, patience
fit model and save history
use model to predict dataset
calculate embedding distance on omitted class
save embedding distance

TODO: write a script that performs the steps above, then save it as a module.

In [20]:
from exclude_tortoise import *
from model.siamese.model_generator import create_model, base_models
import tensorflow as tf
import tensorflow_addons as tfa
from tqdm.keras import TqdmCallback
from data.data_generator import DataGenerator
from model.siamese.config import cfg
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Configuration for a single leave-one-out run.
# Reminder: delete the 'turtles_except_0' folder before testing.
ALL_IMAGES = 'C:/Users/robir/OneDrive/Documents/GitHub/csci-e-599/siamese-animal-tracking/data/filter_aug'
TURTLE_NUMBER = 0  # tortoise passed to exclude_tortoise; valid values are 0 through 26

# The value returned by exclude_tortoise is used as the dataset root; the
# train and test data are read from its "train" and "test" subfolders.
dataset_folder = exclude_tortoise(ALL_IMAGES, TURTLE_NUMBER)
train_data_folder = f"{dataset_folder}/train"
test_data_folder = f"{dataset_folder}/test"

TRAINABLE = True  # forwarded to create_model(trainable=...)

# Use the first key of base_models (listed options: MobileNetV2, ResNet101V2, EfficientNetB5).
base_model = list(base_models)[0]

datatype = 'train'  # inserted into the names of the exported .tsv files
WEIGHTS_DIR = "model/siamese/weights"  # root folder for checkpoint files

In [5]:
# Build the model on the selected backbone via the project's create_model factory.
model = create_model(
    base_model=base_model,
    trainable=TRAINABLE,
)

In [11]:
# Render an architecture diagram. An ImportError raised by the plotting call
# is reported and the cell carries on.
try:
    tf.keras.utils.plot_model(
        model,
        to_file=f"assets/{base_model}_model_fig.png",
        expand_nested=True,
        show_shapes=True,
    )
except ImportError as err:
    print(f"Failed to plot keras model: {err}")

# Data source for training: png/jpg files located under the train folder.
ds_generator = DataGenerator(
    folder_path=train_data_folder,
    file_ext=["png", "jpg"],
    step_size=1,
    exclude_aug=True,
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
Found 92 files for 23 unique classes


In [14]:
# Optimizer and loss for triplet training.
# The learning rate is read from the config and handed to Adam unchanged, so
# the value embedded in the checkpoint file names is the one actually used for
# training (Adam was previously hard-coded to 1e-4 regardless of the config).
learning_rate = cfg.TRAIN.LEARNING_RATE
# DLiske: the original code used the `lr` keyword instead of `learning_rate`.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# DLiske comment only: the triplets are generated by TF.
loss_func = tfa.losses.TripletSemiHardLoss()

model.compile(loss=loss_func, optimizer=optimizer, metrics=[])

In [16]:
# Checkpointing: write weights only, and only when the training loss improves.
# The file name records the epoch, the value of learning_rate and the loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    WEIGHTS_DIR + "/" + base_model + "/siam-{epoch}-" + str(learning_rate) + "-" + "_{loss:.4f}.h5",
    mode="min",
    monitor="loss",
    save_weights_only=True,
    save_best_only=True,
    verbose=0,
)
# Early stopping on the training loss with patience cfg.TRAIN.PATIENCE; the
# best weights seen are restored when training stops.
stop = tf.keras.callbacks.EarlyStopping(
    monitor="loss",
    mode="min",
    patience=cfg.TRAIN.PATIENCE,
    restore_best_weights=True,
)

In [24]:
# Train the model, logging to TensorBoard in a run directory named after the
# current timestamp.
logdir = f"logs/fit/{datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

history = model.fit(
    ds_generator,
    callbacks=[tensorboard_callback, checkpoint, TqdmCallback(verbose=0), stop],
    epochs=75,  # fixed value; cfg.TRAIN.EPOCHS is not used here
    verbose=1,
)

0epoch [00:00, ?epoch/s]

ImportError: Could not import PIL.Image. The use of `load_img` requires PIL.

In [None]:
# Plot the training loss recorded in `history` (only the training loss is
# drawn, so the figure has no legend).
plt.plot(history.history['loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.show()

In [None]:
# Run the trained model over the generator's dataset; the outputs are treated
# as embeddings by the cells that follow.
dataset = ds_generator.get_dataset()
results = model.predict(x=dataset)

In [None]:
# Export the model outputs and their labels as two TSV files: one output
# vector per row in "vecs-...", one label per row in "meta-...".
# The rows are intended to line up, which assumes the dataset yields samples in
# the same order on every pass (TODO confirm).
np.savetxt(f"vecs-{datatype}-{base_model}.tsv", results, delimiter="\t")
# Built-in open (the same function as io.open) inside a with-block, so the
# metadata file is closed even if iterating the dataset fails part-way.
# NOTE(review): `tfds` is not imported in the visible cells; confirm it is in scope.
with open(f"meta-{datatype}-{base_model}.tsv", "w") as out_m:
    for _images, batch_labels in tfds.as_numpy(dataset):
        out_m.writelines(str(label) + "\n" for label in batch_labels)

In [None]:
# merge all embeddings per class
per_class = {}
idx = 0
for img, labels in tfds.as_numpy(dataset):
    for class_id in labels:
        if class_id not in per_class:
            per_class[class_id] = []
        per_class[class_id].append(results[idx])
        idx += 1

mean_values = None
labels = None
# calculate average value for each class
for class_id, values in per_class.items():
    #print("CLASS_ID", class_id)#, values)
    matrix = np.array(values)
    mean_val = np.mean(matrix, axis=0)
    if mean_values is None:
        mean_values = np.array([mean_val])
    else:
        mean_values = np.concatenate((mean_values, np.array([mean_val])), axis=0)
    if labels is None:
        labels = np.array([class_id], dtype='U20')
    else:
        labels = np.concatenate((labels, [class_id]), axis=0, dtype='U20')

np.save('/names.npy', labels)

In [None]:
datatype = 'train'
# DLiske: guard added because the averaged embeddings presumably should not be
# overwritten when this notebook is run on the test data.
if datatype == 'train':
    # Persist the per-class mean embeddings and their labels, to be used for
    # visualization and further processing.
    np.savetxt(fname=f"vecs-conc-{base_model}.tsv", X=mean_values, delimiter="\t")
    # D LISKE: labels are written with fmt="%s" (an earlier variant used fmt="%i").
    np.savetxt(fname=f"meta-conc-{base_model}.tsv", X=labels, delimiter="\t", fmt="%s")

In [None]:
# Notebook display cell: show the current value of `labels`.
labels

In [None]:
# NOTE(review): `names` is not assigned in any visible cell; presumably the
# array saved to '/names.npy' was meant. Confirm it is defined before running.
names