In [39]:
# This notebook demonstrates how to concatenate two embeddings (on sequence level).
# We concatenate one_hot_encoding and word2vec embeddings.
# The sequences have ids [Seq1, Seq2, Seq3, Seq4], like the example for sequence_to_class

import h5py

In [40]:
# Load existing embedding files (opened read-only; the handles are reused by the following cells)
embeddings_one_hot_path = "reduced_embeddings_file_one_hot_encoding.h5"
embeddings_word2vec_path = "reduced_embeddings_file_word2vec.h5"

# Raw data chunk cache settings shared by both files: 4000 MiB of cache and 10^7 slots.
# rdcc_nslots is a slot count, so it is passed as an int instead of the float 1e7,
# which would rely on an implicit float -> int coercion inside h5py.
chunk_cache_settings = dict(rdcc_nbytes=1024 ** 2 * 4000, rdcc_nslots=10 ** 7)

embeddings_one_hot_file = h5py.File(embeddings_one_hot_path, 'r', **chunk_cache_settings)
embeddings_word2vec_file = h5py.File(embeddings_word2vec_path, 'r', **chunk_cache_settings)

In [41]:
# Merge embedding files:
# 1. Create a new file
output_embeddings_path = "reduced_one_hot_and_word2vec_embeddings.h5"
# 2. Get embedding dimensions to merge (read from entry "0"; all entries are expected to share them)
one_hot_dim = embeddings_one_hot_file["0"].shape[0]
word2vec_dim = embeddings_word2vec_file["0"].shape[0]
combined_dim = one_hot_dim + word2vec_dim

# Fail before anything is written if a sequence has no word2vec counterpart,
# instead of raising half-way through and leaving an incomplete output file behind.
missing_indices = [idx for idx in embeddings_one_hot_file.keys() if idx not in embeddings_word2vec_file]
if missing_indices:
    raise KeyError("No word2vec embedding found for indices: {}".format(missing_indices))

with h5py.File(output_embeddings_path, "w") as output_embeddings_file:
    for idx, one_hot_embedding in embeddings_one_hot_file.items():
        # 3. Save one_hot_encoding values in new file with extended shape.
        #    (combined_dim,) is a real one-element tuple; the parenthesised sum alone would just be an int.
        merged_embedding = output_embeddings_file.create_dataset(idx, data=one_hot_embedding[:],
                                                                 compression="gzip", chunks=True,
                                                                 maxshape=(combined_dim,))
        merged_embedding.attrs["original_id"] = one_hot_embedding.attrs["original_id"]

        # 4. Append word2vec embeddings.
        #    Writing to [one_hot_dim:] (rather than [-size:]) stays correct for an empty appendix
        #    and raises on a length mismatch instead of silently misplacing the values.
        merged_embedding.resize(combined_dim, axis=0)
        merged_embedding[one_hot_dim:] = embeddings_word2vec_file[idx][:]



In [None]:
# Verify merged file
# The context manager closes the file even if one of the assertions below fails.
with h5py.File(output_embeddings_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
               rdcc_nslots=10 ** 7) as combined_embeddings_file:
    for idx, embedding in combined_embeddings_file.items():
        merged_values = embedding[:]  # read the dataset once into an in-memory array
        assert merged_values.shape[0] == one_hot_dim + word2vec_dim, "New dimension is not correct"
        # Every element has to match its source. The former `not (a - b).all()` check already
        # passed as soon as a single element was equal.
        assert (merged_values[:one_hot_dim] == embeddings_one_hot_file[idx][:]).all(), \
            "One_hot_encodings not correctly merged"
        assert (merged_values[one_hot_dim:] == embeddings_word2vec_file[idx][:]).all(), \
            "Word2vec not correctly merged"

    # Show embeddings in internal biotrainer format.
    # The values are copied into memory so the mapping stays usable after the file is closed
    # and the print shows the actual numbers instead of HDF5 dataset handles.
    id2emb = {embedding.attrs["original_id"]: embedding[:] for embedding in combined_embeddings_file.values()}

print("{ID: Embedding} in biotrainer format:\n", id2emb)