In [1]:
# This notebook demonstrates how to concatenate two embeddings.
# First, we concatenate embeddings on the sequence level (reduced embeddings); then we concatenate embeddings on the per-residue level.
# In both cases, we concatenate one_hot_encoding and word2vec embeddings.

import h5py

In [2]:
"""I. SEQUENCE LEVEL EMBEDDINGS (reduced)"""
# The sequences have ids [Seq1, Seq2, Seq3, Seq4], like the example for sequence_to_class

'I. SEQUENCE LEVEL EMBEDDINGS (reduced)'

In [3]:
# Load existing reduced embedding files (each dataset holds one vector per sequence)
reduced_embeddings_one_hot_path = "../example_files/reduced_embeddings_file_one_hot_encoding.h5"
reduced_embeddings_word2vec_path = "../example_files/reduced_embeddings_file_word2vec.h5"

# Both files are opened read-only and are intentionally left open here:
# the merge and verification cells below read from these handles.
# rdcc_nbytes / rdcc_nslots configure the raw data chunk cache of the file.
# rdcc_nslots is a number of slots, so it is passed as an int (10 ** 7) instead of the float 1e7.
reduced_embeddings_one_hot_file = h5py.File(reduced_embeddings_one_hot_path, 'r',
                                            rdcc_nbytes=1024 ** 2 * 4000, rdcc_nslots=10 ** 7)
reduced_embeddings_word2vec_file = h5py.File(reduced_embeddings_word2vec_path, 'r',
                                             rdcc_nbytes=1024 ** 2 * 4000, rdcc_nslots=10 ** 7)

In [4]:
# Merge embedding files:
# 1. Create a new file
reduced_output_embeddings_path = "reduced_one_hot_and_word2vec_embeddings.h5"
# 2. Get embedding dimensions to merge
#    (read from the dataset stored under key "0"; the code below relies on all
#    datasets within one file having this same length)
one_hot_dim = reduced_embeddings_one_hot_file["0"].shape[0]
word2vec_dim = reduced_embeddings_word2vec_file["0"].shape[0]
reduced_combined_dim = one_hot_dim + word2vec_dim

with h5py.File(reduced_output_embeddings_path, "w") as reduced_output_embeddings_file:
    # 3. Save one_hot_encoding values in new file with extended shape
    for idx, embedding in reduced_embeddings_one_hot_file.items():
        # maxshape needs one entry per axis, i.e. a 1-tuple here (note the trailing comma);
        # it allows the dataset to be grown to the combined dimension in step 4.
        merged_dataset = reduced_output_embeddings_file.create_dataset(
            idx, data=embedding[:], compression="gzip", chunks=True,
            maxshape=(reduced_combined_dim,))
        # Carry over the original sequence id so the merged file can be mapped back to the sequences
        merged_dataset.attrs["original_id"] = embedding.attrs["original_id"]

    # 4. Append word2vec embeddings
    for idx, merged_dataset in reduced_output_embeddings_file.items():
        # Raises KeyError if the word2vec file has no dataset for this key
        appendix = reduced_embeddings_word2vec_file[idx][:]
        # Grow the dataset to the combined length, then fill the new tail with the word2vec values
        merged_dataset.resize(reduced_combined_dim, axis=0)
        merged_dataset[-appendix.size:] = appendix



In [5]:
# Verify merged file
# The file is opened in a with-block so that it is closed even if an assertion fails.
with h5py.File(reduced_output_embeddings_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
               rdcc_nslots=10 ** 7) as reduced_combined_embeddings_file:
    for idx, embedding in reduced_combined_embeddings_file.items():
        merged = embedding[:]
        assert merged.shape[0] == one_hot_dim + word2vec_dim, "New dimension is not correct"
        # The element-wise difference must be zero everywhere. `.any()` detects a single
        # differing value, whereas `.all()` would only detect the case where every value differs.
        assert not (merged[:one_hot_dim] - reduced_embeddings_one_hot_file[idx][:]).any(), \
            "One_hot_encodings not correctly merged"
        assert not (merged[one_hot_dim:] - reduced_embeddings_word2vec_file[idx][:]).any(), \
            "Word2vec not correctly merged"

    # Show embeddings in internal biotrainer format
    # NOTE: the dict values are HDF5 dataset handles, which are only readable while the file
    # is open. Use embedding[:] instead if the values are needed after this block.
    id2emb = {reduced_combined_embeddings_file[idx].attrs["original_id"]: embedding for (idx, embedding) in
              reduced_combined_embeddings_file.items()}
    print("{ID: Embedding} in biotrainer format:\n", id2emb)

{ID: Embedding} in biotrainer format:
 {'Seq3': <HDF5 dataset "0": shape (533,), type "<f4">, 'Seq4': <HDF5 dataset "1": shape (533,), type "<f4">, 'Seq1': <HDF5 dataset "2": shape (533,), type "<f4">, 'Seq2': <HDF5 dataset "3": shape (533,), type "<f4">}


In [6]:
"""II. RESIDUE LEVEL EMBEDDINGS """
# The sequences have ids [Seq1, Seq2, Seq3], like the example for residue_to_class

'II. RESIDUE LEVEL EMBEDDINGS '

In [7]:
# Load existing embedding files (each dataset holds one vector per residue of a sequence)
embeddings_one_hot_path = "../example_files/embeddings_file_one_hot_encoding.h5"
embeddings_word2vec_path = "../example_files/embeddings_file_word2vec.h5"

# Both files are opened read-only and are intentionally left open here:
# the merge and verification cells below read from these handles.
# rdcc_nbytes / rdcc_nslots configure the raw data chunk cache of the file.
# rdcc_nslots is a number of slots, so it is passed as an int (10 ** 7) instead of the float 1e7.
embeddings_one_hot_file = h5py.File(embeddings_one_hot_path, 'r',
                                    rdcc_nbytes=1024 ** 2 * 4000, rdcc_nslots=10 ** 7)
embeddings_word2vec_file = h5py.File(embeddings_word2vec_path, 'r',
                                     rdcc_nbytes=1024 ** 2 * 4000, rdcc_nslots=10 ** 7)

In [8]:
# Merge embedding files:
# 1. Create a new file
output_embeddings_path = "one_hot_and_word2vec_embeddings.h5"
# 2. Get embedding dimensions to merge
#    (axis 0 is the residue axis, axis 1 the embedding axis)
#    NOTE: these names overwrite the sequence-level dimensions computed in the earlier merge cell.
one_hot_dim = embeddings_one_hot_file["0"].shape[1]
word2vec_dim = embeddings_word2vec_file["0"].shape[1]

with h5py.File(output_embeddings_path, "w") as output_embeddings_file:
    # 3. Save one_hot_encoding values in new file with extended shape
    for idx, embedding in embeddings_one_hot_file.items():
        sequence_length = embedding.shape[0]
        # maxshape keeps the number of residues fixed and allows the embedding axis
        # to be grown to the combined dimension in step 4.
        merged_dataset = output_embeddings_file.create_dataset(
            idx, data=embedding, compression="gzip", chunks=True,
            maxshape=(sequence_length, one_hot_dim + word2vec_dim))
        # Carry over the original sequence id so the merged file can be mapped back to the sequences
        merged_dataset.attrs["original_id"] = embedding.attrs["original_id"]

    # 4. Append word2vec embeddings
    for idx, merged_dataset in output_embeddings_file.items():
        # Read the whole word2vec embedding once instead of re-opening the dataset per residue
        appendix = embeddings_word2vec_file[idx][:]
        if appendix.shape[0] != merged_dataset.shape[0]:
            raise ValueError("Sequence lengths of one_hot_encoding and word2vec embeddings differ "
                             "for dataset " + str(idx))
        merged_dataset.resize(one_hot_dim + word2vec_dim, axis=1)
        # Fill the newly added columns of all residues with a single write
        merged_dataset[:, -appendix.shape[1]:] = appendix


In [9]:
# Verify merged file
# The file is opened in a with-block so that it is closed even if an assertion fails.
with h5py.File(output_embeddings_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
               rdcc_nslots=10 ** 7) as combined_embeddings_file:
    for idx, embedding in combined_embeddings_file.items():
        # Read each dataset once and compare all residues at the same time
        merged = embedding[:]
        one_hot_source = embeddings_one_hot_file[idx][:]
        word2vec_source = embeddings_word2vec_file[idx][:]

        assert merged.shape[1] == one_hot_dim + word2vec_dim, "New dimension is not correct"
        assert merged.shape[0] == one_hot_source.shape[0] == word2vec_source.shape[0], \
            "Sequence length is not correct"
        # The element-wise difference must be zero everywhere. `.any()` detects a single
        # differing value, whereas `.all()` would only detect the case where every value differs.
        assert not (merged[:, :one_hot_dim] - one_hot_source).any(), "One_hot_encodings not correctly merged"
        assert not (merged[:, one_hot_dim:] - word2vec_source).any(), "Word2vec not correctly merged"

    # Show embeddings in internal biotrainer format
    # NOTE: the dict values are HDF5 dataset handles, which are only readable while the file
    # is open. Use embedding[:] instead if the values are needed after this block.
    id2emb = {combined_embeddings_file[idx].attrs["original_id"]: embedding for (idx, embedding) in
              combined_embeddings_file.items()}
    print("{ID: Embedding} in biotrainer format:\n", id2emb)

{ID: Embedding} in biotrainer format:
 {'Seq3': <HDF5 dataset "0": shape (14, 533), type "<f4">, 'Seq1': <HDF5 dataset "1": shape (8, 533), type "<f4">, 'Seq2': <HDF5 dataset "2": shape (6, 533), type "<f4">}
