In [1]:
# This notebook demonstrates how to add arbitrary numerical values from a csv file to an existing embedding file
# We will use embeddings on a sequence level (reduced embeddings)
# We concatenate to existing one_hot_encoding embeddings
# The sequences have ids [Seq1, Seq2, Seq3, Seq4], like the example for sequence_to_class

import h5py
import numpy as np

In [2]:
# Load existing reduced embedding files (opened read-only)
reduced_embeddings_one_hot_path = "../example_files/reduced_embeddings_file_one_hot_encoding.h5"
# The handle is deliberately left open here: later cells read datasets and attributes from it.
# rdcc_nbytes / rdcc_nslots tune h5py's raw data chunk cache. The slot count is a count,
# so it is passed as an integer (10 ** 7) rather than the float literal 1e7.
reduced_embeddings_one_hot_file = h5py.File(reduced_embeddings_one_hot_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
                                            rdcc_nslots=10 ** 7)

In [4]:
# Read values from csv
# Line layout implied by the parsing below (after one header line): <sequence id>,[v1;v2;...]
# Result: values_dict maps each sequence id to its list of float values.
values_dict = {}
with open("../example_files/arbitrary_values.csv", "r") as csv_file:
    next(csv_file, None)  # Skip the header line (no error if the file is empty)
    for line in csv_file:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no record
        if not line.strip():
            continue
        fields = line.rstrip("\n").split(",")
        seq_id = fields[0]
        # Second column holds the values as "[v1;v2;...]": drop the brackets, split on ";"
        raw_values = fields[1].replace("[", "").replace("]", "")
        values_dict[seq_id] = [float(value) for value in raw_values.split(";")]

In [5]:
# Add values to new file:
# 1. Create a new file
reduced_output_embeddings_path = "enhanced_one_hot_embeddings.h5"
# 2. Get embedding dimension (taken from the dataset stored under key "0")
one_hot_dim = reduced_embeddings_one_hot_file["0"].shape[0]

with h5py.File(reduced_output_embeddings_path, "w") as reduced_output_embeddings_file:
    # 3. Save one_hot_encoding values in new file with extended shape
    for idx, embedding in reduced_embeddings_one_hot_file.items():
        original_sequence_id = embedding.attrs["original_id"]
        appendix_dim = len(values_dict[original_sequence_id])
        # maxshape must describe every axis, so pass an explicit 1-tuple (note the trailing comma).
        # It reserves room for the values appended in step 4.
        output_dataset = reduced_output_embeddings_file.create_dataset(
            idx, data=embedding, compression="gzip", chunks=True,
            maxshape=(one_hot_dim + appendix_dim,))
        output_dataset.attrs["original_id"] = original_sequence_id

    # 4. Append values from csv file
    for idx, output_dataset in reduced_output_embeddings_file.items():
        original_sequence_id = output_dataset.attrs["original_id"]
        appendix = values_dict[original_sequence_id]
        if not appendix:
            # Nothing to append; a negative-length slice ([-0:]) would select the whole dataset
            continue
        output_dataset.resize(one_hot_dim + len(appendix), axis=0)
        # Write the new values into the freshly added tail of the dataset
        output_dataset[one_hot_dim:] = np.array(appendix)



In [7]:
# Verify merged file
# The context manager closes the file even if one of the assertions below fails.
with h5py.File(reduced_output_embeddings_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
               rdcc_nslots=10 ** 7) as reduced_combined_embeddings_file:
    for idx, embedding in reduced_combined_embeddings_file.items():
        original_sequence_id = embedding.attrs["original_id"]
        appendix = values_dict[original_sequence_id]
        assert embedding.shape[0] == one_hot_dim + len(appendix), "New dimension is not correct"
        # Every appended value has to match. A tolerance-based comparison is used because the
        # dataset may be stored with lower precision than the Python floats read from the csv.
        assert np.allclose(embedding[one_hot_dim:], np.array(appendix)), "Values not correctly merged"

    # Show embeddings in internal biotrainer format
    # Note: the dict values are h5py dataset handles, so they are only readable while the file is open.
    id2emb = {embedding.attrs["original_id"]: embedding for (idx, embedding) in
              reduced_combined_embeddings_file.items()}
    print("{ID: Embedding} in biotrainer format:\n", id2emb)

{ID: Embedding} in biotrainer format:
 {'Seq3': <HDF5 dataset "0": shape (26,), type "<f4">, 'Seq4': <HDF5 dataset "1": shape (26,), type "<f4">, 'Seq1': <HDF5 dataset "2": shape (26,), type "<f4">, 'Seq2': <HDF5 dataset "3": shape (26,), type "<f4">}
