In [1]:
# This notebook demonstrates how to create a new embedding file from existing values in a CSV file.
# We will use embeddings at the sequence level (reduced embeddings).
# The sequences have the ids [Seq1, Seq2, Seq3, Seq4], like the example for sequence_to_class.

import h5py
import numpy as np

In [2]:
# Read values from csv
# Each data row is parsed as "<seq_id>,[v1;v2;...]"; the first line is treated as a header and skipped.
values_dict = {}
with open("../example_files/arbitrary_values.csv", "r") as csv_file:
    next(csv_file, None)  # skip the header row (no-op if the file is empty)
    for line in csv_file:
        if not line.strip():
            # Blank lines carry no record; without this guard fields[1] would raise IndexError
            continue
        fields = line.split(",")
        seq_id = fields[0]
        # Strip the surrounding brackets, then split the ";"-separated numbers
        raw_values = fields[1].replace("[", "").replace("]", "")
        values_dict[seq_id] = [float(value) for value in raw_values.split(";")]

In [9]:
# Add values to new file:
# 1. Create a new file
output_embeddings_path = "arbitrary_values_embeddings.h5"

with h5py.File(output_embeddings_path, "w") as output_embeddings_file:
    # 2. Save values as dataset in new file
    # Each dataset is named by a running index; the sequence id is kept in its "original_id" attribute
    for idx, (seq_id, values) in enumerate(values_dict.items()):
        dataset = output_embeddings_file.create_dataset(
            str(idx), data=np.array(values), compression="gzip", chunks=True,
            # Pass maxshape as a real 1-tuple: "(n)" without the trailing comma is just the int n
            maxshape=(len(values),))
        dataset.attrs["original_id"] = seq_id

In [10]:
# Verify created file
# Opened through a context manager so the file is closed even if an assertion below fails
with h5py.File(output_embeddings_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
               rdcc_nslots=int(1e7)) as new_embeddings_file:
    for idx, embedding in new_embeddings_file.items():
        original_sequence_id = embedding.attrs["original_id"]
        appendix = values_dict[original_sequence_id]
        assert embedding.shape[0] == len(appendix), "New dimension is not correct"
        # Every element must match. The previous check, `not (a - b).all()`, passed as soon as
        # a single element was equal, so it could not detect partially wrong data.
        assert np.array_equal(embedding[()], np.array(appendix)), "Values not correctly merged"

    # Show embeddings in internal biotrainer format
    # NOTE: the dict values are h5py dataset handles, which are only readable while the file is open
    id2emb = {embedding.attrs["original_id"]: embedding for (idx, embedding) in
              new_embeddings_file.items()}
    print("{ID: Embedding} in biotrainer format:\n", id2emb)

{ID: Embedding} in biotrainer format:
 {'Seq1': <HDF5 dataset "0": shape (5,), type "<f8">, 'Seq2': <HDF5 dataset "1": shape (5,), type "<f8">, 'Seq3': <HDF5 dataset "2": shape (5,), type "<f8">, 'Seq4': <HDF5 dataset "3": shape (5,), type "<f8">}
