In [9]:
# This notebook demonstrates how to apply nucleotide augmentation to existing sequence embeddings
# Original paper: https://www.biorxiv.org/content/10.1101/2022.03.08.483422v1.full.pdf

# We will use embeddings on a sequence level (reduced embeddings)
# We concatenate to existing one_hot_encoding embeddings
# The sequences have ids [Seq1, Seq2, Seq3, Seq4], like the example for sequence_to_class
# Each generated nucleotide sequence is flattened (by taking the mean) into one vector, e.g. [0,0,0,1]+[1,0,0,0]=[0.5,0,0,0.5]

import h5py

from biotrainer.input_files import read_FASTA
from nt_augment import nucleotide_seq_to_unigram_vector, nucleotide_seq_to_trigram_vector, aa_to_nt

In [10]:
# Set hyperparameters for whole file
# Number of generated nucleotide seqs (aug_factor * train_seqs + val_seqs + test_seqs = n_new_seqs)
aug_factor = 3
# nucleotide_seq_to_unigram_vector for unigrams (len 4), nucleotide_seq_to_trigram_vector for trigrams (len 64)
ngram_function = [nucleotide_seq_to_unigram_vector, nucleotide_seq_to_trigram_vector][0]

In [11]:
# Load existing reduced embedding files
# The file is opened read-only and the handle is kept open, because later cells
# read the datasets directly from it.
reduced_embeddings_one_hot_path = "../example_files/reduced_embeddings_file_one_hot_encoding.h5"
# rdcc_nbytes: size of the raw data chunk cache in bytes.
# rdcc_nslots: number of chunk cache slots. This is a count, so pass an integer
# (10 ** 7) rather than the float literal 1e7.
reduced_embeddings_one_hot_file = h5py.File(reduced_embeddings_one_hot_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
                                            rdcc_nslots=10 ** 7)

In [12]:
# Parse the amino acid FASTA file
sequences_path = "../example_files/sequences.fasta"
fasta_result = read_FASTA(sequences_path)

# seq_id -> (sequence, target) for every record
id_to_seq_and_target_dict = {}
for record in fasta_result:
    id_to_seq_and_target_dict[record.seq_id] = (record.seq, record.get_target())

# Ids of records whose set is "val" or "test", or whose "VALIDATION" attribute is truthy
val_or_test_seq_ids = []
for record in fasta_result:
    if record.get_set() in ["val", "test"] or record.attributes.get("VALIDATION", False):
        val_or_test_seq_ids.append(record.seq_id)

In [13]:
# Read FASTA content to write a new FASTA file later
# seq_lines_dict maps seq_id -> (header line, sequence line); both entries end with "\n".
seq_lines_dict = {}
with open(sequences_path, "r") as fasta_file:
    lines = fasta_file.readlines()

# Only lines that *start* with ">" are headers (a ">" elsewhere in a line is not a header marker).
header_indices = [i for i, line in enumerate(lines) if line.startswith(">")]
# Each record spans from its header up to (but excluding) the next header or the end of the file.
for start, end in zip(header_indices, header_indices[1:] + [len(lines)]):
    header_line = lines[start] if lines[start].endswith("\n") else lines[start] + "\n"
    # strip() removes the trailing newline that remains when the header has no attributes (">Seq1\n").
    seq_id = header_line.split(" ")[0].replace(">", "").strip()
    # Join all sequence lines of the record so multi-line sequences are not truncated.
    # A header without any sequence line yields an empty sequence instead of an IndexError.
    sequence = "".join(seq_part.strip() for seq_part in lines[start + 1:end])
    seq_lines_dict[seq_id] = (header_line, sequence + "\n")

In [14]:
# Generate nucleotide sequences for every amino acid sequence.
# aa_to_nt returns two values; only the first one is used here.
# nt_result maps each sequence id to a list of nucleotide sequences (see printed output).
nt_result, _ = aa_to_nt(
    id_to_seq_and_target_dict,
    aug_factor=aug_factor,
)
print(nt_result)

{'Seq1': ['TCTGAACAATGGGAAAATTGTGAA', 'TCTGAACAATGGGAAAATTGTGAG', 'TCTGAACAATGGGAAAATTGCGAA'], 'Seq2': ['CCTCGTACTGAAATTAAT', 'CCTCGTACTGAAATTAAC', 'CCTCGTACTGAAATCAAT'], 'Seq3': ['TCTGAACAAGTTGAAAATTGTGAACCTCGTACTGAAATT', 'TCTGAACAAGTTGAAAATTGTGAACCTCGTACTGAAATC', 'TCTGAACAAGTTGAAAATTGTGAACCTCGTACTGAAATA'], 'Seq4': ['CCTCGTACTGAAATTAATTCTGAACAATGGGAAAATTGTGAA', 'CCTCGTACTGAAATTAATTCTGAACAATGGGAAAATTGTGAG', 'CCTCGTACTGAAATTAATTCTGAACAATGGGAAAATTGCGAA']}


In [15]:
# Append nucleotide vector to existing embedding
# For every embedding in the input file, write one (val/test) or aug_factor (train) datasets
# to a new h5 file, each consisting of the original embedding followed by the n-gram vector of
# one generated nucleotide sequence. A matching FASTA entry is written for every new dataset.
# 1. Create a new file
output_embeddings_path = "enhanced_nucleotide_embeddings.h5"
output_sequences_path = "enhanced_sequences.fasta"
# 2. Get embedding dimension (taken from the dataset stored under key "0")
one_hot_dim = reduced_embeddings_one_hot_file["0"].shape[0]

with h5py.File(output_embeddings_path, "w") as output_embeddings_file, open(output_sequences_path, "w") as output_sequences_file:
    # Running key of the datasets in the output file ("0", "1", ...)
    augment_id = 0
    for idx, embedding in reduced_embeddings_one_hot_file.items():
        # `embedding` is the dataset stored under `idx`, so its attrs can be read directly
        original_sequence_id = embedding.attrs["original_id"]
        # Only augment train sequences aug_factor-times
        # Note that Val and Test sequences also get the extended embedding (but only for one sequence)
        add_n_times = aug_factor if original_sequence_id not in val_or_test_seq_ids else 1
        for nt in range(add_n_times):
            # 3. Calculate flattened nucleotide ngram vector
            nt_appendix = ngram_function(nt_result[original_sequence_id][nt])
            appendix_dim = len(nt_appendix)  # 4 for unigrams, 64 for trigrams
            extended_dim = one_hot_dim + appendix_dim
            # 4. Save one_hot_encoding values in new file with extended shape
            #    (maxshape allows the dataset to grow to the extended length afterwards)
            output_dataset = output_embeddings_file.create_dataset(str(augment_id), data=embedding,
                                                                   compression="gzip", chunks=True,
                                                                   maxshape=(extended_dim,))
            # 5. Append the ngram vector: grow the dataset, then fill the new trailing entries
            output_dataset.resize(extended_dim, axis=0)
            output_dataset[-appendix_dim:] = nt_appendix
            # 6. Set new original sequence id (e.g. Seq1 -> Seq1I, Seq1II, ...)
            augmented_sequence_id = original_sequence_id + "I" * (nt + 1)
            output_dataset.attrs["original_id"] = augmented_sequence_id
            # 7. Write sequence with augmented_sequence_id to new fasta file
            # Replace only the first occurrence (the id right after ">"), so that attribute
            # text that happens to contain the id is left untouched.
            header = seq_lines_dict[original_sequence_id][0].replace(original_sequence_id, augmented_sequence_id, 1)
            seq = seq_lines_dict[original_sequence_id][1]
            output_sequences_file.write(header)
            output_sequences_file.write(seq)

            augment_id += 1

In [16]:
# Verify new file
# The context manager closes the file even if reading or printing raises.
with h5py.File(output_embeddings_path, 'r') as combined_embeddings_file:
    # Show embeddings in internal biotrainer format
    # The values are the h5py dataset objects themselves (not copies of the data),
    # so they are printed while the file is still open.
    id2emb = {combined_embeddings_file[idx].attrs["original_id"]: embedding for (idx, embedding) in
              combined_embeddings_file.items()}
    print("{ID: Embedding} in biotrainer format:\n", id2emb)

{ID: Embedding} in biotrainer format:
 {'Seq3I': <HDF5 dataset "0": shape (25,), type "<f4">, 'Seq4I': <HDF5 dataset "1": shape (25,), type "<f4">, 'Seq1I': <HDF5 dataset "2": shape (25,), type "<f4">, 'Seq1II': <HDF5 dataset "3": shape (25,), type "<f4">, 'Seq1III': <HDF5 dataset "4": shape (25,), type "<f4">, 'Seq2I': <HDF5 dataset "5": shape (25,), type "<f4">}
