In [25]:
import re

# This notebook demonstrates how to add arbitrary numerical values from a csv file to an existing embedding file
# We will use embeddings on a sequence level (reduced embeddings)
# We concatenate to existing one_hot_encoding embeddings
# The sequences have ids [Seq1, Seq2, Seq3, Seq4], like the example for sequence_to_class
# TODO!!

import h5py
from Bio import SeqIO

import nt_augment
import numpy as np

In [26]:
# Load existing reduced embedding files
reduced_embeddings_one_hot_path = "../example_files/reduced_embeddings_file_one_hot_encoding.h5"
reduced_embeddings_one_hot_file = h5py.File(reduced_embeddings_one_hot_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
                                            rdcc_nslots=1e7)

In [27]:
# Read FASTA file with AA sequences
sequences_path = "../example_files/sequences.fasta"
fasta_result = list(SeqIO.parse(sequences_path, "fasta"))
id_to_seq_and_target_dict = {}
for sequence in fasta_result:
    for key, value in re.findall(r"([A-Z_]+)=(-?[A-z0-9]+[.0-9]*)", sequence.description):
        if key == "TARGET":
            id_to_seq_and_target_dict[str(sequence.id)] = (sequence.seq, value)

print(id_to_seq_and_target_dict)

{'Seq1': (Seq('SEQWENCE'), 'Glob'), 'Seq2': (Seq('PRTEIN'), 'GlobSP'), 'Seq3': (Seq('SEQVENCEPRTEI'), 'TM'), 'Seq4': (Seq('PRTEINSEQWENCE'), 'TMSP')}


In [32]:
# Read FASTA content for writing later
seq_lines_dict = {}
with open(sequences_path, "r") as fasta_file:
    lines = fasta_file.readlines()
    for i, line in enumerate(lines):
        if ">" in line:
            seq_id = line.split(" ")[0].replace(">", "")
            seq_lines_dict[seq_id] = (line, lines[i+1] if lines[i+1][-1] == "\n" else lines[i+1] + "\n")

In [33]:
aug_factor = 3
nt_result, _ = nt_augment.aa_to_nt(id_to_seq_and_target_dict, aug_factor=3)
print(nt_result)
"""
data = {}
data['aaseq'] = sequences
data['target'] = targets
result = nt_augment.nt_augmentation(data, final_data_len=20)
print(result)
"""

{'Seq1': ['TCTGAACAATGGGAAAATTGTGAA', 'TCTGAACAATGGGAAAATTGTGAG', 'TCTGAACAATGGGAAAATTGCGAA'], 'Seq2': ['CCTCGTACTGAAATTAAT', 'CCTCGTACTGAAATTAAC', 'CCTCGTACTGAAATCAAT'], 'Seq3': ['TCTGAACAAGTTGAAAATTGTGAACCTCGTACTGAAATT', 'TCTGAACAAGTTGAAAATTGTGAACCTCGTACTGAAATC', 'TCTGAACAAGTTGAAAATTGTGAACCTCGTACTGAAATA'], 'Seq4': ['CCTCGTACTGAAATTAATTCTGAACAATGGGAAAATTGTGAA', 'CCTCGTACTGAAATTAATTCTGAACAATGGGAAAATTGTGAG', 'CCTCGTACTGAAATTAATTCTGAACAATGGGAAAATTGCGAA']}


"\ndata = {}\ndata['aaseq'] = sequences\ndata['target'] = targets\nresult = nt_augment.nt_augmentation(data, final_data_len=20)\nprint(result)\n"

In [36]:
# Append to embedding
def nucleotide_to_one_hot(nt: str):
    return {'A': [1, 0, 0, 0],
            'C': [0, 1, 0, 0],
            'T': [0, 0, 1, 0],
            'G': [0, 0, 0, 1]}[nt]


def nucleotide_seq_to_vector(nt_seq: str):
    result_list = []
    for nt in nt_seq:
        one_hot = nucleotide_to_one_hot(nt)
        result_list.append(one_hot)
    return np.sum(result_list, axis=0) / len(result_list)


# Add values to new file:
# 1. Create a new file
output_embeddings_path = "enhanced_nucleotide_embeddings.h5"
output_sequences_path = "enhanced_sequences.fasta"
# 2. Get embedding dimension
one_hot_dim = reduced_embeddings_one_hot_file["0"].shape[0]

with h5py.File(output_embeddings_path, "w") as output_embeddings_file, open(output_sequences_path, "w") as output_sequences_file:
    # 3. Save one_hot_encoding values in new file with extended shape
    augment_id = 0
    for idx, embedding in reduced_embeddings_one_hot_file.items():
        for nt in range(aug_factor):
            original_sequence_id = reduced_embeddings_one_hot_file[idx].attrs["original_id"]
            nt_appendix = nucleotide_seq_to_vector(nt_result[original_sequence_id][nt])
            appendix_dim = len(nt_appendix)  # 4
            output_embeddings_file.create_dataset(str(augment_id), data=embedding, compression="gzip", chunks=True,
                                                  maxshape=(one_hot_dim + appendix_dim))
            output_embeddings_file[str(augment_id)].resize((one_hot_dim + appendix_dim), axis=0)
            output_embeddings_file[str(augment_id)][-appendix_dim:] = nt_appendix
            augmented_sequence_id = original_sequence_id + "I" * (nt + 1)
            output_embeddings_file[str(augment_id)].attrs["original_id"] = augmented_sequence_id
            header = seq_lines_dict[original_sequence_id][0].replace(original_sequence_id, augmented_sequence_id)
            seq = seq_lines_dict[original_sequence_id][1]
            output_sequences_file.write(header)
            output_sequences_file.write(seq)
            augment_id += 1

In [37]:
# Verify merged file
combined_embeddings_file = h5py.File(output_embeddings_path, 'r', rdcc_nbytes=1024 ** 2 * 4000,
                                     rdcc_nslots=1e7)
print(combined_embeddings_file)
# Show embeddings in internal biotrainer format
id2emb = {combined_embeddings_file[idx].attrs["original_id"]: embedding for (idx, embedding) in
          combined_embeddings_file.items()}
print("{ID: Embedding} in biotrainer format:\n", id2emb)

combined_embeddings_file.close()

<HDF5 file "enhanced_nucleotide_embeddings.h5" (mode r)>
{ID: Embedding} in biotrainer format:
 {'Seq3I': <HDF5 dataset "0": shape (25,), type "<f4">, 'Seq3II': <HDF5 dataset "1": shape (25,), type "<f4">, 'Seq2II': <HDF5 dataset "10": shape (25,), type "<f4">, 'Seq2III': <HDF5 dataset "11": shape (25,), type "<f4">, 'Seq3III': <HDF5 dataset "2": shape (25,), type "<f4">, 'Seq4I': <HDF5 dataset "3": shape (25,), type "<f4">, 'Seq4II': <HDF5 dataset "4": shape (25,), type "<f4">, 'Seq4III': <HDF5 dataset "5": shape (25,), type "<f4">, 'Seq1I': <HDF5 dataset "6": shape (25,), type "<f4">, 'Seq1II': <HDF5 dataset "7": shape (25,), type "<f4">, 'Seq1III': <HDF5 dataset "8": shape (25,), type "<f4">, 'Seq2I': <HDF5 dataset "9": shape (25,), type "<f4">}
