In [12]:
from typing import List, Dict, Tuple
import keras
import keras.layers
import keras.utils.all_utils
import keras.callbacks
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def parse_fasta_dataset(file_path: str = "../data/LTP_09_2021_compressed.fasta", max_seqs:int = False) -> List[Dict[str, str]]:
    dataset = []
    current_idx = 0
    current_meta = {}
    with open(file_path, "r") as f:
        for line in f.readlines():
            if line[0] == ">":
                if current_meta != {}:
                    current_meta["sequence"] = current_meta["sequence"].strip()
                    dataset.append(current_meta)
                line_list = line.replace("\n", "").replace(">", "").split("\t")
                if len(line_list) < 2:
                    current_meta  = {key: line_list[idx] for idx, key in enumerate(["name"])}
                elif len(line_list) < 3:
                    current_meta  = {key: line_list[idx] for idx, key in enumerate(["id", "name"])}
                else:
                    current_meta  = {key: line_list[idx] for idx, key in enumerate(["id", "name", "tags"])}
                    current_meta["tags"] = current_meta["tags"].split(";")
                current_meta["sequence"] = ""
                current_idx += 1
                if current_idx > max_seqs and max_seqs>=1:
                    break
            else:
                current_meta["sequence"] += line.replace("\n", " ")
    return dataset

dataset = parse_fasta_dataset(max_seqs=-1)
print(len(dataset))

17959


In [3]:
dataset[0]

{'id': 'AB681979',
 'name': 'Trabulsiella guamensis',
 'tags': ['Bacteria',
  'Proteobacteria',
  'Gammaproteobacteria',
  'Enterobacterales',
  'Enterobacteriaceae',
  'Trabulsiella'],
 'sequence': 'AUUGAACGCU GGCGGCAGGC CUAACACAUG CAAGUCGAGC GGCAGCGGGG GAAAGCUUGC UUUCCCGCCG GCGAGCGGCG GACGGGUGAG UAAUGUCUGG GAAACUGCCU GAUGGAGGGG GAUAACUACU GGAAACGGUA GCUAAUACCG CAUAACGUCU UCGGACCAAA GUGGGGGACC UUCGGGCCUC AUGCCAUCAG AUGUGCCCAG AUGGGAUUAG CUAGUAGGUG GGGUAACGGC UCACCUAGGC GACGAUCCCU AGCUGGUCUG AGAGGAUGAC CAGCCACACU GGAACUGAGA CACGGUCCAG ACUCCUACGG GAGGCAGCAG UGGGGAAUAU UGCACAAUGG GCGCAAGCCU GAUGCAGCCA UGCCGCGUGU AUGAAGAAGG CCUUCGGGUU GUAAAGUACU UUCAGCGGGG AGGAAGGUGU UGUGGUUAAU AACCAGAGCA AUUGACGUUA CCCGCAGAAG AAGCACCGGC UAACUCCGUG CCAGCAGCCG CGGUAAUACG GAGGGUGCAA GCGUUAAUCG GAAUUACUGG GCGUAAAGCG CACGCAGGCG GUCUGUCAAG UCGGAUGUGA AAUCCCCGGG CUCAACCUGG GAACUGCAUC CGAAACUGGC AGGCUUGAGU CUUGUAGAGG GGGGUAGAAU UCCAGGUGUA GCGGUGAAAU GCGUAGAGAU CUGGAGGAAU ACCGGUGGCG AAGGCGGCCC CCUGGACAAA GACUGACG

In [4]:
temp = set("".join(x["sequence"].replace(" ", "") for x in dataset))
encode_dict = {value: idx+1 for idx, value in enumerate(temp)}
X = np.array([np.pad(np.array([encode_dict[z] for z in x["sequence"].replace(" ", "")]), (0, 3000))[:2500] for x in dataset])

In [5]:
temp = {value for x in dataset for value in x["tags"][:3]}
label_encode_dict = {value: idx+1 for idx, value in enumerate(temp)}
Y = keras.utils.all_utils.to_categorical(np.array([[label_encode_dict[z] for z in x["tags"][:3]] for x in dataset]))
Y = np.array([np.sum(x, 0) for x in Y])

In [6]:
Y.shape

(17959, 158)

In [7]:
def build_model(input_shape: Tuple[int], output_shape: Tuple[int], embed_size: int, vocab_size: int, lstm_size: int=32):
    # model input
    model_input = keras.layers.Input(shape=input_shape)

    # embedding layer
    embedding = keras.layers.Embedding(vocab_size, embed_size)(model_input)
    # RNN layer
    rnn_layer = keras.layers.GRU(lstm_size)(embedding)

    # model output
    model_output = keras.layers.Dense(output_shape, activation="relu")(rnn_layer)

    return keras.Model(inputs=[model_input], outputs=[model_output])

In [13]:
model = build_model(input_shape=X[0].shape, output_shape=Y[0].shape[0], embed_size=128, vocab_size=len(encode_dict)+1, lstm_size=32)

model.compile(optimizer="sgd", loss="categorical_crossentropy")

train_hx = model.fit(X[:100], Y[:100], validation_split=0.2, epochs=10, callbacks=[keras.callbacks.TensorBoard()])