In [None]:
#download with gdown
import gdown

# Google Drive file ids of the three dataset splits.
train_file_id = "1rf49ePaYGUJMYmbsSNcYorPtmv5u1mrJ"
validation_file_id = "1-1WenJet60MW5nQ9Wf90g9xkaVW2stbf"
test_file_id = "1--Qrm6wUNgatIohIogu3JNBUhcGHbs9A"

# Fetch each split to the path the loading cell reads from.
gdown.download(id=train_file_id, output="/content/train.data")
gdown.download(id=validation_file_id, output="/content/validation.data")
gdown.download(id=test_file_id, output="/content/test.data")

In [None]:
import numpy as np
from numba import njit
import numba

# Encode k-mer function
@njit
def encode_kmer(kmer, k=3):
    """One-hot encode a k-mer into a flat boolean vector of length ``4 * k``.

    Position ``i`` of the k-mer owns slots ``4*i .. 4*i + 3`` in the order
    A, T, C, G. Any other character (including 'N') leaves its four slots
    all False.

    Parameters
    ----------
    kmer : str
        The k-mer to encode. If it is shorter than ``k`` the trailing slots
        stay False; characters beyond position ``k`` are ignored.
    k : int, optional
        Number of positions in the output (default 3, i.e. 12 slots).

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(4 * k,)``.
    """
    nucleotide_to_index = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}
    kmer_encoded = np.zeros(4 * k, dtype=numba.boolean)

    for i, nucleotide in enumerate(kmer):
        # njit code is not bounds-checked by default, so an over-long k-mer
        # must be cut off here rather than written past the buffer.
        if i >= k:
            break
        index = nucleotide_to_index.get(nucleotide, 4)
        if index < 4:
            kmer_encoded[i * 4 + index] = True

    return kmer_encoded

@njit
def encode_sequence(sequence, length=101):
    """One-hot encode a nucleotide sequence into a ``(length, 4)`` boolean matrix.

    Row ``i`` describes character ``i`` of ``sequence``; the columns are in
    the order A, T, C, G. Any other character (including 'N') yields an
    all-False row.

    Parameters
    ----------
    sequence : str
        The sequence to encode. If it is shorter than ``length`` the
        remaining rows stay False; characters beyond ``length`` are ignored.
    length : int, optional
        Number of rows in the output (default 101).

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(length, 4)``.
    """
    nucleotide_to_index = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}
    sequence_encoded = np.zeros((length, 4), dtype=numba.boolean)

    for i, nucleotide in enumerate(sequence):
        # njit code is not bounds-checked by default, so an over-long
        # sequence must be cut off here rather than written past the buffer.
        if i >= length:
            break
        index = nucleotide_to_index.get(nucleotide, 4)
        if index < 4:
            sequence_encoded[i, index] = True

    return sequence_encoded

# Construct De Bruijn graph function
@njit
def construct_de_bruijn_graph(dna_sequence, k):
    """Build per-k-mer node features, a chain adjacency matrix and a one-hot sequence.

    One node is created per k-mer start position of a 101-character
    sequence, i.e. ``101 - k + 1`` nodes (99 for ``k == 3``). Node ``i`` is
    the one-hot encoding of ``dna_sequence[i:i + k]``: position ``j`` of the
    k-mer owns columns ``4*j .. 4*j + 3`` in the order A, T, C, G, and any
    other character (including 'N') leaves its columns False. Positions that
    fall past the end of a shorter sequence are left False as well.

    The adjacency matrix links each node only to its immediate predecessor
    and successor (node ``i`` with ``i - 1`` and ``i + 1``).

    Parameters
    ----------
    dna_sequence : str
        Nucleotide sequence; expected to be 101 characters long.
    k : int
        k-mer length, ``1 <= k <= 101``.

    Returns
    -------
    node_features : numpy.ndarray
        Boolean array of shape ``(101 - k + 1, 4 * k)``.
    adj : numpy.ndarray
        Boolean array of shape ``(101 - k + 1, 101 - k + 1)``.
    one_hot_encoded_sequence : numpy.ndarray
        Result of ``encode_sequence(dna_sequence)``.

    Raises
    ------
    ValueError
        If ``k`` is outside ``1..101``.
    """
    sequence_length = 101  # fixed input length the node count is derived from
    if k < 1 or k > sequence_length:
        raise ValueError("k must be between 1 and 101")

    n_nodes = sequence_length - k + 1
    available = len(dna_sequence)
    nucleotide_to_index = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}

    # Encode every k-mer straight into its row: this keeps the feature width
    # tied to k and avoids one temporary array per k-mer.
    node_features = np.zeros((n_nodes, 4 * k), dtype=numba.boolean)
    for i in range(n_nodes):
        for j in range(k):
            position = i + j
            if position < available:
                index = nucleotide_to_index.get(dna_sequence[position], 4)
                if index < 4:
                    node_features[i, j * 4 + index] = True

    # Super- and sub-diagonal: each node is connected to its two neighbours.
    adj = np.eye(n_nodes, k=1, dtype=numba.boolean) | np.eye(n_nodes, k=-1, dtype=numba.boolean)

    one_hot_encoded_sequence = encode_sequence(dna_sequence)

    return node_features, adj, one_hot_encoded_sequence

def _load_split(path, k=3):
    """Read one dataset file and encode every row as a graph and a one-hot matrix.

    Columns 1 and 2 of the space-delimited file are read as the sequence and
    its label. Rows with the wrong number of columns are skipped by
    ``genfromtxt`` (``invalid_raise=False``).

    Parameters
    ----------
    path : str
        Location of the data file.
    k : int, optional
        k-mer length forwarded to ``construct_de_bruijn_graph`` (default 3).

    Returns
    -------
    graphs : list
        One ``[node_features, adj, label]`` entry per row.
    data : list
        One ``[encoded_sequence, label]`` entry per row.
        Labels are 0-d boolean arrays.

    Raises
    ------
    ValueError
        If a label is not an integer literal.
    """
    rows = np.genfromtxt(path, delimiter=" ", dtype=str, usecols=(1, 2), invalid_raise=False)
    # A file with a single data row comes back 1-D (and an empty one with
    # shape (0,)); force the (n_rows, 2) layout the loop below unpacks.
    rows = rows.reshape(-1, 2)

    graphs, data = [], []
    for seq, label in rows:
        node_features, adj, encoded_sequence = construct_de_bruijn_graph(seq, k)
        # Parse the label as an integer before taking its truth value.
        # Casting the string directly with astype(bool) treats every
        # non-empty string, including "0", as True on NumPy >= 2.0.
        is_positive = int(label) != 0
        graphs.append([node_features.astype(bool), adj.astype(bool), np.array(is_positive)])
        data.append([encoded_sequence.astype(bool), np.array(is_positive)])

    return graphs, data


train_graphs, train_data = _load_split('/content/train.data')
validation_graphs, validation_data = _load_split('/content/validation.data')
test_graphs, test_data = _load_split('/content/test.data')

In [None]:
def _graph_columns(graph_records):
    """Split ``[node_features, adj, label]`` records into three parallel lists."""
    features_column = [features for features, _, _ in graph_records]
    adjacency_column = [adjacency for _, adjacency, _ in graph_records]
    label_column = [flag for _, _, flag in graph_records]
    return features_column, adjacency_column, label_column


# Training split
node_features, adj, labels = _graph_columns(train_graphs)
np.savez_compressed('train_graph_data.npz',
                    node_features=node_features,
                    adj=adj,
                    labels=labels)

# Validation split
node_features, adj, labels = _graph_columns(validation_graphs)
np.savez_compressed('validation_graph_data.npz',
                    node_features=node_features,
                    adj=adj,
                    labels=labels)

# Test split
node_features, adj, labels = _graph_columns(test_graphs)
np.savez_compressed('test_graph_data.npz',
                    node_features=node_features,
                    adj=adj,
                    labels=labels)

In [None]:
# Each *_data entry is an [encoded_sequence, label] pair; pull the two
# columns apart and write them to one compressed archive per split.

# Training split
train_sequences = [encoded for encoded, _ in train_data]
train_labels = [flag for _, flag in train_data]
np.savez_compressed('train_data.npz',
                    train_sequences=train_sequences,
                    train_labels=train_labels)

# Validation split
validation_sequences = [encoded for encoded, _ in validation_data]
validation_labels = [flag for _, flag in validation_data]
np.savez_compressed('validation_data.npz',
                    validation_sequences=validation_sequences,
                    validation_labels=validation_labels)

# Test split
test_sequences = [encoded for encoded, _ in test_data]
test_labels = [flag for _, flag in test_data]
np.savez_compressed('test_data.npz',
                    test_sequences=test_sequences,
                    test_labels=test_labels)