In [4]:
import numpy as np
import pandas as pd
import random
import shutil
import sys
sys.path.append('..')
import utils

from pathlib import Path

In [5]:
# Location of the filtered fragment pickles
fragments_dir = Path('../data/fragments/')

# Load one fragment collection per fragment length (8, 10, 12 and 14) through
# the project's utils.load helper; file names follow fragments_<length>_filtered.pickle.
fragments_8, fragments_10, fragments_12, fragments_14 = (
    utils.load(fragments_dir/f'fragments_{length}_filtered.pickle')
    for length in (8, 10, 12, 14)
)

In [3]:
# all = len(entries)
# decoys = len([i for i in entries if i.clust_id == 0])
# tloops = len([i for i in entries if i.clust_id != 0])
# print(f"All: {all}\tTetraloops: {tloops} ({tloops/all*100:.4})\tDecoys: {decoys} ({decoys/all*100:.4})")

All: 4991	Tetraloops: 4991 (100.0)	Decoys: 0 (0.0)


In [6]:
def write_csv(path, data): # Data entries should be formatted as (sequence, label) tuple
    """Write (sequence, label) pairs to a two-column CSV file.

    The file starts with the header row ``sequence,label`` and then holds one
    row per entry. Fields containing a comma, a quote or a line break are
    quoted by the csv module so the file always parses back into two columns;
    plain fields are written exactly as they were with manual formatting.

    Args:
        path: Destination file path (str or path-like). An existing file is
            overwritten.
        data: Iterable of indexable entries; ``entry[0]`` is written as the
            sequence and ``entry[1]`` as the label.
    """
    import csv  # local import: keeps this cell independent of the notebook's import cell

    with open(path, "w") as f:
        # lineterminator="\n" keeps the same row endings as the previous
        # hand-written f.write(...) calls (text mode still applies the
        # platform's newline translation, as before).
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(("sequence", "label"))
        for entry in data:
            # Format through an f-string so every value is rendered exactly
            # as it was before (e.g. None is written as "None", not as "").
            writer.writerow((f"{entry[0]}", f"{entry[1]}"))

In [7]:
def split_data(data, train_ratio = 0.8):
    """Slice ``data`` into consecutive train, dev and test portions.

    The first ``train_ratio`` share of the items becomes the training set; the
    remainder is divided between the dev and test sets, with the dev/test
    boundary placed at ``1 - (1 - train_ratio) / 2`` of the length. Boundaries
    are truncated to integers. No shuffling is done here.

    Args:
        data: Sliceable sequence (e.g. a list) of entries.
        train_ratio: Share of the entries assigned to the training portion.

    Returns:
        Tuple ``(train_data, dev_data, test_data)`` of slices of ``data``.
    """
    total = len(data)
    held_out_share = (1-train_ratio)/2

    first_cut = int(total * train_ratio)
    second_cut = int(total * (1-held_out_share))

    return data[:first_cut], data[first_cut:second_cut], data[second_cut:]

In [17]:
# # DNABERT2 finetuning CSV generation
# for task in ["gnra"]:
#     for fragment_length in [8, 10, 12, 14]:
#         for train_ratio in [0.8]:
#             for nucleotides in ["T", "U"]:
#                 data_dir = Path(f"{task}_{fragment_length}_{int(train_ratio*100)}_{nucleotides}/")
#                 if data_dir.exists():
#                     shutil.rmtree(data_dir)
#                 data_dir.mkdir(parents=True, exist_ok=True)
                
#                 # fragment_length
#                 fragments = {8: fragments_8, 10: fragments_10, 12: fragments_12, 14: fragments_14}[fragment_length]

#                 # task
#                 if task == "gnra":
#                     data = [i for i in fragments if i.clust_id != 0]
#                     data = [(i.res_seq, 1) if i.clust_id == 1 else (i.res_seq, 0) for i in fragments]
#                 elif task == "all":
#                     data = [(i.res_seq, i.clust_id) for i in fragments]
#                 random.shuffle(data)
                
#                 # nucleotides
#                 if nucleotides == "T":
#                     data = [(i.replace("U","T"), j) for i, j in data]
                
#                 # split_data(train_ratio)
#                 train_data, dev_data, test_data = split_data(data, train_ratio)

#                 write_csv(data_dir/"train.csv", train_data)
#                 write_csv(data_dir/"dev.csv", dev_data)
#                 write_csv(data_dir/"test.csv", test_data)

In [None]:
# ANN NPZ/NPY data generation
# TODO: this is the .npz data generation

# Placeholders for the train/dev/test splits. The test and train lists are
# overwritten with the encoder's output later in this cell; the dev lists are
# not filled anywhere in this cell.
test_matrices = []; test_labels = []
dev_matrices = []; dev_labels = []
train_matrices = []; train_labels = []

# Read the sequence table (the code below uses its 'RNA_Sequence' and
# 'Cluster_ID' columns).
all_loops = pd.read_csv('14_Data/14_filtered_rna_sequences_with_cluster0.csv', sep=",")

# Remove "," from the RNA sequence. regex=False makes this a literal
# replacement regardless of the pandas version's default.
all_loops['RNA_Sequence'] = all_loops['RNA_Sequence'].str.replace(',', '', regex=False)

#remove non-alphabeticals
#all_loops = all_loops[all_loops['values'].str.isalpha()]
#print(all_loops.head(20))

# Exclude cluster 0. The explicit .copy() gives an independent frame, so adding
# the 'Label' column below does not write into a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
all_loops = all_loops[all_loops['Cluster_ID'] != 0].copy()
print(all_loops.tail(20))
#all_loops.to_csv('14_Data/14_filtered_rna_sequences_without_cluster0.csv')

# Label cluster 1 (GNRA) as 1 and every other remaining cluster as 0.
all_loops['Label'] = all_loops['Cluster_ID'].apply(lambda x: 1 if x == 1 else 0)
print(all_loops.head(20))

# Convert the RNA_Sequence and Label columns to plain Python lists.
seq14_list = all_loops['RNA_Sequence'].values.tolist()
label_list = all_loops['Label'].values.tolist()
# One-hot encode the sequences (no annotation, per the function name) and split them into test and train sets
def make_noAnno_seqmatrix_one_hot(seq_list, labels, seq_len=14, seed=None):
    """One-hot encode sequences and split them into test and train sets.

    Each sequence becomes a ``(seq_len, 4)`` float matrix whose columns stand
    for A, U, C and G (in that order). Any other symbol is printed and its row
    is left all-zero. The encoded (matrix, label) pairs are shuffled, then
    every third pair (shuffled indices 0, 3, 6, ...) goes to the test set and
    the rest to the training set.

    Args:
        seq_list: Iterable of indexable sequences with at least ``seq_len``
            symbols each; symbols beyond ``seq_len`` are ignored.
        labels: Iterable of labels, paired with ``seq_list`` by position
            (pairing stops at the shorter of the two).
        seq_len: Number of leading positions to encode. Defaults to 14.
        seed: If given, the shuffle uses a private ``random.Random(seed)`` so
            the split is reproducible. If ``None`` (default), the module-level
            ``random.shuffle`` is used.

    Returns:
        Tuple ``(test_matrices, test_labels, train_matrices, train_labels)``
        of lists. All four are empty when no sequences are supplied.

    Raises:
        IndexError: If a sequence is shorter than ``seq_len``.
    """
    # Column index of each recognised nucleotide in the one-hot matrix.
    column_of = {'A': 0, 'U': 1, 'C': 2, 'G': 3}

    encoded = []
    for seq, label in zip(seq_list, labels):
        one_hot = np.zeros([seq_len, 4])
        for pos in range(seq_len):
            symbol = seq[pos]
            col = column_of.get(symbol)
            if col is None:
                # Unrecognised symbol: report it and keep the row all-zero.
                print(symbol)
            else:
                one_hot[pos, col] = 1
        encoded.append((one_hot, label))

    # Shuffling the pairs directly avoids the zip(*...) unpacking step, which
    # raised ValueError when there was nothing to unpack (empty input).
    if seed is None:
        random.shuffle(encoded)
    else:
        random.Random(seed).shuffle(encoded)

    test_matrices, test_labels = [], []
    train_matrices, train_labels = [], []
    for i, (matrix, label) in enumerate(encoded):
        # Every third shuffled entry is held out for testing.
        if i % 3 == 0:
            test_matrices.append(matrix)
            test_labels.append(label)
        else:
            train_matrices.append(matrix)
            train_labels.append(label)

    return test_matrices, test_labels, train_matrices, train_labels

# Sizes before the split (these are still the placeholder lists at this point).
print(len(test_matrices), len(test_labels))
print(len(train_matrices), len(train_labels))

# Seed the global RNG so the shuffle inside the encoder, and therefore the
# saved test/train split, is the same on every run.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

test_matrices, test_labels, train_matrices, train_labels = make_noAnno_seqmatrix_one_hot(seq14_list, label_list)

# Sizes after the split.
print(len(test_matrices), len(test_labels))
print(len(train_matrices), len(train_labels))

print(test_labels)
print(set(test_labels))
#print(test_matrices)
print('--------------------')
print(set(train_labels))
print(train_labels)
#print(train_matrices)

# Make sure the output directory exists; np.savez / np.save do not create it.
output_dir = Path('14_Data/GNRA_Prediction')
output_dir.mkdir(parents=True, exist_ok=True)

# The matrices are passed positionally, so each .npz stores them under the
# default key 'arr_0'.
np.savez(output_dir/'noAnno_test_14_nucleotide_array_without_cluster0.npz', np.array(test_matrices))
np.savez(output_dir/'noAnno_train_14_nucleotide_array_without_cluster0.npz', np.array(train_matrices))

np.save(output_dir/'noAnno_test_14_nucleotide_labels_without_cluster0.npy', test_labels)
np.save(output_dir/'noAnno_train_14_nucleotide_labels_without_cluster0.npy', train_labels)
