In [1]:
import numpy as np
import pandas as pd
import random
import shutil
import sys
sys.path.append('..')
import os
import utils

from pathlib import Path

In [2]:
# Paths
# Both paths are relative, so they resolve against the kernel's working directory.
# Directory containing the filtered fragment pickles read by the loading cells.
fragments_dir = Path('../data/fragments/')
# Root directory under which each generated dataset gets its own sub-directory.
training_data_dir = Path('sample_data')

In [24]:
def split_data(data, train_ratio = 0.8, seed = None):
    """Split (sequence, label) entries into train, dev and test sets, label by label.

    Every label with at least three entries contributes one entry to each set
    first; its remaining entries are then divided so that roughly
    `train_ratio` of them go to train and the rest is shared equally between
    dev and test. Labels with fewer than three entries are left out of all
    three sets, because they cannot be represented in each of them.

    Args:
        data: Iterable of entries whose second element (`entry[1]`) is a
            hashable label. The input is not modified.
        train_ratio: Fraction (between 0 and 1) of each label's remaining
            entries assigned to the training set.
        seed: Optional seed. When given, a private `random.Random(seed)` is
            used so the same input order always yields the same split. When
            None, the module-level `random` state is used, as before.

    Returns:
        Tuple `(train_data, dev_data, test_data)` of lists of entries.

    Raises:
        ValueError: If `train_ratio` is outside the range [0, 1].
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    rng = random if seed is None else random.Random(seed)

    # Group entries by label in one pass. Dict insertion order (first
    # occurrence of each label) makes the processing order depend only on the
    # order of `data`, not on set iteration order.
    entries_by_label = {}
    for entry in data:
        entries_by_label.setdefault(entry[1], []).append(entry)

    train_data, dev_data, test_data = [], [], []
    dev_ratio = (1 - train_ratio) / 2

    # Ensure all labels found in each set (when possible)
    for label_data in entries_by_label.values():
        if len(label_data) < 3:
            continue

        rng.shuffle(label_data)

        # Add at least 1 datapoint per set
        train_data.append(label_data[0])
        dev_data.append(label_data[1])
        test_data.append(label_data[2])
        remainder = label_data[3:]

        # Cut-offs are truncated towards zero, so small groups favour the test set
        train_cutoff = int(len(remainder) * train_ratio)
        dev_cutoff = int(len(remainder) * (1 - dev_ratio))

        train_data += remainder[:train_cutoff]
        dev_data += remainder[train_cutoff:dev_cutoff]
        test_data += remainder[dev_cutoff:]

    return train_data, dev_data, test_data


def write_csv(path, data): # Data entries should be formatted as (sequence, label) tuple
    """Write the entries to `path` as a two-column CSV file.

    The file starts with the header line `sequence,label`, followed by one
    line per entry built from the entry's first and second elements. Values
    are written as-is (no quoting or escaping). An existing file is
    overwritten.
    """
    with open(path, "w") as handle:
        handle.write("sequence,label\n")
        handle.writelines(f"{entry[0]},{entry[1]}\n" for entry in data)


def encode_sequence(sequence, residue_map = {'A':0,'U':1,'T':1,'C':2,'G':3,'I':4}): # TODO should I be included?
    """One-hot encode a sequence of residues.

    Args:
        sequence: Iterable of residue symbols (e.g. a string).
        residue_map: Mapping from residue symbol to column index. The default
            dict is shared between calls; it is only read here, never
            modified. Symbols missing from the map are skipped, so the result
            can have fewer rows than `sequence` has symbols.

    Returns:
        Integer array of shape `(n_kept, n_columns)` with a single 1 per row.
        `n_columns` is 5 for the default map and grows only when the map
        contains an index above 4.
    """
    # Keep the 5-column layout, but widen it for maps with indices above 4,
    # which would otherwise index past the last column.
    n_columns = max(5, max(residue_map.values(), default=-1) + 1)

    # Explicit integer dtype: an empty list would otherwise become a float64
    # array, which NumPy refuses to use as an index (IndexError).
    seq_array = np.array([residue_map[i] for i in sequence if i in residue_map], dtype=int)

    encoded_array = np.zeros((seq_array.size, n_columns), dtype=int)
    encoded_array[np.arange(seq_array.size), seq_array] = 1
    return encoded_array


def write_matrix(path, data):
    """Encode the sequence of every entry and save the result with `np.savez`.

    Each entry's first element is passed to `encode_sequence`; the resulting
    matrices are combined with `np.array` and written to `path` as a single
    positional array (NumPy stores positional arrays under the key `arr_0`).
    """
    encoded_matrices = []
    for entry in data:
        encoded_matrices.append(encode_sequence(entry[0]))
    stacked = np.array(encoded_matrices)
    np.savez(path, stacked)


def write_label(path, data):
    """Save the label (second element) of every entry to `path` with `np.save`.

    Labels are written in the same order as the entries in `data`.
    """
    labels = []
    for entry in data:
        labels.append(entry[1])
    np.save(path, labels)

In [13]:
# Load fragment data (single cluster)
# One variable per fragment length; names and file names are paired by position.
# NOTE(review): utils.load presumably unpickles these files - only load trusted data.
(
    fragments_8,
    fragments_10,
    fragments_12,
    fragments_14,
    fragments_16,
    fragments_18,
    fragments_20,
    fragments_22,
    fragments_24,
) = (
    utils.load(fragments_dir/pickle_name)
    for pickle_name in (
        'fragments_8_filtered.pickle',
        'fragments_10_filtered.pickle',
        'fragments_12_filtered.pickle',
        'fragments_14_filtered.pickle',
        'fragments_16_filtered.pickle',
        'fragments_18_filtered.pickle',
        'fragments_20_filtered.pickle',
        'fragments_22_filtered.pickle',
        'fragments_24_filtered.pickle',
    )
)

In [14]:
for task in ["clusters"]: # "gnra", "clusters", "tloop"
    for fragment_length in [8]: # 8, 10, 12, 14, 16, 18, 20, 22, 24
    #// for train_ratio in [0.8]:
    #// for nucleotides in ["T", "U"]:

        # Create/replace existing data directory
        data_dir = training_data_dir / f"{task}_{fragment_length}/" #//_{int(train_ratio*100)}_{nucleotides}
        if data_dir.exists():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True, exist_ok=True)
        
        # Retrieve fragments of corresponding length
        fragments = globals()[f'fragments_{fragment_length}']

        # Reformat fragment data depending on the task
        match task:
            case "gnra":
                data = [i for i in fragments if i.clust_id != 0]
                data = [(i.res_seq, 1) if i.clust_id == 1 else (i.res_seq, 0) for i in data]
            case "tloop":
                data = [(i.res_seq, 0) if i.clust_id == 0 else (i.res_seq, 1) for i in fragments]
            case "clusters":
                folds = {"GNRA": [1, 3, 6, 9, 25, 26, 36, 40], "UNCG": [2, 5, 37, 44], "U-TURN": [4], "7": [7], "8": [8], "10": [10], "4-Stack": [11], "12/34": [12, 34], "13/20": [13, 20], "14/19": [14, 19], "15": [15], "16": [16], "17": [17], "18": [18], "RNYA": [21], "22/32/43": [22, 32, 43], "23": [23], "24": [24], "27": [27], "28": [28], "29": [29], "GGUG": [30], "31": [31], "33": [33], "CUUG": [35, 38], "AGNN": [39], "41": [41], "42": [42]} # Clusters corresponding to the same fold, as defined by Bottaro
                data = []
                for idx, (fold, clust_ids) in enumerate(folds.items()):
                    for clust_id in clust_ids:
                        data += [(i.res_seq, idx) for i in fragments if i.clust_id == clust_id]
                #// Clusters 23 - 44 lumped into one group under label "23"
                #// data = [(i.res_seq, i.clust_id) if i.clust_id not in range(23, 45) else (i.res_seq, 23) for i in fragments]
            #// case "gnravall":
            #//     data = [i for i in fragments if i.clust_id != 0]
            #//     data = [(i.res_seq, 1) if i.clust_id == 1 else (i.res_seq, 0) for i in fragments]
        
        # Homology reduction
        data = list(set(data))
        
        #// # Replace T with U
        #// if nucleotides == "T":
        #//     data = [(i.replace("U","T"), j) for i, j in data]

        # Split data into train, dev, and test sets
        train_data, dev_data, test_data = split_data(data) #//train_ratio
        
        # Save datasets
        write_csv(data_dir/"train.csv", train_data)
        write_matrix(data_dir/"train_matrices.npz", train_data)
        write_label(data_dir/"train_labels.npy", train_data)

        write_csv(data_dir/"dev.csv", dev_data)
        write_matrix(data_dir/"dev_matrices.npz", dev_data)
        write_label(data_dir/"dev_labels.npy", dev_data)

        write_csv(data_dir/"test.csv", test_data)
        write_matrix(data_dir/"test_matrices.npz", test_data)
        write_label(data_dir/"test_labels.npy", test_data)

In [27]:
# Load fragment data (multiple clusters)
# One variable per fragment length; names and file names are paired by position.
# NOTE(review): utils.load presumably unpickles these files - only load trusted data.
(
    fragments_multi_8,
    fragments_multi_24,
    fragments_multi_48,
) = (
    utils.load(fragments_dir/pickle_name)
    for pickle_name in (
        'fragments_multi_8_filtered.pickle',
        'fragments_multi_24_filtered.pickle',
        'fragments_multi_48_filtered.pickle',
    )
)

In [28]:
for fragment_length in [8, 24, 48]: # Lengths for which a fragments_multi_<length> variable is loaded

    # Retrieve fragments of corresponding length
    # (looked up by variable name, so the matching fragments_multi_<length> must already be loaded)
    fragments = globals()[f'fragments_multi_{fragment_length}']

    data = [(i.res_seq, i.clust_id) for i in fragments]

    # Homology reduction (drops duplicate (sequence, label) pairs)
    # NOTE(review): set iteration order can differ between kernel sessions, so the
    # split is not reproducible as-is; sort here once the type of clust_id in the
    # multi-cluster data is confirmed to be orderable.
    data = list(set(data))

    # Split data into train, dev, and test sets
    train_data, dev_data, test_data = split_data(data)

    # Create/replace existing data directory.
    # Done only once the data is ready, so a failure above does not wipe a previous export.
    data_dir = training_data_dir / f"multi_{fragment_length}/"
    if data_dir.exists():
        shutil.rmtree(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    # Save datasets.
    # Labels are saved with write_label as *_labels.npy. The previous code called
    # write_matrix for the label files, so *_labels.npz held a second copy of the
    # encoded sequences and the labels were never saved as an array.
    # NOTE(review): anything that reads *_labels.npz must switch to *_labels.npy.
    for split_name, split in (("train", train_data), ("dev", dev_data), ("test", test_data)):
        write_csv(data_dir/f"{split_name}.csv", split)
        write_matrix(data_dir/f"{split_name}_matrices.npz", split)
        write_label(data_dir/f"{split_name}_labels.npy", split)