In [1]:
import numpy as np
import pandas as pd
import random
import shutil
import sys
sys.path.append('..')
import os
import utils

from pathlib import Path
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [12]:
# Paths
# Directory holding the pickled fragment files (fragments_<length>.pickle).
FRAGMENTS_DIR = Path('..', 'data', 'fragments')
# Root directory for the generated datasets (presumably named after the 80/10/10 split -- TODO confirm).
TRAINING_DATA_DIR = Path('811')

# Fragment lengths to process: 8, 10, ..., 24.
FRAGMENTS_RANGE = range(8, 25, 2)

# Seed for the per-label shuffle performed in split_data.
SEED = 42

In [8]:
def split_data(data, train_ratio = 0.8, seed = None):
    """Split (sequence, label) entries into train/dev/test lists, label by label.

    Every label with at least three entries contributes at least one entry to
    each of the three sets; the entries that remain are divided so that
    ``train_ratio`` of them go to train and the rest is shared equally between
    dev and test. Labels with fewer than three entries are dropped entirely.

    Args:
        data: Iterable of entries whose element at index 1 is a hashable label.
        train_ratio: Fraction of each label's remaining entries assigned to train.
        seed: Seed for the per-label shuffle. Defaults to the module-level ``SEED``.

    Returns:
        A ``(train_data, dev_data, test_data)`` tuple of lists.
    """
    if seed is None:
        seed = SEED

    # Group entries by label in a single pass; input order is kept within each label.
    entries_by_label = {}
    for entry in data:
        entries_by_label.setdefault(entry[1], []).append(entry)

    # Visit labels in a deterministic order. Sorting matches the order in which small
    # integer labels used to come out of a set; labels that cannot be ordered fall
    # back to first-seen order.
    try:
        labels = sorted(entries_by_label)
    except TypeError:
        labels = list(entries_by_label)

    # Whatever train does not take is shared equally between dev and test.
    dev_ratio = (1 - train_ratio) / 2

    train_data, dev_data, test_data = [], [], []

    for label in labels:
        label_data = entries_by_label[label]
        # One entry per set cannot be guaranteed with fewer than three entries.
        if len(label_data) < 3:
            continue

        # A fresh generator per label means every label is shuffled from the same seed.
        random.Random(seed).shuffle(label_data)

        # Add at least 1 datapoint per set
        train_data.append(label_data.pop(0))
        dev_data.append(label_data.pop(0))
        test_data.append(label_data.pop(0))

        train_cutoff = int(len(label_data) * train_ratio)
        dev_cutoff = int(len(label_data) * (1 - dev_ratio))

        train_data += label_data[:train_cutoff]
        dev_data += label_data[train_cutoff:dev_cutoff]
        test_data += label_data[dev_cutoff:]

    return train_data, dev_data, test_data


def write_csv(path, data):
    """Write entries to ``path`` as a two-column CSV with a ``sequence,label`` header.

    Args:
        path: Destination file; it is created or overwritten.
        data: Iterable of entries formatted as ``(sequence, label)`` tuples. Only the
            first two elements of each entry are written.

    Fields are written through ``csv.writer`` so that a value containing a comma,
    quote or newline is quoted instead of corrupting the row. Lines always end in
    ``\\n`` and the file is UTF-8, independent of the platform.
    """
    import csv  # local import: keeps this function usable without touching the import cell

    # newline="" hands line-ending control to the csv writer, as the csv module requires.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(("sequence", "label"))
        writer.writerows((entry[0], entry[1]) for entry in data)


def encode_sequence(sequence, residue_map = {'A':0,'U':1,'C':2,'G':3,'I':4,'N':5}):
    """One-hot encode a sequence of residues.

    Args:
        sequence: Iterable of residue symbols; each must be a key of ``residue_map``
            (a missing symbol raises ``KeyError``).
        residue_map: Mapping from residue symbol to column index. The default dict is
            shared between calls but is only read here, never modified.

    Returns:
        Integer array of shape ``(len(sequence), number of distinct map values)`` with
        a single 1 per row, placed in the column given by ``residue_map``.
    """
    n_columns = len(set(residue_map.values()))
    column_of_residue = np.array([residue_map[residue] for residue in sequence], dtype=int)
    # Selecting rows of an identity matrix yields one one-hot row per residue.
    return np.eye(n_columns, dtype=int)[column_of_residue]


def write_feature_matrices(path, data):
    """Encode the sequence of every entry and save the stacked result to ``path``.

    Args:
        path: Destination handed to ``np.savez``.
        data: Iterable of entries whose element at index 0 is the sequence.

    The encoded matrices are combined into one array and passed to ``np.savez`` as a
    single positional argument.
    """
    matrices = []
    for entry in data:
        matrices.append(encode_sequence(entry[0]))
    stacked = np.array(matrices)
    np.savez(path, stacked)


def write_labels(path, data):
    """Save the label (element at index 1) of every entry to ``path`` with ``np.save``.

    Args:
        path: Destination handed to ``np.save``.
        data: Iterable of entries whose element at index 1 is the label.
    """
    label_array = np.asanyarray([entry[1] for entry in data])
    np.save(path, label_array)

In [4]:
# Load fragment data
# Each pickle is bound to a module-level name fragments_<length>, which the
# dataset-generation cell looks up again through globals().
for i in FRAGMENTS_RANGE:
    globals().update({f"fragments_{i}": utils.load(FRAGMENTS_DIR / f"fragments_{i}.pickle")})

In [13]:
for task in ["gnravr", "gnralikevr", "uncgvr", "uncglikevr"]:
    for fragment_length in [8, 10, 12, 14, 16, 18, 20, 22, 24]: # 8, 10, 12, 14, 16, 18, 20, 22, 24
        print(f"Generating {task} {fragment_length}")
        
        # Create/replace existing data directory
        data_dir = TRAINING_DATA_DIR / f"{task}_{fragment_length}/"
        if data_dir.exists():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True, exist_ok=True)
        
        # Retrieve fragments of corresponding length
        fragments = globals()[f'fragments_{fragment_length}']
        
        # Reformat fragment data depending on the task
        match task:
            case "gnra":
                data = [i for i in fragments if i.clust_id != 0]
                data = [(i.res_seq, 1) if i.clust_id == 1 else (i.res_seq, 0) for i in data]
            case "gnravr":
                data = [(i.res_seq, 1) if i.clust_id == 1 else (i.res_seq, 0) for i in fragments]
            case "gnralike":
                data = [i for i in fragments if i.clust_id != 0]
                data = [(i.res_seq, 1) if i.clust_id in [1, 3, 6, 9, 25, 26, 36, 40] else (i.res_seq, 0) for i in data]
            case "gnralikevr":
                data = [(i.res_seq, 1) if i.clust_id in [1, 3, 6, 9, 25, 26, 36, 40] else (i.res_seq, 0) for i in fragments]
            case "uncg":
                data = [i for i in fragments if i.clust_id != 0]
                data = [(i.res_seq, 1) if i.clust_id == 2 else (i.res_seq, 0) for i in data]
            case "uncgvr":
                data = [(i.res_seq, 1) if i.clust_id == 2 else (i.res_seq, 0) for i in fragments]
            case "uncglike":
                data = [i for i in fragments if i.clust_id != 0]
                data = [(i.res_seq, 1) if i.clust_id in [2, 5, 37, 44] else (i.res_seq, 0) for i in data]
            case "uncglikevr":
                data = [(i.res_seq, 1) if i.clust_id in [2, 5, 37, 44] else (i.res_seq, 0) for i in fragments]
            case "tloop":
                data = [(i.res_seq, 0) if i.clust_id == 0 else (i.res_seq, 1) for i in fragments]
            case "folds":
                folds = {"Decoy": [0], "GNRA": [1, 3, 6, 9, 25, 26, 36, 40], "UNCG": [2, 5, 37, 44], "U-TURN": [4], "7": [7], "8": [8], "10": [10], "4-Stack": [11], "12/34": [12, 34], "13/20": [13, 20], "14/19": [14, 19], "15": [15], "16": [16], "17": [17], "18": [18], "RNYA": [21], "22/32/43": [22, 32, 43], "23": [23], "24": [24], "27": [27], "28": [28], "29": [29], "GGUG": [30], "31": [31], "33": [33], "CUUG": [35, 38], "AGNN": [39], "41": [41], "42": [42]} # Clusters corresponding to the same fold, as defined by Bottaro
                data = []
                for idx, (fold, clust_ids) in enumerate(folds.items()):
                    for clust_id in clust_ids:
                        data += [(i.res_seq, idx) for i in fragments if i.clust_id == clust_id]
        
        # Homology reduction
        data = list(set(data))
        
        # Split data into train, dev, and test sets
        train_data, dev_data, test_data = split_data(data)
        
        # Save datasets
        write_csv(data_dir/"train.csv", train_data)
        write_feature_matrices(data_dir/"train_matrices.npz", train_data)
        write_labels(data_dir/"train_labels.npy", train_data)
        
        write_csv(data_dir/"dev.csv", dev_data)
        write_feature_matrices(data_dir/"dev_matrices.npz", dev_data)
        write_labels(data_dir/"dev_labels.npy", dev_data)
        
        write_csv(data_dir/"test.csv", test_data)
        write_feature_matrices(data_dir/"test_matrices.npz", test_data)
        write_labels(data_dir/"test_labels.npy", test_data)

Generating gnravr 8
Generating gnravr 10
Generating gnravr 12
Generating gnravr 14
Generating gnravr 16
Generating gnravr 18
Generating gnravr 20
Generating gnravr 22
Generating gnravr 24
Generating gnralikevr 8
Generating gnralikevr 10
Generating gnralikevr 12
Generating gnralikevr 14
Generating gnralikevr 16
Generating gnralikevr 18
Generating gnralikevr 20
Generating gnralikevr 22
Generating gnralikevr 24
Generating uncgvr 8
Generating uncgvr 10
Generating uncgvr 12
Generating uncgvr 14
Generating uncgvr 16
Generating uncgvr 18
Generating uncgvr 20
Generating uncgvr 22
Generating uncgvr 24
Generating uncglikevr 8
Generating uncglikevr 10
Generating uncglikevr 12
Generating uncglikevr 14
Generating uncglikevr 16
Generating uncglikevr 18
Generating uncglikevr 20
Generating uncglikevr 22
Generating uncglikevr 24
