# =================== Import ===================

# In [None]:
import os.path
import numpy as np
from Bio import SeqIO

# =================== Variables ===================

# In [None]:
final_prosite = os.path.abspath("Final_Files/Final_PROSITE.fasta")

# =================== Functions ===================

# In [None]:
def vectors(final_prosite):
    """Parse a FASTA data set into per-protein NumPy arrays.

    Every record is read with ``SeqIO.parse(..., 'fasta')``. Records whose
    sequence length is exactly 13100 are skipped. The record id is then
    decomposed as follows:

    * cluster: the second ``'_'``-separated field of the id, cast to ``int``;
    * label: the third ``'|'``-separated field of the id, truncated at its
      first ``'_'``; ``'Negative'`` maps to 0, anything else maps to 1.

    Args:
        final_prosite: Path of the FASTA file to read.

    Returns:
        A tuple ``(id_vector, seq_vector, lenght_vector, cluster_vector,
        label_vector)`` of arrays with one entry per retained record:
        record ids, sequences as strings, sequence lengths, integer cluster
        numbers and binary labels.

    Raises:
        IndexError: If a record id lacks the expected ``'_'`` / ``'|'``
            separated fields.
        ValueError: If the cluster field of an id is not an integer.
    """
    id_list = []
    seq_list = []
    length_list = []

    # The context manager guarantees the handle is closed, even when
    # parsing raises part-way through the file.
    with open(final_prosite) as sequence_prosite:
        for seq_record in SeqIO.parse(sequence_prosite, 'fasta'):
            if len(seq_record.seq) == 13100:  # skip proteins of length 13100
                continue
            id_list.append(seq_record.id)
            seq_list.append(str(seq_record.seq))
            length_list.append(len(seq_record))

    cluster_list = [int(seq_id.split('_')[1]) for seq_id in id_list]

    # Convert the textual label to a binary output: 'Negative' -> 0, else 1.
    label_list_int = [
        0 if seq_id.split('|')[2].split('_')[0] == 'Negative' else 1
        for seq_id in id_list
    ]

    # Turn the lists into arrays.
    id_vector = np.asanyarray(id_list)
    seq_vector = np.asanyarray(seq_list)
    lenght_vector = np.asarray(length_list)
    cluster_vector = np.asarray(cluster_list)
    label_vector = np.asarray(label_list_int)

    # Report the longest retained sequence; max() of an empty array would
    # raise, so the report is skipped when no record was kept.
    if lenght_vector.size:
        print(max(lenght_vector))
    return id_vector, seq_vector, lenght_vector, cluster_vector, label_vector


def partition_assignment(cluster_vector, label_vector, n_partitions=5,
                         n_class=2):
    """Assign whole clusters to partitions while balancing class counts.

    Clusters are visited in ascending order of their number. All proteins
    of a cluster go to the same partition. For each cluster, the partition
    chosen is the one minimising, summed over classes, the ratio
    ``current_count / (current_count + cluster_count)``; ties go to the
    lowest partition index (``np.argmin`` behaviour).

    Args:
        cluster_vector: 1-D array with the cluster number of every protein.
        label_vector: 1-D array, aligned with ``cluster_vector``, holding
            the class of every protein. Labels are used as column indices,
            so they must lie in ``range(n_class)``.
        n_partitions: Number of partitions to create (default 5).
        n_class: Number of distinct classes (default 2).

    Returns:
        A tuple ``(cl_number, partition_matrix)``:

        * ``cl_number``: float array with one entry per protein giving the
          index of the partition it was assigned to.
        * ``partition_matrix``: float array of shape
          ``(n_partitions, n_class)`` with the number of proteins of each
          class in each partition.
    """
    # Unique cluster numbers (sorted by np.unique).
    u_cluster = np.unique(cluster_vector)

    # Counts start at one rather than zero so that the ratio computed below
    # never divides by zero; the offset is removed before returning.
    loc_number = np.ones((n_partitions, n_class))
    cl_number = np.zeros(cluster_vector.shape[0])

    for i in u_cluster:
        # Extract the labels of the proteins in this cluster.
        positions = np.where(cluster_vector == i)
        cl_labels = label_vector[positions]

        # Count how many proteins of each class the cluster holds.
        u, count = np.unique(cl_labels, return_counts=True)
        u = u.astype(np.int32)

        # Hypothetical counts if the cluster were added to every partition.
        temp_loc_number = np.copy(loc_number)
        temp_loc_number[:, u] += count

        # The smaller the ratio, the more the partition would grow in
        # relative terms, i.e. the emptier it currently is for these classes.
        loc_per = loc_number / temp_loc_number
        best_group = np.argmin(np.sum(loc_per, axis=1))
        loc_number[best_group, u] += count

        # Store the selected partition for every protein of the cluster.
        cl_number[positions] = best_group

    # Remove the initial offset of one to obtain the real counts.
    partition_matrix = loc_number.astype(np.int32) - np.ones(loc_number.shape)
    return cl_number, partition_matrix


def encode_aa(protein):
    """Encode a protein sequence as a list of integer residue indices.

    Each residue is replaced by its position in the alphabet
    ``'ARNDCQEGHILKMFPSTWYV'`` (0-19). Any symbol that is not in that
    alphabet, including lowercase letters, is encoded as 20.

    Args:
        protein: Iterable of single-character residue symbols, typically a
            string.

    Returns:
        A list of ints with one entry per residue of ``protein``.
    """
    aminoacids = 'ARNDCQEGHILKMFPSTWYV'
    # One dict lookup per residue instead of two linear scans of a list.
    aa_index = {aa: position for position, aa in enumerate(aminoacids)}
    unknown = len(aminoacids)  # index 20 marks residues outside the alphabet
    return [aa_index.get(aa, unknown) for aa in protein]
        

def save_to_file(seq_vector):
    """Encode the sequences and save the data set to ``ER_dataset.npz``.

    Every sequence is encoded with ``encode_aa`` and stored under the key
    ``input``. The keys ``length``, ``label`` and ``partition`` are filled
    from the module-level names ``lenght_vector``, ``label_vector`` and
    ``cl_number``.

    NOTE(review): those three names are not parameters; they must already
    exist as globals when this function is called, otherwise a
    ``NameError`` is raised.

    Args:
        seq_vector: Iterable of protein sequences.

    Returns:
        The name of the file that was written (``'ER_dataset.npz'``).
    """
    list_proteins = [encode_aa(seq) for seq in seq_vector]

    # Sequences of different lengths form a ragged nested list. Recent NumPy
    # versions refuse to convert that implicitly, so the ragged case is
    # stored as an explicit 1-D object array holding one list per protein.
    # Equal-length input is passed through untouched and still becomes a
    # regular 2-D integer array.
    if len({len(encoded) for encoded in list_proteins}) > 1:
        encoded_input = np.empty(len(list_proteins), dtype=object)
        for position, encoded in enumerate(list_proteins):
            encoded_input[position] = encoded
        # NOTE(review): object arrays are pickled inside the archive, so
        # readers presumably need np.load(..., allow_pickle=True) - confirm.
    else:
        encoded_input = list_proteins

    save_file = 'ER_dataset.npz'
    np.savez_compressed(save_file,
                        input=encoded_input,
                        length=lenght_vector,
                        label=label_vector,
                        partition=cl_number)
    return save_file