In [38]:
# Imports
import numpy as np
from typing import List, Tuple
from numpy.typing import ArrayLike

In [68]:
def sample_seqs(seqs: List[str], labels: List[bool]) -> Tuple[List[str], List[bool]]:
    """
    This function should sample the given sequences to account for class imbalance. 
    Consider this a sampling scheme with replacement.
    
    Args:
        seqs: List[str]
            List of all sequences.
        labels: List[bool]
            List of positive/negative labels

    Returns:
        sampled_seqs: List[str]
            List of sampled sequences which reflect a balanced class size
        sampled_labels: List[bool]
            List of labels for the sampled sequences
    """
    pass

def one_hot_encode_seqs(seq_arr: List[str]) -> ArrayLike:
    """
    This function generates a flattened one-hot encoding of a list of DNA sequences
    for use as input into a neural network.

    Args:
        seq_arr: List[str]
            List of sequences to encode.

    Returns:
        encodings: ArrayLike
            Array of encoded sequences, with each encoding 4x as long as the input sequence.
            For example, if we encode:
                A -> [1, 0, 0, 0]
                T -> [0, 1, 0, 0]
                C -> [0, 0, 1, 0]
                G -> [0, 0, 0, 1]
            Then, AGA -> [1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0].
    """
    encoded_allseq=[]
    
    nuc_dict={'A':[1,0,0,0], 
              'T':[0,1,0,0], 
              'C':[0,0,0,1], 
              'G':[0,0,0,1]}
    
    
    for i, seq in enumerate(seq_arr):
    
        encode_per_nuc=([nuc_dict[n] for n in seq.upper() if n in nuc_dict])
        encode_seq= [i for per_nuc in encode_per_nuc for i in per_nuc]
        encoded_allseq.append(encode_seq)

    return(encoded_allseq)

In [69]:
one_hot_encode_seqs(['AGACG', 'ACGCT', 'tggAc'])

[[1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
 [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]]

In [33]:
#read in fasta negatives
neg_seq_list=[]
with open('./data/yeast-upstream-1k-negative.fa', "r") as fasta:
    for seq in fasta:
        seq=seq.strip()
        if not seq.startswith(">"):
            neg_seq_list.append(seq)
            
            
#read in txt file positives
pos_seq_list=[]
with open('./data/rap1-lieb-positives.txt', "rt") as txt:
    for seq in txt:
        pos_seq_list.append(seq.strip())

In [35]:
neg_seq_list[1:5]

['GAAAACGGTATTCGAAGGATTCATAGCAGCTTGATTCTTAGCAGCATCACCAATCAATCT',
 'TTCAGTGTCAGTGAAAGCGACAAAAGATGGAGTGGTTCTGTTACCTTGATCGTTGGCAAT',
 'AATGTCCACACGATCATTAGCAAAGTGAGCAACACACGAGTATGTTGTACCTAAATCAAT',
 'ACCGACAGCTTTTGACATATTATCTGTTATTTACTTGAATTTTTGTTTCTTGTAATACTT']

In [36]:
pos_seq_list[1:5]

['ACACCCAGACATCGGGC',
 'CCACCCGTACCCATGAC',
 'GCACCCATACATTACAT',
 'ACATCCATACACCCTCT']