In [5]:
import pickle
import os
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from glob import glob

In [6]:
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

In [9]:
def read_fasta(fasta):

    seqs = defaultdict(str)
    
    with open(fasta, 'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_name = line[1:].rstrip()
            else:
                seqs[seq_name] += line.rstrip().upper()
    return seqs

def reverse_complement(seq):
    '''
    Take sequence reverse complement
    '''
    compl_dict = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
    compl_seq = ''.join([compl_dict.get(x,x) for x in seq])
    rev_seq = compl_seq[::-1]
    return rev_seq

In [8]:
# all 3'UTR FASTA alignments are distributed across subfolders
# find all FASTA files and their subfolders

fasta_path = data_dir + 'fasta/aligned_3UTR/fa/'

fasta_dirs = []

for file in glob(fasta_path + '**/*.fa', recursive=True):
    relative_path = os.path.relpath(file, fasta_path)
    folder, seq_id = relative_path.split('/')
    seq_id = seq_id.replace('.fa','')
    fasta_dirs.append((folder,seq_id))
    
fasta_dirs = pd.DataFrame(fasta_dirs, columns=['folder','seq_name']).set_index('seq_name').squeeze()

fasta_dirs.head()

seq_name
ENST00000394232.6_utr3_12_0_chr15_91292553_f    18
ENST00000291442.4_utr3_3_0_chr19_17231883_r     18
ENST00000310981.6_utr3_0_0_chr2_127702177_f     18
ENST00000418260.3_utr3_1_0_chr12_31615023_r     18
ENST00000380985.10_utr3_12_0_chr5_65822916_f    18
Name: folder, dtype: object

In [24]:
all_seqs, all_counts = [], []

for seq_name, subdir in tqdm(fasta_dirs.items(), total=len(fasta_dirs)):
    
    file_path = fasta_path + subdir + '/' + seq_name + '.fa'

    fasta = read_fasta(file_path)
    
    fasta_seqs = np.array([list(seq) for seq in fasta.values()])

    seq_counts = np.array([np.sum(fasta_seqs == nt,0) for nt in ['A','C','G','T']]).T

    homo_sapiens_seq = ''.join(fasta_seqs[0]).upper()
    
    all_counts.append(seq_counts)
    all_seqs.append(homo_sapiens_seq)

100%|██████████| 18178/18178 [30:00<00:00, 10.09it/s] 


In [41]:
all_probs = [x/x.sum(1, keepdims=True) for x in all_counts]

In [42]:
output_dir = data_dir + f'/human_3utr/probs/zoo-al/'

os.makedirs(output_dir, exist_ok=True)

with open(output_dir + 'predictions.pickle', 'wb') as f:
    pickle.dump({'seq_names':fasta_dirs.index.tolist(), 'seqs':all_seqs, 'probs':all_probs, 'counts':all_counts},f)