In [None]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Root directory for the motif-analysis inputs and outputs. Paths in the cells below are
# built from it by plain string concatenation, so the trailing slash must be kept.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/motif_analysis/'

# Seed motifs to sequences

In [83]:
# Width of the sequence window requested around each motif; passed as `seq_len` to write_fasta.
WINDOW_WIDTH = 4096
# Motif length in bases; extract_utr_seq reads this global to locate the motif inside the window.
motif_len = 5

In [93]:
# Load the eCLIP motif table (tab-separated) and key it by motif_id.
motifs_df = (
    pd.read_csv(data_dir + 'eclip.tsv', sep='\t')
    .set_index('motif_id')
)

  motifs_df = pd.read_csv(data_dir + 'eclip.tsv', sep='\t').set_index('motif_id')


In [94]:
# get coordinates of eCLIP peaks for each sequence

utr_table = pd.read_csv(data_dir + '../UTR_coords/GRCh38_3_prime_UTR_clean.bed', sep='\t',
                       usecols=[0,1,2,3,5], names=['chrom','utr_start','utr_end','seq_name','strand']) #absolute coordinates of 3'UTR sequences

# DataFrame.merge on columns ignores the left index and returns a fresh RangeIndex,
# which would silently drop the motif_id index (and turn the FASTA headers written
# later into row numbers). Move motif_id into a column for the merge and restore it
# afterwards. The join keys are still inferred from the shared column names.
motifs_df = motifs_df.reset_index().merge(utr_table, how='left').set_index('motif_id')

In [95]:
import pysam

# pysam handle on ../fasta/hg38.fa; extract_utr_seq fetches its sequence windows from it.
human_fasta = pysam.FastaFile(data_dir + '../fasta/hg38.fa')

In [96]:
def reverse_complement(seq):
    '''
    Take sequence reverse complement

    A/C/G/T are complemented in either case, preserving the case of each base
    (lowercase input previously came back reversed but NOT complemented).
    Any other character (e.g. N) is kept as-is and only changes position.

    seq: iterable of single-character bases (normally a str)
    returns: str, the complemented sequence read in reverse order
    '''
    compl_dict = {'A':'T', 'C':'G', 'G':'C', 'T':'A',
                  'a':'t', 'c':'g', 'g':'c', 't':'a'}
    compl_seq = ''.join(compl_dict.get(base, base) for base in seq)
    return compl_seq[::-1]

In [121]:
def extract_utr_seq(motif, seq_len, rc_on_negative=False, mask_beyond_3utr=False):
    '''
    Fetch the genomic window around one motif occurrence.

    The window covers seq_len//2 bases on each side of an anchor position. The
    anchor is motif.pos on the '+' strand and motif.pos + motif_len otherwise, so
    that in transcript orientation the motif starts at index seq_len//2.

    motif: row-like object exposing chrom, pos, strand, motif, utr_start, utr_end
    seq_len: requested window width (the window spans 2*(seq_len//2) bases)
    rc_on_negative: if True, return the reverse complement for '-' strand motifs
    mask_beyond_3utr: if True, replace bases outside [utr_start, utr_end) with 'N'

    Returns the upper-cased window as a str.

    Raises AssertionError when the expected motif is not found at the centre of
    the window (this also happens if masking overwrites the motif itself).

    NOTE(review): windows running past a chromosome boundary are not handled
    here; behaviour then depends on pysam's fetch - confirm if such motifs exist.
    '''
    half_len = seq_len//2

    if motif.strand=='+':
        pos_rel = motif.pos
    else:
        pos_rel = motif.pos + motif_len

    window_start = pos_rel - half_len
    window_end = pos_rel + half_len

    refseq = human_fasta.fetch(motif.chrom, window_start, window_end).upper()

    if mask_beyond_3utr:
        # explicit dtype so that an empty fetch does not yield a float array
        bases = np.array(list(refseq), dtype='U1')
        # Both mask boundaries are indices relative to window_start. The previous
        # right-hand guard tested motif.pos instead of the anchor, so on the '-'
        # strand a UTR end inside the last motif_len bases was left unmasked.
        utr_first = max(motif.utr_start - window_start, 0)
        utr_stop = max(motif.utr_end - window_start, 0)
        bases[:utr_first] = 'N'
        bases[utr_stop:] = 'N'  # no-op when the UTR reaches the window end
        refseq = ''.join(bases)

    on_negative = motif.strand=='-'

    #for genes on the negative strand, take reverse complement
    if rc_on_negative and on_negative:
        refseq = reverse_complement(refseq)

    # The motif is always checked in transcript orientation, whichever orientation is returned
    if on_negative and not rc_on_negative:
        transcript_seq = reverse_complement(refseq)
    else:
        transcript_seq = refseq

    assert transcript_seq[half_len:half_len+motif_len]==motif.motif, \
        'motif not found at the centre of the extracted window'

    return refseq

In [None]:
# Optional smoke test (kept commented out): run extract_utr_seq on a random sample of 1000 motifs before the full export.
#mask_beyond_3utr =True
#rc_on_negative=False
#for motif_id, motif in tqdm(motifs_df.sample(n=1000,random_state=1).iterrows(), total=1000):
#            refseq = extract_utr_seq(motif, WINDOW_WIDTH, rc_on_negative,mask_beyond_3utr)

In [38]:
def write_fasta(fasta_path, seq_len, rc_on_negative,mask_beyond_3utr, line_width=80):
    '''
    Write one FASTA record per row of the global motifs_df.

    Each record header is the row's index label and the sequence is the window
    returned by extract_utr_seq, broken into lines of line_width characters.

    fasta_path: output file path (overwritten if it exists)
    seq_len, rc_on_negative, mask_beyond_3utr: forwarded to extract_utr_seq
    line_width: characters per sequence line (default 80, standard FASTA width)

    Returns the number of records written.
    '''
    n_records = 0

    with open(fasta_path, 'w') as f:
        for motif_id, motif in tqdm(motifs_df.iterrows(), total=len(motifs_df)):
            refseq = extract_utr_seq(motif, seq_len, rc_on_negative,mask_beyond_3utr)
            f.write(f'>{motif_id}\n')
            # Fixed-width slicing instead of the never-imported `wrap`: same lines
            # for a whitespace-free sequence, without needing an extra import.
            for line_start in range(0, len(refseq), line_width):
                f.write(refseq[line_start:line_start+line_width]+'\n')
            n_records += 1
    return n_records

In [39]:
# Export the motif windows twice (requires `os`, imported in the first cell).
os.makedirs(data_dir + 'fasta', exist_ok=True)

# 1) forward genomic strand for every motif, no masking outside the 3'UTR
N_pairs = write_fasta(data_dir + 'fasta/motifs_dna_fwd.fa', WINDOW_WIDTH,
                      rc_on_negative=False, mask_beyond_3utr=False)
print(f'{N_pairs} pairs written')

# 2) transcript orientation (reverse complement on '-' strand), masked outside the 3'UTR
N_pairs = write_fasta(data_dir + 'fasta/motifs_rna.fa', WINDOW_WIDTH,
                      rc_on_negative=True, mask_beyond_3utr=True)
print(f'{N_pairs} pairs written')

100%|██████████| 50228/50228 [06:48<00:00, 122.96it/s]


50228 pairs written


100%|██████████| 50228/50228 [04:13<00:00, 198.33it/s]


50228 pairs written


In [40]:
! samtools faidx {data_dir}/selected/variants_rna.fa

In [41]:
! samtools faidx {data_dir}/selected/variants_dna_fwd.fa