In [1]:
import numpy as np
import pandas as pd
from textwrap import wrap
from tqdm import tqdm
import pysam

In [2]:
def reverse_complement(seq):
    '''
    Return the reverse complement of a nucleotide sequence

    A/C/G/T are complemented with their case preserved;
    any other character (e.g. N) is passed through unchanged
    '''
    pairing = dict(zip('ACGTacgt', 'TGCAtgca'))
    complemented = ''.join(pairing.get(base, base) for base in seq)
    return complemented[::-1]

In [3]:
# Base directory (note the trailing slash): all paths in this notebook are built from it by string concatenation
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/mpra/siegel_2022/'

In [28]:
cell_type = 'Beas2B' # 'Beas2B' or 'Jurkat'; selects the input sequence_level_data_{cell_type}.csv and names the output {cell_type}.tsv

In [29]:
# Load the sequence-level data table for the selected cell type.
# NB: sequences on the negative strand are already reverse-complemented in this table!

mpra_df = pd.read_csv(data_dir + f'siegel_supplemantary/sequence_level_data_{cell_type}.csv')

In [30]:
mpra_df = (
    mpra_df[mpra_df.parent_control_oligo.notna()]  # remove variants with unknown reference (only 20)
    .assign(gene=lambda df: df['region'].apply(lambda region: region.split('|')[0]))  # gene = text before the first '|'
    .drop(columns='region')
)

In [31]:
# mapping between oligo regions and 3'UTR coordinates;
# only regions lying entirely within their 3'UTR are kept, exact duplicate rows removed
regions_utr_map = (
    pd.read_csv(
        data_dir + 'preprocessing/regions_hg38/regions_3UTR_GRCh38.bed',
        sep='\t',
        names=['chrom', 'oligo_start', 'oligo_end', 'ids', 'utr_start', 'utr_end', 'utr_name', 'strand', 'gene'],
    )
    .loc[lambda bed: (bed.oligo_start >= bed.utr_start) & (bed.oligo_end <= bed.utr_end)]
    .drop_duplicates()
)

In [32]:
# attach the 3'UTR annotation (joined on the columns both tables share), then drop rows without a 3'UTR match
mpra_df = mpra_df.merge(regions_utr_map, how='left').dropna(subset=['utr_name'])

In [33]:
# a variant may map to several 3'UTRs: order rows by UTR length (longest first) and keep one row per id
mpra_df = (
    mpra_df
    .assign(utr_len=lambda df: df.utr_end - df.utr_start)
    .sort_values(by='utr_len', ascending=False)
    .drop_duplicates(subset=['ids'], keep='first')
)

In [34]:
# Open the hg38 genome FASTA; used below to fetch reference sequence by coordinates
human_fasta = pysam.FastaFile(data_dir + '../../fasta/hg38.fa')

In [35]:
# control (reference) oligos, keyed by oligo id
ref_df = mpra_df[mpra_df.iscontrol==1].set_index('ids')

# ids of control oligos whose sequence is identical to the genome at their coordinates
checked_ids = []

for oligo_id,oligo in tqdm(ref_df.iterrows(),total=len(ref_df)):
    refseq = human_fasta.fetch(oligo.chrom, oligo.oligo_start, oligo.oligo_end).upper()
    if oligo.strand=='-':
        #oligo sequences of negative-strand genes are stored reverse-complemented
        refseq = reverse_complement(refseq)
    #compare the whole strings: a per-base zip() stops at the shorter sequence,
    #so an oligo whose length differs from the fetched interval would pass unnoticed
    if oligo.seq==refseq:
        checked_ids.append(oligo_id)

100%|██████████| 11411/11411 [00:01<00:00, 8999.51it/s]


In [36]:
# sanity check: every oligo id should occur exactly once after keeping one 3'UTR per id
mpra_df['ids'].is_unique

True

In [37]:
# alternative oligos whose parent control passed the genome check
alt_df = mpra_df.loc[lambda df: (df.iscontrol == 0) & df.parent_control_oligo.isin(checked_ids)]

# keep the verified controls only, then pick the parent row of every alternative, in alt_df order;
# after this ref_df and alt_df have the same length
ref_df = ref_df.loc[checked_ids].loc[alt_df.parent_control_oligo].reset_index()

In [38]:
# alternatives take the even labels 0, 2, 4, ...
alt_df.index = pd.RangeIndex(start=0, stop=2 * len(alt_df), step=2)

# ... and references the odd labels 1, 3, 5, ...
ref_df.index = pd.RangeIndex(start=1, stop=2 * len(ref_df) + 1, step=2)

In [39]:
# copy the issnp flag of each alternative onto the reference row at the same position
ref_df['issnp'] = alt_df['issnp'].values

In [40]:
# alt rows carry even index labels and ref rows odd ones, so sorting by index yields alt, ref, alt, ref, ...
mpra_df = pd.concat([alt_df,ref_df]).sort_index() #now alternative and reference sequences alternate

In [41]:
# save the alt/ref table as tab-separated text, without the index column
mpra_df.to_csv(data_dir + f'/preprocessing/{cell_type}.tsv', sep='\t', index=False)

# Seed oligos to sequences

In [45]:
# Passed as seq_len when writing the FASTA files below: target length of the genomic window around each oligo;
# None writes the bare oligo without flanks
WINDOW_WIDTH = None

In [46]:
# NOTE(review): the input is hard-coded to the Jurkat table and does not follow cell_type - confirm this is intended
oligos_df = pd.read_csv(data_dir + '/preprocessing/Jurkat.tsv', sep='\t')

In [47]:
# one row per oligo id (the first occurrence is kept), keyed by id
oligos_df = oligos_df.drop_duplicates(subset=['ids'], keep='first').set_index('ids')

In [48]:
def extract_oligo_seq(oligo, seq_len, 
                      rc_on_negative=False, seq_clip_3utr=False,oligo_to_lower=False,
                      fasta=None):
    '''
    Build the sequence for one oligo, optionally padded with genomic flanks

    Parameters:
        oligo: row-like object with attributes seq and strand; when seq_len is given
               also chrom, oligo_start, oligo_end (and utr_start, utr_end if seq_clip_3utr)
        seq_len: target window length, None returns the oligo without flanks.
                 Each side is padded by (seq_len - len(oligo.seq)) // 2 bases, so an odd
                 difference gives seq_len - 1 bases in total; a window shorter than the
                 oligo gives no flanks
        rc_on_negative: reverse complement the final sequence for oligos on the '-' strand
        seq_clip_3utr: do not let the flanks extend beyond [utr_start, utr_end]
        oligo_to_lower: lower-case the oligo part, the fetched flanks are upper-cased
        fasta: object with a pysam-style fetch(chrom, start, end) method;
               defaults to the notebook-level human_fasta

    Returns:
        left flank + oligo + right flank as a single string
    '''

    oligo_seq = oligo.seq

    if oligo.strand == '-':
        #initial sequences are reverse complemented for negative genes, we reverse complement them back to match DNA
        oligo_seq = reverse_complement(oligo_seq)

    flanking_left_seq = ''
    flanking_right_seq = ''

    if seq_len is not None:

        if fasta is None:
            fasta = human_fasta #genome handle opened at notebook level

        #a window shorter than the oligo gets no flanks instead of an inverted fetch interval
        flank = max(0, (seq_len - len(oligo_seq)) // 2)

        left = max(0, oligo.oligo_start - flank) #do not run past the chromosome start
        right = oligo.oligo_end + flank

        if seq_clip_3utr:
            left = max(left, oligo.utr_start)
            right = min(right, oligo.utr_end)

        #clipping may only shorten the flanks, the oligo itself is always kept whole
        left = min(left, oligo.oligo_start)
        right = max(right, oligo.oligo_end)

        if left < oligo.oligo_start:
            flanking_left_seq = fasta.fetch(oligo.chrom, left, oligo.oligo_start).upper()
        if right > oligo.oligo_end:
            flanking_right_seq = fasta.fetch(oligo.chrom, oligo.oligo_end, right).upper()

    if oligo_to_lower:
        oligo_seq = oligo_seq.lower()

    seq = flanking_left_seq + oligo_seq + flanking_right_seq

    #for genes on the negative strand, take reverse complement
    if rc_on_negative and oligo.strand=='-':
        seq = reverse_complement(seq)

    return seq

In [49]:
def write_fasta(fasta_path, **kwargs):
    '''
    Write one FASTA record per row of the notebook-level oligos_df

    The record name is the row index, the sequence comes from
    extract_oligo_seq(row, **kwargs) and is wrapped at 80 characters per line.
    Returns the number of records written
    '''
    n_records = 0

    with open(fasta_path, 'w') as fasta_out:
        for record_id, oligo_row in tqdm(oligos_df.iterrows(), total=len(oligos_df)):
            sequence = extract_oligo_seq(oligo_row, **kwargs)
            fasta_out.write(f'>{record_id}\n')
            #wrap the sequence with standard FASTA width
            fasta_out.writelines(chunk + '\n' for chunk in wrap(sequence, 80))
            n_records += 1

    return n_records

In [50]:
# Open the hg38 genome FASTA; extract_oligo_seq reads flanking sequence from this handle
human_fasta = pysam.FastaFile(data_dir + '../../fasta/hg38.fa')

In [51]:
# the oligo is lower-cased only when a window (and hence flanks) is requested
oligo_to_lower = WINDOW_WIDTH is not None

# oligos as they appear on the forward genomic strand, flanks not clipped
N_pairs = write_fasta(
    data_dir + 'fasta/variants_dna_fwd.fa',
    seq_len=WINDOW_WIDTH,
    rc_on_negative=False,
    seq_clip_3utr=False,
    oligo_to_lower=oligo_to_lower,
)

print(f'{N_pairs} pairs written')

# reverse-complemented for negative-strand genes, flanks clipped to the 3'UTR
N_pairs = write_fasta(
    data_dir + 'fasta/variants_rna.fa',
    seq_len=WINDOW_WIDTH,
    rc_on_negative=True,
    seq_clip_3utr=True,
    oligo_to_lower=oligo_to_lower,
)

print(f'{N_pairs} pairs written')

100%|██████████| 35285/35285 [01:43<00:00, 341.51it/s]


35285 pairs written


100%|██████████| 35285/35285 [00:30<00:00, 1141.02it/s]


35285 pairs written


In [52]:
# Index variants_rna.fa with samtools faidx ({data_dir} is interpolated by IPython)
! samtools faidx {data_dir}/fasta/variants_rna.fa

In [53]:
# Index variants_dna_fwd.fa with samtools faidx ({data_dir} is interpolated by IPython)
! samtools faidx {data_dir}/fasta/variants_dna_fwd.fa