In [19]:
import numpy as np
import pandas as pd
from textwrap import wrap
from tqdm import tqdm

In [20]:
# Root directory for every input and output of this notebook. Paths are built by plain
# string concatenation (data_dir + 'data/...'), so the trailing '/' is required.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/siegel_2022/'

In [35]:
# Selects the input table sequence_level_data_<cell_type>.csv and names the output <cell_type>.tsv
cell_type = 'Beas2B' #  Beas2B or Jurkat

In [36]:
# Load the sequence-level table for the selected cell type.
# NOTE: sequences on the negative strand are already reverse-complemented in this file!
mpra_df = pd.read_csv(
    filepath_or_buffer=data_dir + f'data/supl/sequence_level_data_{cell_type}.csv',
)

In [37]:
# Keep only rows whose parent (reference) oligo is known;
# per the original note this removes only 20 variants.
mpra_df = mpra_df.dropna(subset=['parent_control_oligo'])

In [38]:
# Mapping between regions and 3'UTR coordinates (tab-separated, no header row)
regions_utr_map = pd.read_csv(data_dir + 'data/regions_hg38/regions_3UTR_GRCh38.bed', sep='\t',
                             names = ['region_start','region_end','ids','utr_start','utr_end','strand'])

# Region should lie entirely within the 3'UTR; exact duplicate rows are collapsed
regions_utr_map = regions_utr_map[(regions_utr_map.region_start>=regions_utr_map.utr_start) &
    (regions_utr_map.region_end<=regions_utr_map.utr_end)].drop_duplicates()

# Remove regions that span multiple UTRs: keep=False drops every row of a repeated id
regions_utr_map = regions_utr_map.drop_duplicates(keep=False,subset='ids')

# Distance to the stop codon, must be below 5000 for MLM.
# '+' strand: measured from the UTR start to the region end;
# any other strand value: measured from the UTR end back to the region start.
# Vectorised with np.where instead of a row-wise apply(axis=1): same values, far less
# overhead, and the assignment also works when the filtered frame is empty.
regions_utr_map['stop_codon_dist'] = np.where(
    regions_utr_map.strand == '+',
    regions_utr_map.region_end - regions_utr_map.utr_start,
    regions_utr_map.utr_end - regions_utr_map.region_start,
)

In [39]:
# Attach stop-codon distance and strand to every oligo (left join on the columns the two
# tables share), then discard oligos that have no strand after the merge.
mpra_df = (
    mpra_df
    .merge(regions_utr_map[['ids', 'stop_codon_dist', 'strand']], how='left')
    .dropna(subset=['strand'])
)

In [40]:
# Split into reference (control) and alternative (variant) oligos
ref_df = mpra_df[mpra_df.iscontrol==1].set_index('ids') #reference id to index
alt_df = mpra_df[mpra_df.iscontrol==0]

# Drop alternatives whose reference did not survive the earlier filters; without this the
# .loc lookup below raises KeyError on the first missing reference id.
alt_df = alt_df[alt_df.parent_control_oligo.isin(ref_df.index)].copy()

ref_df = ref_df.loc[alt_df.parent_control_oligo].reset_index() #get the reference for each alternative

# A duplicated reference id would make .loc return extra rows and break the row-by-row pairing
assert len(ref_df) == len(alt_df), 'reference ids are not unique: ref/alt rows cannot be paired'

#now ref_df and alt_df are of the same length

In [41]:
# Alternatives take the even index positions 0, 2, 4, ...
alt_df.index = pd.RangeIndex(start=0, stop=2 * len(alt_df), step=2)

# References take the odd index positions 1, 3, 5, ...
ref_df.index = pd.RangeIndex(start=1, stop=2 * len(ref_df) + 1, step=2)

In [42]:
# Copy each alternative's issnp flag onto the reference row at the same position
ref_df['issnp'] = alt_df['issnp'].values

In [43]:
# Stack both tables and order rows by index, so that alternative and reference
# sequences alternate in the combined table.
mpra_df = (
    pd.concat(objs=[alt_df, ref_df], axis=0)
    .sort_index()
)

In [44]:
# Save the combined table as a tab-separated file named after the cell type; the index is not written
mpra_df.to_csv(
    path_or_buf=data_dir + 'data/' + cell_type + '.tsv',
    sep='\t',
    index=False,
)

In [32]:
def reverse_complement(seq):
    '''
    Take sequence reverse complement

    A, C, G and T are complemented in both upper and lower case (case is preserved),
    so soft-masked lower-case bases are no longer left uncomplemented.
    Any other character (e.g. N) is kept unchanged. The complemented sequence is
    then returned back to front.
    '''
    compl_dict = {'A':'T', 'C':'G', 'G':'C', 'T':'A',
                  'a':'t', 'c':'g', 'g':'c', 't':'a'}
    compl_seq = ''.join(compl_dict.get(x, x) for x in seq)
    return compl_seq[::-1]
    
#sequences are identical for both cell types!

def write_fasta(output_fasta, complement_negative=False, df=None):
    '''
    Write one FASTA record per row of a sequence table.

    output_fasta:        path of the FASTA file to create (overwritten if it exists)
    complement_negative: if True, rows whose strand is '-' are reverse-complemented
                         before writing; otherwise every sequence is written as stored
    df:                  table to export, with a 'seq' column (and a 'strand' column when
                         complement_negative is True). Defaults to the notebook-level
                         mpra_df, so existing calls behave exactly as before.

    Each record header is '>id_<row index>:Homo_sapiens'.
    '''
    if df is None:
        df = mpra_df  # resolved at call time, i.e. the current notebook-level table
    with open(output_fasta, 'w') as f:
        for seq_id, row in tqdm(df.iterrows(), total=len(df)):
            f.write(f'>id_{seq_id}:Homo_sapiens\n')
            if complement_negative and row.strand=='-':
                seq = reverse_complement(row.seq)
            else:
                seq = row.seq
            for line in wrap(seq, 80): #wrap sequence with standard FASTA width
                f.write(line+'\n')

In [33]:
# Sequences are written as stored: the initial sequences are already
# reverse-complemented to match the mRNA.
write_fasta(
    output_fasta=data_dir + 'fasta/variants_rna.fa',
    complement_negative=False,
)

100%|██████████| 49836/49836 [00:02<00:00, 16859.25it/s]


In [34]:
# Same records, but rows on the '-' strand are reverse-complemented before writing
write_fasta(
    output_fasta=data_dir + 'fasta/variants_dna_fwd.fa',
    complement_negative=True,
)

100%|██████████| 49836/49836 [00:03<00:00, 14312.36it/s]
