In [1]:
import pandas as pd
import numpy as np
import pysam
from textwrap import wrap

In [2]:
#switch read by extract_utr_seq(): when True, primer_5_end / primer_3_end are attached around each oligo sequence
add_seq_primers = False

In [39]:
#root directory of the MPRA data; all input and output paths in this notebook are built by prefixing it
#NOTE(review): absolute, cluster-specific path - has to be adjusted when running elsewhere
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/mpra/griesemer_2021/'

In [15]:
#FASTA handle used by extract_utr_seq(), which fetches one whole record per oligo.UTR_ID
#NOTE(review): presumably contains forward-strand 3'UTR sequences keyed by UTR ID (judging by the file name) - confirm
human_fasta = pysam.FastaFile(data_dir + '../../fasta/Homo_sapiens_dna_fwd.fa')

In [49]:
#tab-separated table with one row per oligo; columns used below include oligo_id, tag, ref, alt, strand, var_start, oligo_start/oligo_end and human_UTR_start/human_UTR_end
utr_variants = pd.read_csv(data_dir + 'GRCh38_UTR_variants.tsv', sep='\t') #all positions are 0-based [start, end)

In [50]:
#keep only oligos without any other variant in their window:
#seeding multiple variants into oligo sequence isn't currently supported
#.copy() detaches the filtered frame from the original table, so the column assignments
#made in later cells operate on an independent object and don't raise SettingWithCopyWarning
utr_variants = utr_variants[utr_variants.other_var_in_oligo_window.isna()].copy()

In [51]:
#number of rows left in utr_variants at this point of the notebook
len(utr_variants)

18376

In [52]:
def _classify_variant(row):
    '''
    Label a variant by comparing the lengths of its ref and alt alleles:
    equal length -> 'SNP', longer ref -> 'DEL', otherwise 'INS'
    '''
    ref_len, alt_len = len(row.ref), len(row.alt)
    if ref_len == alt_len:
        return 'SNP'
    if ref_len > alt_len:
        return 'DEL'
    return 'INS'

utr_variants['vartype'] = utr_variants.apply(_classify_variant, axis=1)

In [53]:
#order rows by oligo ID
#the sorted frame is rebound instead of using inplace=True: in-place sorting brings no
#benefit in pandas and makes the cell mutate an object other cells may already hold
utr_variants = utr_variants.sort_values(by='oligo_id')

In [54]:
#stop_codon_dist: offset of the variant start from the UTR boundary, chosen by strand
#  strand '+'            -> var_start - human_UTR_start
#  any other strand value -> human_UTR_end - var_start
#NOTE(review): if human_UTR_end is an exclusive end (positions are described as 0-based [start, end)),
#the UTR base next to the boundary gets 0 on '+' but 1 on the other strand - confirm this asymmetry is intended
dist_from_utr_start = utr_variants.var_start - utr_variants.human_UTR_start
dist_from_utr_end = utr_variants.human_UTR_end - utr_variants.var_start
utr_variants['stop_codon_dist'] = dist_from_utr_start.where(utr_variants.strand == '+', dist_from_utr_end)

In [32]:
def reverse_complement(seq):
    '''
    Take sequence reverse complement

    seq: iterable of single nucleotide characters (normally a string)
    returns: string with A<->T and C<->G swapped and the order reversed;
             letter case is preserved (a<->t, c<->g) and every other
             character (e.g. N) is passed through unchanged
    '''
    #lowercase entries are included so that lowercase bases are complemented as well
    #instead of silently slipping through untouched
    compl_dict = {'A':'T', 'C':'G', 'G':'C', 'T':'A',
                  'a':'t', 'c':'g', 'g':'c', 't':'a'}
    compl_seq = ''.join([compl_dict.get(x,x) for x in seq])
    return compl_seq[::-1]

In [33]:
#oligo primers, don't think we need to include them
#extract_utr_seq() puts primer_5_end before and primer_3_end after the oligo sequence, but only when add_seq_primers is True

primer_5_end = 'CGAGCTCGCTAGCCT'
primer_3_end = 'AGATCGGAAGAGCGTCG'

In [34]:
def insert_variant(seq, oligo):
    '''
    Seed the alt allele of a variant into an oligo sequence

    seq: list with one element per reference base of the oligo; it is modified in place
    oligo: record providing var_start, oligo_start, ref and alt
    returns: the same list, where the slot at the variant position holds the whole alt
             allele and the remaining slots covered by ref hold empty strings, so that
             ''.join(seq) gives the alt sequence
    '''

    varpos = int(oligo.var_start - oligo.oligo_start) #variant position w.r.t. oligo coordinates

    #the full alt allele always goes into the first slot of the ref span
    #(for deletions the alt allele used to be ignored and the first ref base kept instead)
    seq[varpos] = oligo.alt

    #remaining ref slots are blanked rather than removed: the list keeps one slot per
    #reference position, so coordinates of any variant seeded afterwards stay valid
    ref_end = min(varpos + len(oligo.ref), len(seq))
    for pos in range(varpos + 1, ref_end):
        seq[pos] = ''

    return seq
        
def check_ref(seq, oligo):
    '''
    Tell whether the sequence carries the expected ref allele at the variant position

    seq: list of bases of the oligo
    oligo: record providing var_start, oligo_start, ref and vartype
    returns: True when the reference matches, False on a mismatch
    '''

    offset = int(oligo.var_start - oligo.oligo_start) #variant position w.r.t. oligo coordinates

    if oligo.vartype == 'DEL':
        #deletions: compare the whole stretch covered by the ref allele
        observed = ''.join(seq[offset:offset+len(oligo.ref)])
    else:
        #everything else: compare the single slot at the variant position
        observed = seq[offset]

    return observed == oligo.ref

In [36]:
def extract_utr_seq(oligo, complement_negative=False):
    '''
    Build the FASTA record for a single oligo

    oligo: row of utr_variants
    complement_negative: when True, sequences of oligos with strand '-' are reverse complemented
    returns: (header, sequence); (None, None) when a reference allele check fails
             or when a co-occurring variant is missing from utr_variants
    '''

    utr_seq = human_fasta.fetch(oligo.UTR_ID).upper()

    #oligo window expressed relative to the start of the UTR record
    window_start = int(oligo.oligo_start - oligo.human_UTR_start)
    window_end = int(oligo.oligo_end - oligo.human_UTR_start)

    #oligo sequence as a list of bases, no primers here
    seq = list(utr_seq[window_start:window_end])

    if not check_ref(seq, oligo):
        return (None, None)

    #seed variant if alt sequence
    if oligo.tag == 'alt':
        seq = insert_variant(seq, oligo)

    #variant IDs ending with '_2': also seed the other variants listed for the oligo window
    #NOTE(review): utr_variants is filtered elsewhere in the notebook to rows with missing
    #other_var_in_oligo_window, in which case .split below would fail - confirm this branch is never reached
    #NOTE(review): check_ref/insert_variant locate other_var via its own oligo_start,
    #not via the window of the current oligo - confirm both rows share the same window
    if oligo.mpra_variant_id.endswith('_2'):

        for other_var_id in oligo.other_var_in_oligo_window.split(','):

            if other_var_id not in utr_variants.variant_id.unique():
                return (None, None)

            matching_rows = utr_variants[utr_variants.variant_id==other_var_id]
            other_var = matching_rows.iloc[0]

            if not check_ref(seq, other_var):
                return (None, None)

            seq = insert_variant(seq, other_var)

    oligo_seq = ''.join(seq)

    if add_seq_primers:
        oligo_seq = primer_5_end + oligo_seq + primer_3_end

    #for genes on the negative strand, take reverse complement
    if oligo.strand=='-' and complement_negative:
        oligo_seq = reverse_complement(oligo_seq)

    #FASTA sequence header followed by the sequence
    return f'>{oligo.oligo_id}', oligo_seq

In [37]:
#write FASTA file with variants

def write_fasta(output_fasta, complement_negative=False):
    '''
    Write the sequences of all oligos in utr_variants to a FASTA file

    output_fasta: path of the file to create (overwritten if present)
    complement_negative: passed on to extract_utr_seq()
    Oligos for which extract_utr_seq() returns (None, None) are left out and counted;
    the count is printed at the end.
    '''
    #the counter has to live inside the function: incrementing a module-level name here
    #without a `global` declaration raises UnboundLocalError
    n_mismatches = 0

    with open(output_fasta, 'w') as f:
        for idx, oligo in utr_variants.iterrows():
            seq_header, seq = extract_utr_seq(oligo, complement_negative=complement_negative)
            if seq_header is None:
                n_mismatches += 1
                continue
            f.write(seq_header+'\n')
            for line in wrap(seq, 80): #wrap sequence with standard FASTA width
                f.write(line+'\n')

    #skipped oligos are halved for reporting
    #NOTE(review): presumably because every variant is represented by a ref and an alt oligo - confirm
    print(f'Variants with mismatched reference: {n_mismatches//2}')

In [None]:
#oligo sequences as stored in the FASTA source, without reverse complementing '-' strand oligos
write_fasta(data_dir + 'fasta/variants_dna_fwd.fa', complement_negative=False) 

In [None]:
#same oligos with '-' strand sequences reverse complemented; this file is passed to RNAfold.sh and read back to build mpra_df
write_fasta(data_dir + 'fasta/variants_rna.fa', complement_negative=True) 

In [44]:
#run the local RNAfold.sh script on the RNA FASTA file; free_energy.tsv is loaded into mpra_df in a later cell
#NOTE(review): presumably the script writes one minimal free energy value per FASTA record, in file order - confirm
! ./RNAfold.sh {data_dir}/fasta/variants_rna.fa  {data_dir}/fasta/free_energy.tsv

# Combine mpra_df

In [55]:
#tab-separated supplementary tables; both are merged into mpra_df further below
#NOTE(review): judging by the file names, expression_df holds per-variant MPRA results and oligo_info_df the oligo/variant annotation - confirm
expression_df = pd.read_csv(data_dir + 'source_data/paper_supplementary/Variant_MPRAu_Results.txt', sep='\t')
oligo_info_df = pd.read_csv(data_dir + 'source_data/paper_supplementary/Oligo_Variant_Info.txt', sep='\t')

In [56]:
#Add sequences from FASTA file

fasta_fa = data_dir + 'fasta/variants_rna.fa'

seq = {} #oligo_id -> full sequence, in file order

with open(fasta_fa, 'r') as f:
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            #header line: open a new record
            oligo_id = line[1:]
            seq[oligo_id] = ''
        else:
            #sequence line: extend the record opened by the latest header
            seq[oligo_id] = seq[oligo_id] + line

mpra_df = pd.DataFrame(seq.items(), columns=['oligo_id', 'seq'])

In [57]:
#minimal free energy from RNAfold software, recalculate each time when FASTA file changes!

free_energy = pd.read_csv(data_dir + 'fasta/free_energy.tsv', header=None)

#energies are matched to oligos purely by row position, so a stale or truncated file
#would silently misassign or drop values; fail loudly instead
if len(free_energy) != len(mpra_df):
    raise ValueError(f'free_energy.tsv has {len(free_energy)} rows but mpra_df has {len(mpra_df)}: '
                     'rerun RNAfold on the current FASTA file')

mpra_df['min_free_energy'] = free_energy.squeeze()

In [58]:
#combine all information together: oligo annotation, MPRA results, stop codon distance
#every merge relies on the pandas defaults (inner join on the columns both frames share)

mpra_df = (
    mpra_df
    .merge(oligo_info_df[['oligo_id','mpra_variant_id','ref_allele','alt_allele', 'gene_symbols','chrom', 'oligo_starts', 'oligo_ends']])
    .merge(expression_df)
    .merge(utr_variants[['mpra_variant_id','stop_codon_dist']].drop_duplicates())
)

mpra_df.columns

Index(['oligo_id', 'seq', 'min_free_energy', 'mpra_variant_id', 'ref_allele',
       'alt_allele', 'gene_symbols', 'chrom', 'oligo_starts', 'oligo_ends',
       'log2FoldChange_Skew_HEK293FT', 'lfcSE_Skew_HEK293FT',
       'pvalue_Skew_HEK293FT', 'padj_Skew_HEK293FT',
       'log2FoldChange_Ref_HEK293FT', 'lfcSE_Ref_HEK293FT',
       'pvalue_Ref_HEK293FT', 'padj_Ref_HEK293FT',
       'log2FoldChange_Alt_HEK293FT', 'lfcSE_Alt_HEK293FT',
       'pvalue_Alt_HEK293FT', 'padj_Alt_HEK293FT', 'log2FoldChange_Skew_HEPG2',
       'lfcSE_Skew_HEPG2', 'pvalue_Skew_HEPG2', 'padj_Skew_HEPG2',
       'log2FoldChange_Ref_HEPG2', 'lfcSE_Ref_HEPG2', 'pvalue_Ref_HEPG2',
       'padj_Ref_HEPG2', 'log2FoldChange_Alt_HEPG2', 'lfcSE_Alt_HEPG2',
       'pvalue_Alt_HEPG2', 'padj_Alt_HEPG2', 'log2FoldChange_Skew_HMEC',
       'lfcSE_Skew_HMEC', 'pvalue_Skew_HMEC', 'padj_Skew_HMEC',
       'log2FoldChange_Ref_HMEC', 'lfcSE_Ref_HMEC', 'pvalue_Ref_HMEC',
       'padj_Ref_HMEC', 'log2FoldChange_Alt_HMEC', 'lfcSE_A

In [59]:
# Define Groups for Group K-fold based on genes
# Initial label is the gene_symbols value itself; the next cell re-labels oligos whose regions overlap across different genes

mpra_df['group'] = mpra_df['gene_symbols']

In [60]:
#Very few 3'UTR regions are shared between genes
#The corresponding oligos should have the same group label

#one row per variant, ordered by position so that overlapping oligos follow each other
df = (mpra_df
      .sort_values(by=['chrom','oligo_starts','oligo_ends'])
      .drop_duplicates(subset=['mpra_variant_id'])
      .reset_index(drop=True))

#plain lists are much cheaper to index in the scan below than repeated df.loc[row] lookups
chroms = df['chrom'].tolist()
oligo_starts = df['oligo_starts'].tolist()
oligo_ends = df['oligo_ends'].tolist()
gene_symbols = df['gene_symbols'].tolist()
variant_ids = df['mpra_variant_id'].tolist()
n_rows = len(df)

#for each variant, collect the following variants on the same chromosome whose oligo starts
#before this oligo ends and which are annotated with different gene symbols
other_ids = [''] * n_rows

for row_idx in range(n_rows-1):
    next_row_idx = row_idx+1
    #next_row_idx < n_rows stops the scan at the end of the table; without this guard
    #the lookup runs past the last row whenever the final rows overlap
    while (next_row_idx < n_rows
           and chroms[row_idx]==chroms[next_row_idx]
           and oligo_starts[next_row_idx]<oligo_ends[row_idx]):
        if gene_symbols[row_idx] != gene_symbols[next_row_idx]:
            other_ids[row_idx] += variant_ids[next_row_idx] + ','
        next_row_idx+=1

df['other_mpra_variant_id'] = other_ids

df = df.loc[df.other_mpra_variant_id!='',['mpra_variant_id','other_mpra_variant_id']]

#give every overlapping variant the group label of the variant it overlaps with
for mpra_variant_id, other_mpra_variant_ids in df.values.tolist():
    #label of the anchor variant; it cannot change inside the inner loop because only rows of other variant IDs are written
    anchor_group = mpra_df.loc[mpra_df.mpra_variant_id==mpra_variant_id,'group'].iloc[0]
    for other_mpra_variant_id in other_mpra_variant_ids.rstrip(',').split(','):
        mpra_df.loc[mpra_df.mpra_variant_id==other_mpra_variant_id,'group'] = anchor_group

In [61]:
#save the combined table (sequence, free energy, annotation, MPRA results, group label) as a tab-separated file
mpra_df.to_csv(data_dir + 'mpra_rna.tsv', index=None, sep='\t')