In [1]:
import pandas as pd
import numpy as np
import pysam
from textwrap import wrap

In [2]:
#whether extract_utr_seq flanks each oligo sequence with primer_5_end / primer_3_end
add_seq_primers = False

In [3]:
#root directory of the input files read below (reference FASTA and variant table)
datadir = '/lustre/groups/epigenereg01/workspace/projects/vale/MLM/'

In [5]:
#FASTA handle from which extract_utr_seq fetches sequences by UTR_ID
human_fasta = pysam.FastaFile(datadir + 'fasta/Homo_sapiens_no_reverse.fa')

In [6]:
#variant table, tab-separated; all positions are 0-based, half-open [start, end)
variants_tsv = datadir + 'griesemer/variants/GRCh38_UTR_variants.tsv'
utr_variants = pd.read_csv(variants_tsv, sep='\t')

In [7]:
#keep only oligos without another variant in their window:
#seeding multiple variants into oligo sequence isn't currently supported
#.copy() gives an independent frame, so adding columns or sorting in place afterwards
#doesn't operate on a slice of the loaded table (avoids SettingWithCopyWarning)
utr_variants = utr_variants[utr_variants.other_var_in_oligo_window.isna()].copy()

In [8]:
#classify variants by allele length:
#same length -> 'SNP', ref longer than alt -> 'DEL', alt longer than ref -> 'INS'
#iterating over the two columns avoids building a Series per row as DataFrame.apply(axis=1) does
utr_variants['vartype'] = ['SNP' if len(ref)==len(alt) else
                           'DEL' if len(ref)>len(alt) else 'INS'
                           for ref, alt in zip(utr_variants['ref'], utr_variants['alt'])]

In [9]:
#order the rows by oligo id
utr_variants = utr_variants.sort_values(by='oligo_id')

In [10]:
#utr_variants['stop_codon_dist'] = utr_variants.apply(lambda x: x.var_start-x.human_UTR_start if x.strand=='+' else
#                  x.human_UTR_end - x.var_start, axis=1)

In [11]:
def reverse_complement(seq):
    '''
    Return the reverse complement of a sequence

    A, C, G and T are swapped with their partner base,
    any other character is left as it is
    '''
    partner = dict(zip('ACGT', 'TGCA'))
    complemented = []
    for base in seq:
        complemented.append(partner.get(base, base))
    #complement first, then read the result backwards
    return ''.join(complemented)[::-1]

In [12]:
#oligo primers, probably not needed: they are only added when add_seq_primers is True

primer_5_end = 'CGAGCTCGCTAGCCT' #prepended to the oligo sequence
primer_3_end = 'AGATCGGAAGAGCGTCG' #appended to the oligo sequence

In [13]:
def insert_variant(seq, oligo):
    '''
    Seed a variant into an oligo sequence

    seq: list with one entry per reference base of the oligo, modified in place
    oligo: record with var_start, oligo_start, ref and alt attributes

    The entry at the variant position receives the whole alt allele and the
    remaining entries covered by the ref allele are set to '', so
    ''.join(seq) is the mutated sequence while the list keeps one entry per
    reference position. Keeping the length means that further variants can
    still be seeded into the same list at their reference coordinates.

    Returns the same list object
    '''

    varpos = int(oligo.var_start - oligo.oligo_start) #variant position w.r.t. oligo coordinates

    #the alt allele stands in for the whole ref allele (SNP, insertion or deletion alike)
    seq[varpos] = oligo.alt

    #blank the rest of the ref allele; a no-op for single-base ref alleles
    #the end is clipped so that the list is never extended
    ref_end = min(varpos + len(oligo.ref), len(seq))
    for pos in range(varpos + 1, ref_end):
        seq[pos] = ''

    return seq
        
def check_ref(seq, oligo):
    '''
    Check that the oligo sequence carries the reference allele of the variant

    seq: list of the oligo bases
    oligo: record with var_start, oligo_start, vartype and ref attributes

    Returns True if the sequence matches oligo.ref, False on a reference mismatch
    '''

    offset = int(oligo.var_start - oligo.oligo_start) #variant position w.r.t. oligo coordinates

    if oligo.vartype == 'DEL':
        #deletions: compare the whole stretch covered by the ref allele
        observed = ''.join(seq[offset:offset+len(oligo.ref)])
    else:
        #all other variant types: compare only the entry at the variant position
        observed = seq[offset]

    return observed == oligo.ref

In [14]:
def extract_utr_seq(oligo, complement_negative=False):
    '''
    Build the FASTA record of one oligo

    oligo: row of utr_variants
    complement_negative: if True, the sequence of an oligo on the '-' strand
                         is reverse complemented

    Returns (seq_header, seq). Returns (None, None) when a reference allele
    doesn't match the FASTA sequence, or when another variant listed for the
    oligo window isn't present in utr_variants.
    '''

    seq = human_fasta.fetch(oligo.UTR_ID).upper()

    #oligo position within UTR region
    oligo_start = int(oligo.oligo_start - oligo.human_UTR_start)
    oligo_end = int(oligo.oligo_end - oligo.human_UTR_start)
    #extract oligo sequence, no primers here
    #one list entry per base, so that variants can be seeded by position
    seq = list(seq[oligo_start:oligo_end])

    if not check_ref(seq, oligo):
        return (None, None)

    #seed variant if alt sequence
    if oligo.tag == 'alt':
        seq = insert_variant(seq, oligo)

    if oligo.mpra_variant_id.endswith('_2'):

        #a missing value (NaN, not a string) means that no other variants are listed for this window
        other_var_ids = oligo.other_var_in_oligo_window
        other_var_ids = other_var_ids.split(',') if isinstance(other_var_ids, str) else []

        for other_var_id in other_var_ids:

            matches = utr_variants[utr_variants.variant_id==other_var_id]
            if matches.empty:
                return (None, None)

            #the row found describes the other variant within its own oligo window;
            #seq is the window of the current oligo, so positions must be taken
            #relative to the start of the current oligo
            other_var = matches.iloc[0].copy()
            other_var['oligo_start'] = oligo.oligo_start

            if not check_ref(seq, other_var):
                return (None, None)

            seq = insert_variant(seq, other_var)

    seq = ''.join(seq)

    if add_seq_primers:
        seq = primer_5_end + seq + primer_3_end

    #for genes on the negative strand, take reverse complement
    #NOTE(review): when primers are enabled they are reverse complemented together with the oligo - confirm this is intended
    if oligo.strand=='-' and complement_negative:
        seq = reverse_complement(seq)

    #FASTA sequence header
    seq_header = f'>{oligo.oligo_id}:Homo_sapiens'

    return seq_header, seq

In [None]:
n_mismatches = 0 #kept for compatibility; write_fasta counts skipped oligos per call in a local variable

#write FASTA file with variants

def write_fasta(output_fasta, complement_negative=False):
    '''
    Write the sequences of all oligos in utr_variants to a FASTA file

    output_fasta: path of the FASTA file to create (overwritten if it exists)
    complement_negative: passed on to extract_utr_seq

    Oligos for which extract_utr_seq returns no sequence are skipped and
    counted; the count is printed once the file is written.
    '''
    #local counter: incrementing the module-level name inside the function
    #would make it a local variable and raise UnboundLocalError
    n_skipped = 0

    with open(output_fasta, 'w') as f:
        for _, oligo in utr_variants.iterrows():
            seq_header, seq = extract_utr_seq(oligo, complement_negative=complement_negative)
            if seq_header is None:
                n_skipped += 1
                continue
            f.write(seq_header+'\n')
            for line in wrap(seq, 80): #wrap sequence with standard FASTA width
                f.write(line+'\n')

    #presumably every variant has two rows (ref and alt oligo), hence //2 - TODO confirm
    print(f'Variants with mismatched reference: {n_skipped//2}')

In [None]:
#forward-strand DNA sequences: '-' strand oligos are not reverse complemented
#(datadir is the directory variable defined in this notebook; data_dir was never defined)
write_fasta(datadir + 'fasta/variants_dna_fwd.fa', complement_negative=False)

#RNAfold is not run here: its input variants_rna.fa is only written in the next cell

In [None]:
write_fasta(data_dir + 'fasta/variants_rna.fa', complement_negative=True) 

! ./RNAfold.sh {data_dir}/variants/fasta/variants_rna.fa