In [88]:
import pandas as pd
import numpy as np
import pysam
from textwrap import wrap

In [None]:
# Switch read by extract_utr_seq: when True, primer_5_end and primer_3_end are concatenated around each oligo sequence
add_seq_primers = False

In [3]:
# Base directory from which the FASTA and TSV paths used in this notebook are built
datadir = '/s/project/mll/sergey/effect_prediction/MLM/'

In [84]:
# Path of the variant FASTA that the export loop writes and that is indexed in the last cell
output_fasta = datadir + 'griesemer/fasta/GRCh38_UTR_variants.fa'

In [7]:
# pysam handle to the source FASTA; extract_utr_seq fetches whole records from it keyed by UTR_ID
# NOTE(review): the file name suggests records are stored without reverse-complementing — confirm
human_fasta = pysam.FastaFile(datadir + 'fasta/Homo_sapiens_no_reverse.fa')

In [79]:
# Tab-separated table of UTR variants; all positions are 0-based, half-open [start, end)
utr_variants = pd.read_csv(datadir + 'griesemer/GRCh38_UTR_variants.tsv', sep='\t')

In [80]:
# Keep only oligos without another variant in their window:
# seeding multiple variants into oligo sequence isn't currently supported
no_other_variant = utr_variants['other_var_in_oligo_window'].isna()
utr_variants = utr_variants[no_other_variant]

In [81]:
def _classify_variant(ref, alt):
    '''
    Label a variant by comparing allele lengths: equal lengths give 'SNP',
    a longer ref gives 'DEL' and a longer alt gives 'INS'
    '''
    if len(ref) == len(alt):
        return 'SNP'
    return 'DEL' if len(ref) > len(alt) else 'INS'

# assign() returns a new frame, so the column is not written into a boolean-filtered
# slice (which triggers pandas' SettingWithCopyWarning); building the labels from the
# two allele columns also works when the table has no rows
utr_variants = utr_variants.assign(
    vartype=[_classify_variant(ref, alt)
             for ref, alt in zip(utr_variants['ref'], utr_variants['alt'])])

In [82]:
#utr_variants['stop_codon_dist'] = utr_variants.apply(lambda x: x.var_start-x.human_UTR_start if x.strand=='+' else
#                  x.human_UTR_end - x.var_start, axis=1)

In [28]:
def reverse_complement(seq):
    '''
    Return the reverse complement of a nucleotide sequence

    Only upper-case A, C, G and T are complemented; any other character
    is carried over unchanged
    '''
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    flipped = []
    #walk the sequence back to front, complementing base by base
    for base in reversed(seq):
        flipped.append(pairing.get(base, base))
    return ''.join(flipped)

In [None]:
#oligo primers placed at either end of the oligo sequence; only used by extract_utr_seq
#when add_seq_primers is True (off by default, they are not thought to be needed)

primer_5_end = 'CGAGCTCGCTAGCCT'
primer_3_end = 'AGATCGGAAGAGCGTCG'

In [76]:
def extract_utr_seq(oligo):
    '''
    Build the FASTA header and sequence for one oligo row

    The record named by oligo.UTR_ID is fetched from human_fasta, upper-cased and cut
    down to the oligo window. The reference allele is checked against the sequence at
    the variant position and, for rows tagged 'alt', replaced by the alternative allele.
    Sequences of oligos on the '-' strand are reverse complemented.

    oligo: row-like object with attributes UTR_ID, human_UTR_start, oligo_start,
           oligo_end, var_start, ref, alt, vartype, tag, strand and mpra_variant_id

    Returns (seq_header, seq), or (None, None) when the reference allele does not
    match the fetched sequence at the variant position
    '''

    utr_seq = human_fasta.fetch(oligo.UTR_ID).upper()

    #oligo position within UTR region
    oligo_start = int(oligo.oligo_start - oligo.human_UTR_start)
    oligo_end = int(oligo.oligo_end - oligo.human_UTR_start)
    #extract oligo sequence, no primers here
    seq = utr_seq[oligo_start:oligo_end]

    varpos = int(oligo.var_start - oligo.oligo_start) #variant position w.r.t. oligo coordinates
    ref_end = varpos + len(oligo.ref) #first position after the reference allele

    #detect reference mismatches
    #the whole reference allele is compared for every variant type, so multi-base
    #substitutions and insertions with a multi-base ref are not rejected by a
    #one-base-vs-string comparison; a negative varpos would index from the end of
    #the sequence and is treated as a mismatch
    if varpos < 0 or seq[varpos:ref_end] != oligo.ref:
        return (None, None)

    #seed variant if alt sequence: the full reference allele is swapped for alt
    if oligo.tag == 'alt':
        seq = seq[:varpos] + oligo.alt + seq[ref_end:]

    if add_seq_primers:
        #NOTE(review): primers are attached before the strand flip below, so for '-' strand
        #oligos they end up reverse complemented and swapped - confirm this is intended
        seq = primer_5_end + seq + primer_3_end

    #for genes on the negative strand, take reverse complement
    if oligo.strand == '-':
        seq = reverse_complement(seq)

    #FASTA sequence header
    seq_header = f'>{oligo.mpra_variant_id}_{oligo.tag}_{oligo.vartype}:Homo_sapiens:{len(seq)}'

    return seq_header, seq

In [98]:
n_mismatches = 0

#write FASTA file with variants
#rows rejected by extract_utr_seq (header is None) are only counted, not written

with open(output_fasta, 'w') as f:
    for idx, oligo in utr_variants.iterrows():
        seq_header, seq = extract_utr_seq(oligo)
        if seq_header is None:
            n_mismatches += 1
            continue
        #header line followed by the sequence wrapped with standard FASTA width
        record_lines = [seq_header] + wrap(seq, 80)
        f.writelines(record_line + '\n' for record_line in record_lines)

#rejected rows are halved to report a per-variant count
print(f'Variants with mismatched reference: {n_mismatches//2}')

Variants with mismatched reference: 54


In [99]:
# index new FASTA file
# pysam's samtools wrapper is used so the cell does not depend on a hard-coded
# samtools binary or on shell interpolation of the output path
pysam.faidx(output_fasta)