In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from textwrap import wrap

In [2]:
# Root directory for every input and output file of this notebook.
# All paths below are built as `data_dir + '<relative path>'`, so the trailing slash is required.
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/'

In [3]:
# 3'UTR coordinate table: a headerless, tab-separated BED file of which only
# file columns 1, 2, 3 and 5 are loaded and given the names below.
utr_bed_path = data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed'

utr_table = pd.read_csv(
    utr_bed_path,
    sep='\t',
    header=None,
    usecols=[1, 2, 3, 5],
    names=['seq_start', 'seq_end', 'seq_name', 'strand'],
)

In [4]:
# Variant table: tab-separated file without a header row, so column names are supplied here.
variant_columns = ['chrom','pos','var_id','ref','alt','seq_name','label','PhyloP100','PhyloP241']

utr_variants = pd.read_csv(
    data_dir + 'clinvar/clinvar.3utr.tsv',
    sep='\t',
    names=variant_columns,
)

In [5]:
# Convert variant positions to 0-based coordinates by subtracting one.
# NOTE: this cell is not idempotent - running it a second time shifts the positions again.
utr_variants['pos'] = utr_variants['pos'] - 1

In [6]:
# Drop variants whose alternative allele is '.' (around 30 variants, not reliable).
# .copy() turns the filtered selection into an independent frame, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
utr_variants = utr_variants[utr_variants.alt != '.'].copy()

In [7]:
# Length (number of characters) of the reference and of the alternative allele of each variant.
utr_variants['ref_len'] = [len(allele) for allele in utr_variants['ref']]
utr_variants['alt_len'] = [len(allele) for allele in utr_variants['alt']]

In [8]:
# Remove MNP variants: keep a variant only if its shorter allele is exactly one character long.
shorter_allele_len = utr_variants[['ref_len','alt_len']].min(axis=1)
utr_variants = utr_variants[shorter_allele_len == 1]

In [9]:
# Attach the UTR coordinates and strand to every variant.
# No `on=` is given, so pandas joins on the columns the two frames have in common;
# the left join keeps variants that have no match in utr_table.
utr_variants = pd.merge(utr_variants, utr_table, how='left')

In [10]:
# Absolute difference between the allele lengths (0 for single-nucleotide substitutions).
utr_variants['varlen'] = (utr_variants.ref_len - utr_variants.alt_len).abs()

# Distance used for the length filter below, computed for all rows at once instead of with a
# per-row lambda: measured from seq_start on the '+' strand and from seq_end otherwise,
# with varlen added in both cases.
on_plus_strand = utr_variants.strand == '+'
utr_variants['dist_to_utr_start'] = utr_variants.varlen + np.where(
    on_plus_strand,
    utr_variants.pos - utr_variants.seq_start,
    utr_variants.seq_end - utr_variants.pos,
)

In [11]:
#limit by the maximum sequence length on species-aware model
# .copy() turns the filtered selection into an independent frame, so adding the
# `is_del` column in the next cell does not trigger SettingWithCopyWarning.
utr_variants = utr_variants[utr_variants.dist_to_utr_start < 5000].copy()

In [12]:
# Flag deletions: True where the reference allele is longer than the alternative allele.
utr_variants['is_del'] = utr_variants.ref_len > utr_variants.alt_len

In [13]:
# Number of variants left after all filters.
utr_variants.shape[0]

21350

In [14]:
# FASTA file that is read into `human_utr` below.
# NOTE(review): sequences are presumably stored in forward-strand orientation (the file name says
# "no_reverse" and '-' strand sequences are reverse-complemented later on) - confirm.
human_fasta = data_dir + 'fasta/Homo_sapiens_no_reverse.fa'

In [15]:
# Read the FASTA file into a {sequence name: upper-case sequence} mapping.
# human_utr stays a defaultdict(str), so looking up an unknown name with [] still yields ''.
human_utr = defaultdict(str)

seq_chunks = defaultdict(list)  # sequence name -> sequence lines, joined once after reading
seq_name = None                 # reset, so a name left over from an earlier run cannot be reused

with open(human_fasta, 'r') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            continue  # ignore blank lines
        if line.startswith('>'):
            # the record name is the header text up to the first ':'
            seq_name = line[1:].split(':')[0].rstrip()
        elif seq_name is None:
            raise ValueError('sequence data found before the first FASTA header in ' + human_fasta)
        else:
            seq_chunks[seq_name].append(line.upper())

# Joining once per record avoids re-copying the growing string for every line of the file.
for name, chunks in seq_chunks.items():
    human_utr[name] = ''.join(chunks)

In [16]:
# Translation table for complementing bases; both upper- and lower-case A/C/G/T are covered.
_COMPLEMENT_TABLE = str.maketrans('ACGTacgt', 'TGCAtgca')

def reverse_complement(seq):
    '''
    Take sequence reverse complement

    seq: nucleotide sequence, either a string or an iterable of single-character strings.

    Returns a string in which A<->T and C<->G are swapped (case is preserved) and the
    order of the characters is reversed. Any other character, e.g. 'N', is kept as it is.
    '''
    if not isinstance(seq, str):
        seq = ''.join(seq)  # accept lists of characters, as the previous implementation did
    return seq.translate(_COMPLEMENT_TABLE)[::-1]

In [17]:
SEQ_LENGTH = 510 #maximal sequence length for DNABERT

def center_seq(seq, varpos, seq_length=None):
    '''
    Cut a fixed-length window out of seq such that position varpos lands at index
    seq_length//2 of the result; window positions not covered by seq are filled with 'N'.

    seq: input sequence (string, or list of single-character strings)
    varpos: 0-based index in seq to center on; it may lie outside of seq, in which case
            the window is still placed correctly relative to seq (all 'N' if they do not overlap)
    seq_length: length of the returned sequence, SEQ_LENGTH if not given

    Returns a string of length seq_length. At most seq_length//2 characters are taken on each
    side of varpos, so for an odd seq_length the last character is always 'N'.
    '''
    if seq_length is None:
        seq_length = SEQ_LENGTH

    half = seq_length // 2

    # part of seq covered by the window [varpos-half, varpos+half), clipped to the sequence bounds
    src_start = max(0, varpos - half)
    src_end = min(len(seq), varpos + half)

    if src_start >= src_end:
        return 'N' * seq_length  # window and sequence do not overlap

    core = ''.join(seq[src_start:src_end])

    # number of window positions that lie before the first available base
    left_pad = src_start - (varpos - half)
    right_pad = seq_length - left_pad - len(core)

    return 'N' * left_pad + core + 'N' * right_pad

In [18]:
def extract_utr_seq(var):
    '''
    Build the reference and the alternative sequence for one variant.

    var: one row of utr_variants; the fields seq_name, pos, seq_start, ref, alt and strand are read.

    The sequence is looked up in human_utr by var.seq_name, the reference allele at the variant
    position is replaced by the alternative allele, and both sequences are passed through
    center_seq with the variant position. For var.strand == '-' both are reverse-complemented.

    Returns (refseq, altseq), or (None, None) if the variant does not lie completely inside the
    sequence (this includes unknown sequence names) or if var.ref does not match the sequence
    at the variant position.
    '''

    # .get() instead of [], so that an unknown name does not add an empty entry to the defaultdict
    refseq = human_utr.get(var.seq_name, '')

    varpos = int(var.pos - var.seq_start) #relative variant position in the sequence
    ref_end = varpos + len(var.ref) #first position after the reference allele

    # a negative position would silently index from the end of the sequence,
    # a position past the end would raise IndexError
    if varpos < 0 or ref_end > len(refseq):
        return (None, None)

    if refseq[varpos:ref_end] != var.ref:
        return (None, None)

    # Replace the whole reference allele by the alternative allele. This covers substitutions,
    # insertions and deletions alike, and also deletions whose alternative allele differs from
    # the first reference base.
    altseq = refseq[:varpos] + var.alt + refseq[ref_end:]

    refseq = center_seq(refseq, varpos) #center the sequence around the variant and crop
    altseq = center_seq(altseq, varpos)

    #for genes on the negative strand, take reverse complement
    if var.strand=='-':
        altseq = reverse_complement(altseq)
        refseq = reverse_complement(refseq)

    return refseq, altseq

In [53]:
#for each variant, get reference and alternative sequences of SEQ_LENGTH centered around the variant

seqs = []

for _, var in utr_variants.iterrows():
    centered_pair = extract_utr_seq(var)
    if centered_pair[0] is None:
        continue  # extract_utr_seq rejected this variant
    seqs.append((var.var_id, *centered_pair))

In [54]:
# One row per accepted variant: its id plus the centered reference and alternative sequences.
seqs_df = pd.DataFrame(
    data=seqs,
    columns=['var_id', 'refseq', 'altseq'],
)

In [55]:
# Write the sequences to disk without the row index.
seqs_csv_path = data_dir + 'clinvar/dnabert/seqs.csv'
seqs_df.to_csv(seqs_csv_path, index=False)

In [68]:
#get unique reference sequences for DNABERT calibration and proba-based prediction

refseqs = []

# one (name, full sequence) pair per variant; sequences of '-' strand entries are reverse-complemented
for name, strand in zip(utr_variants['seq_name'], utr_variants['strand']):
    full_utr = human_utr[name]
    refseqs.append((name, reverse_complement(full_utr) if strand == '-' else full_utr))

In [69]:
# Many variants share a sequence, so identical (seq_name, seq) rows are collapsed into one.
refseqs_df = pd.DataFrame(refseqs, columns=['seq_name', 'seq'])
refseqs_df = refseqs_df.drop_duplicates()

In [70]:
# Write the unique reference sequences to disk without the row index.
refseq_csv_path = data_dir + 'clinvar/dnabert/clinvar_refseq.csv'
refseqs_df.to_csv(refseq_csv_path, index=False)