In [14]:
import numpy as np
import pandas as pd
from collections import defaultdict
from textwrap import wrap

In [15]:
data_dir = '/s/project/mll/sergey/effect_prediction/MLM/'

In [16]:
#3'UTR table

utr_table = pd.read_csv(data_dir + 'UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed', sep='\t',
                       header = None, names=['seq_start','seq_end','seq_name','strand'], usecols=[1,2,3,5])

In [17]:
utr_variants = pd.read_csv(data_dir + 'clinvar/clinvar.3utr.tsv', sep='\t', names=['chrom','pos','var_id','ref','alt','seq_name','label'])

In [18]:
utr_variants.pos = utr_variants.pos-1 #to 0-based

In [19]:
utr_variants = utr_variants[utr_variants.alt!='.'] #around 30 variants, not reliable

In [20]:
utr_variants = utr_variants.merge(utr_table, how='left')

In [21]:
utr_variants['ref_len'] = utr_variants.ref.apply(len)
utr_variants['alt_len'] = utr_variants.alt.apply(len)

In [22]:
utr_variants = utr_variants[utr_variants[['ref_len','alt_len']].min(axis=1)==1] #remove MNP variants

In [23]:
utr_variants['varlen'] = abs(utr_variants.ref_len - utr_variants.alt_len)
utr_variants['dist_to_utr_start'] = utr_variants.apply(lambda x: x.varlen+x.pos-x.seq_start if x.strand=='+' else  x.varlen+x.seq_end-x.pos, axis=1)

In [24]:
utr_variants = utr_variants[utr_variants.dist_to_utr_start<5000] #limit by the maximum sequence length on species-aware model

In [25]:
utr_variants['is_del'] = False
utr_variants.loc[utr_variants.ref_len > utr_variants.alt_len, 'is_del'] = True

In [182]:
len(utr_variants)

21350

In [183]:
human_fasta = data_dir + 'fasta/Homo_sapiens_no_reverse.fa'

In [184]:
human_utr = defaultdict(str)

with open(human_fasta, 'r') as f:
    for line in f:
        if line.startswith('>'):
            seq_name = line[1:].split(':')[0].rstrip()
        else:
            human_utr[seq_name] += line.upper().rstrip()

In [196]:
def reverse_complement(seq):
    '''
    Take sequence reverse complement
    '''
    compl_dict = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
    compl_seq = ''.join([compl_dict.get(x,x) for x in seq])
    rev_seq = compl_seq[::-1]
    return rev_seq

In [203]:
def extract_utr_seq(var):

    seq = human_utr[var.seq_name]

    seq = list(seq)
    
    varpos = int(var.pos - var.seq_start) #relative variant position in the sequence
    
    if not var.is_del:
        if seq[varpos] != var.ref:
            return (None, None)
        else:
            seq[varpos] = var.alt
    else:
        if ''.join(seq[varpos:varpos+len(var.ref)]) != var.ref:
            return (None, None)
        else:
            seq[varpos+1:varpos+len(var.ref)] = ''
                 
    seq = ''.join(seq)

    #for genes on the negative strand, take reverse complement
    if var.strand=='-':
        seq = reverse_complement(seq)

    #FASTA sequence header
    seq_header = f'>ClinvarID_{var.var_id}:Homo_sapiens'
    
    return seq_header, seq

In [204]:
output_fasta = data_dir + 'clinvar/clinvar.fa'

In [205]:
n_mismatches = 0

#write FASTA file with variants

with open(output_fasta, 'w') as f:
    for idx, var in utr_variants.iterrows():
        seq_header, seq = extract_utr_seq(var)
        if seq_header != None:
            f.write(seq_header+'\n')
            for line in wrap(seq, 80): #wrap sequence with standard FASTA width
                f.write(line+'\n')
        else:
            n_mismatches += 1
            
print(f'Variants with mismatched reference: {n_mismatches//2}')

Variants with mismatched reference: 40


In [206]:
# index new FASTA file
!/s/project/mll/sergey/variant_calling/tools/samtools/bin/samtools faidx {output_fasta}