In [1]:
import pandas as pd
import numpy as np
import re
import os
import pysam
from textwrap import wrap
from tqdm import tqdm

In [2]:
# Root directory of the variant data. All paths below are built by plain string
# concatenation (data_dir + '...'), so the value must end with a path separator.
# The cluster location stays the default; set MLM_VARIANTS_DIR to run elsewhere.
data_dir = os.path.join(
    os.environ.get('MLM_VARIANTS_DIR',
                   '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/variants/'),
    '')  # joining with '' appends a trailing separator only when it is missing

In [3]:
def _load_variants(rel_path, source):
    '''
    Read columns 0, 1 and 3 of a headerless tab-separated BED file (relative to
    data_dir) as chrom / pos / info and tag every row with `source`.
    '''
    variants = pd.read_csv(data_dir + rel_path, usecols=[0,1,3], names=['chrom','pos','info'], sep='\t')
    variants['source'] = source
    return variants


def _info_pvalue(info_column):
    '''
    Parse the `pvalue=` entry of every info string as a float.
    '''
    # raw string: '\.' and '\-' are invalid escape sequences in a normal literal
    return info_column.apply(lambda x: re.search(r'pvalue=([0-9\.\-eE]+)',x).groups()[0]).astype(float)


gnomAD_variants = _load_variants('prefiltered/gnomAD/gnomAD_GRCh38.utr3.bed', 'gnomAD')

is_low_frequency = gnomAD_variants['info'].apply(lambda x: re.search('AC=([0-9]+)',x).groups()[0]=='1') #putative functional variants with AC=1
gnomAD_variants.loc[is_low_frequency,'label'] = 1
gnomAD_variants.loc[~is_low_frequency,'label'] = 0 #putative non-functional, must be only with AF>5% (see dataprep pipeline)

clinvar_variants = _load_variants('prefiltered/clinvar/clinvar.3utr.pathogenic.bed', 'clinvar') #clinvar (Likely) pathogenic
clinvar_variants['label'] = 1

# NOTE(review): the original comments say both eQTL files are prefiltered at pvalue<1e-30,
# yet a later cell filters them again at pvalue<=1e-12 - confirm which threshold applies
eQTL_susie = _load_variants('prefiltered/eQTL-susie/eQTL.3utr.bed', 'eQTL-susie')
eQTL_susie['pvalue'] = _info_pvalue(eQTL_susie['info'])
#eQTL_susie = eQTL_susie[pvalue<1e-70]
eQTL_susie['label'] = 1

eQTL_GRASP = _load_variants('prefiltered/GRASP/GRASP2eQTL.3utr.bed', 'eQTL-GRASP')
eQTL_GRASP['pvalue'] = _info_pvalue(eQTL_GRASP['info'])
eQTL_GRASP['label'] = 1

# Order matters: positive sources come first and gnomAD last, so that a later
# drop_duplicates (which keeps the first occurrence) prefers the positive variant.
# ignore_index gives the combined frame unique row labels; without it every source
# contributes its own 0..n-1 labels and label-based indexing becomes ambiguous.
variants_df = pd.concat([clinvar_variants,eQTL_GRASP,eQTL_susie,gnomAD_variants], ignore_index=True)

In [4]:
# Alleles are stored in the info field as GT=<ref>/<alt>; parse the field once per row
allele_pairs = variants_df['info'].apply(lambda info: re.search('GT=([ACTG/]+)',info).groups()[0].split('/'))
variants_df['ref'] = allele_pairs.apply(lambda pair: pair[0])
variants_df['alt'] = allele_pairs.apply(lambda pair: pair[1])

In [5]:
#variants_df = variants_df[(variants_df.ref.isin(list('ACGT')))&(variants_df.alt.isin(list('ACGT')))] #only correct SNPs

In [6]:
# Classify every variant by the lengths of its ref and alt alleles
len_ref = variants_df.ref.apply(len)
len_alt = variants_df.alt.apply(len)

variants_df.loc[(len_ref==1)&(len_alt==1),'vartype'] = 'SNP'
variants_df.loc[(len_ref>1)&(len_alt==1),'vartype'] = 'DEL'
variants_df.loc[(len_ref==1)&(len_alt>1),'vartype'] = 'INS'
# rows where both alleles are longer than one base match none of the masks and keep vartype NaN

# .copy() makes the filtered table an independent frame: later cells modify it in place
# (drop_duplicates, new columns) and re-running this cell assigns into it again, both of
# which raise SettingWithCopyWarning on a plain boolean slice
variants_df = variants_df[variants_df.vartype=='SNP'].copy() #take only SNPs

  variants_df.loc[(len_ref==1)&(len_alt==1),'vartype'] = 'SNP'


In [7]:
# Keep only the first row for every (chrom, pos). gnomAD rows were concatenated last,
# so a gnomAD variant that overlaps with a positive variant is the one that is dropped.
variants_df = variants_df.drop_duplicates(subset=['chrom','pos'], keep='first')

In [8]:
# eQTL rows (either eQTL source) are kept only when pvalue<=1e-12;
# rows from every other source pass through unchanged
is_eqtl = variants_df.source.isin(['eQTL-GRASP','eQTL-susie'])
variants_df = variants_df[~is_eqtl | (variants_df.pvalue<=1e-12)]

In [9]:
# The info field carries the sequence name(s) as seq_name=<...>, terminated by ';' or end of string
seq_name_re = re.compile('seq_name=([^;]+)')
variants_df['seq_name'] = variants_df['info'].apply(lambda info: seq_name_re.search(info).group(1))

In [10]:
# ref, alt and seq_name are already parsed and the p-value filter is applied:
# the info and pvalue columns are not needed anymore
variants_df = variants_df.drop(columns=['info','pvalue'])

In [11]:
# When several comma-separated regions are assigned, keep only the 1st one
# (partition returns the whole string when there is no comma)
variants_df['seq_name'] = variants_df['seq_name'].apply(lambda names: names.partition(',')[0])

In [12]:
#3'UTR table: columns 1, 2, 3 and 5 of the headerless BED file,
#named seq_start, seq_end, seq_name and strand
utr_table = pd.read_csv(
    data_dir + '../UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed',
    sep='\t',
    header=None,
    usecols=[1,2,3,5],
    names=['seq_start','seq_end','seq_name','strand'],
)

In [13]:
# Attach the UTR coordinates and strand to every variant. No `on` is given, so the join
# uses the columns the two tables have in common; the left join keeps variants without a match.
variants_df = pd.merge(variants_df, utr_table, how='left')

In [14]:
#compute the relative position of the variant in the sequence
plus_strand = variants_df.strand=='+'
minus_strand = variants_df.strand=='-'

variants_df.loc[plus_strand,'pos_rel'] = variants_df.loc[plus_strand,'pos'] - variants_df.loc[plus_strand,'seq_start']
#on the minus strand the position is counted back from seq_end, i.e. on the reverse complemented sequence
variants_df.loc[minus_strand,'pos_rel'] = variants_df.loc[minus_strand,'seq_end'] - variants_df.loc[minus_strand,'pos'] - 1

variants_df['pos_rel'] = variants_df['pos_rel'].astype(int)

In [15]:
#variants_df = variants_df[variants_df.pos_rel<5000] #limit by the maximum sequence length of trained models

In [16]:
# for each positive variant assign a negative variant
#
#N_pairs = 10000
#
#neg_df = variants_df[variants_df.label==0]
#pos_df = variants_df[variants_df.label==1]
#
#variants_df['bucket'] = None
#variants_df['split'] = None
#
#last_bucket_source = {source:0 for source in variants_df.source.unique()} #index of last positive-negative pair for each source
#
#for chrom in tqdm(pos_df.chrom.unique()):
#    chrom_df_pos = pos_df[pos_df.chrom==chrom].copy()
#    chrom_df_neg = neg_df[neg_df.chrom==chrom].copy()
#    for pos_idx, pos in chrom_df_pos.pos.items():
#        if len(chrom_df_neg)==0:
#            break
#        neg_idx_nearest = np.searchsorted(chrom_df_neg.pos, pos) #nearest negative variant index
#        neg_idx = chrom_df_neg.index[min(neg_idx_nearest,len(chrom_df_neg)-1)]
#        pos_diff = abs(chrom_df_neg.loc[neg_idx,'pos']-pos) #difference between the variant positions in the pair
#        source = variants_df.loc[pos_idx,'source'] #source of the positive variant
#        if abs(pos_diff)<1e10:
#            variants_df.loc[[neg_idx,pos_idx],'bucket']=last_bucket_source[source]
#            variants_df.loc[[neg_idx,pos_idx],'split']= source
#            last_bucket_source[source] += 1
#            chrom_df_neg.drop(neg_idx, inplace=True) # once the negative variant is chosen we can't take it anymore
#
#variants_df = variants_df[~variants_df.split.isna()] #remove variants without a pair
#
#limit the number of variant pairs from each source to N_pairs
#for split in ('gnomAD','clinvar','eQTL'):
#    buckets_chosen = np.random.choice(np.arange(last_bucket_source[split]), replace=False, size=min(N_pairs,last_bucket_source[split]))
#    variants_df = variants_df[(variants_df.split!=split)|(variants_df.bucket.isin(buckets_chosen))]
#
#variants_df.groupby('split').label.value_counts()

In [17]:
#variants_df.to_csv(data_dir+'perbase_pred/variants_snp_balanced.tsv', sep='\t', index=None) #POSITIONS are 0-based!

In [18]:
# Build one split per source:
#   * at most N_vars positive variants (label==1) of that source
#   * up to N_vars negative variants (label==0) sampled from a shared pool
# Negatives sampled for gnomAD and clinvar are removed from the pool, so they are not
# reused by later splits. Negatives sampled for the eQTL splits are NOT removed: both
# eQTL splits sample from the same pool with the same random_state and therefore
# receive the same negative variants.

N_vars = 10000

selected = []

neg_df = variants_df[variants_df.label==0].copy()

for source in ('gnomAD','clinvar','eQTL-GRASP','eQTL-susie'):
    pos_source = variants_df[(variants_df.source==source)&(variants_df.label==1)]
    pos_source = pos_source.sample(n=min(len(pos_source),N_vars), replace=False, random_state=1)
    # cap at the pool size: sample() without replacement raises when n exceeds the number of rows
    neg_source = neg_df.sample(n=min(len(neg_df),N_vars), replace=False, random_state=1)
    selected.append(pos_source.assign(split=source))
    selected.append(neg_source.assign(split=source))
    if 'eQTL' not in source:
        neg_df = neg_df.drop(neg_source.index)

df = pd.concat(selected)

# every row received a split above, so this filter is a safeguard only;
# .copy() keeps variants_df independent of df
variants_df = df[~df.split.isna()].copy()

variants_df.groupby('split').label.value_counts()

split       label
clinvar     0.0      10000
            1.0        249
eQTL-GRASP  0.0      10000
            1.0        751
eQTL-susie  1.0      10000
            0.0      10000
gnomAD      0.0      10000
            1.0      10000
Name: count, dtype: int64

In [19]:
# Save the selected variants without the row index. POSITIONS are 0-based!
variants_df.to_csv(data_dir+'selected/variants_snp.tsv', sep='\t', index=False)

# Seed variants to sequences

We will create FASTA files that foundation models can use to compute ref and alt embeddings for each variant.

In [20]:
# Reload the variant table from the same path the selection step saved it to
variants_df = pd.read_csv(data_dir+'selected/variants_snp.tsv', sep='\t') 

In [21]:
# Order rows by descending label, so that label 1 comes before label 0
variants_df = variants_df.sort_values('label', ascending=False)

In [22]:
def reverse_complement(seq):
    '''
    Return the reverse complement of a sequence.

    A, C, G and T are replaced by their complementary base, every other
    character is copied unchanged, and the order of characters is reversed.
    '''
    pairing = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
    return ''.join(pairing.get(base, base) for base in reversed(seq))

In [23]:
def extract_utr_seq(var, seq_len, rc_on_negative=False):
    '''
    Build the reference and the alternative sequence around one variant.

    A window of seq_len bases is fetched from the global `human_fasta` such that
    var.pos lands at index seq_len//2 of the window. The alternative sequence is
    the window with var.alt substituted at that index (vartype other than 'DEL'),
    or with all ref bases after the first one removed (vartype 'DEL').

    Parameters:
        var: row-like object with attributes chrom, pos, ref, alt, vartype
             (and strand, read only when rc_on_negative is True)
        seq_len: length of the reference window
        rc_on_negative: if True, both sequences are reverse complemented
                        for variants with strand '-'

    Returns:
        (refseq, altseq), or (None, None) when the fetched sequence does not
        carry var.ref at the variant position.

    Raises:
        AssertionError: if the fetched window is not seq_len long or contains 'N'.
    '''
    varpos = seq_len//2 #relative variant position in the sequence

    # the end is derived from the start, so an odd seq_len also gives exactly seq_len bases
    # (pos+seq_len//2 would give one base less and always trip the length check)
    start = var.pos-varpos
    refseq = human_fasta.fetch(var.chrom, start, start+seq_len).upper()

    assert len(refseq)==seq_len, f'fetched {len(refseq)} bases instead of {seq_len} for {var.chrom}:{var.pos}'

    assert 'N' not in refseq, f"N in the reference window of {var.chrom}:{var.pos}" #avoid N's for NT models

    altseq = list(refseq)

    if var.vartype!='DEL':
        if refseq[varpos] != var.ref:
            return (None, None)
        # a multi-base alt (insertion) becomes a single list item and expands on join
        altseq[varpos] = var.alt
    else:
        if refseq[varpos:varpos+len(var.ref)] != var.ref:
            return (None, None)
        # keep the first ref base and delete the remaining ones
        altseq[varpos+1:varpos+len(var.ref)] = ''

    altseq = ''.join(altseq)

    #for genes on the negative strand, take reverse complement
    if rc_on_negative and var.strand=='-':
        altseq = reverse_complement(altseq)
        refseq = reverse_complement(refseq)

    return refseq, altseq

In [24]:
# Reference genome handle, read by extract_utr_seq through the global name human_fasta.
# The cluster location stays the default; set HG38_FASTA to use another copy of the genome.
human_fasta = pysam.FastaFile(
    os.environ.get('HG38_FASTA',
                   '/lustre/groups/epigenereg01/workspace/projects/vale/human_genomes/hg38.fa'))

In [25]:
# Variant identifier of the form <chrom>_<pos>_<ref>_<alt>
variants_df['var_id'] = (
    variants_df['chrom'] + '_'
    + variants_df['pos'].astype(str) + '_'
    + variants_df['ref'] + '_'
    + variants_df['alt']
)

In [26]:
# Keep only the first row for every var_id
variants_df = variants_df.drop_duplicates(subset='var_id', keep='first')

In [27]:
def write_fasta(fasta_path, seq_len, rc_on_negative):
    '''
    Write the ref and alt sequence of every row of the global `variants_df` to a FASTA file.

    For each variant two records are written, '>{var_id}_ref' and '>{var_id}_alt',
    with the sequences returned by extract_utr_seq split into lines of 80 characters.

    Parameters:
        fasta_path: path of the output file (overwritten if it exists)
        seq_len: passed on to extract_utr_seq
        rc_on_negative: passed on to extract_utr_seq

    Returns:
        number of ref/alt pairs written

    Raises:
        AssertionError: if extract_utr_seq returns no sequence for a variant, or if
                        the ref and alt sequences are identical. The output file is
                        left incomplete in that case.
    '''
    line_width = 80 #standard FASTA width

    def write_record(f, header, seq):
        f.write(f'>{header}\n')
        # plain slicing gives the same lines as textwrap.wrap for a sequence
        # without whitespace or hyphens, without the text-wrapping overhead
        for start in range(0, len(seq), line_width):
            f.write(seq[start:start+line_width]+'\n')

    c = 0

    with open(fasta_path, 'w') as f:
        for idx, var in tqdm(variants_df.iterrows(), total=len(variants_df)):
            refseq, altseq = extract_utr_seq(var, seq_len, rc_on_negative)
            # check for a missing sequence first: (None, None) would otherwise fail
            # the ref/alt comparison below with a misleading error
            assert refseq is not None, f'{var.var_id}: no sequence extracted (reference allele mismatch)'
            assert refseq!=altseq, f'{var.var_id}: ref and alt sequences are identical'
            write_record(f, f'{var.var_id}_ref', refseq)
            write_record(f, f'{var.var_id}_alt', altseq)
            c += 1
    return c

In [28]:
for seq_len in (1024,):

    # (output file, rc_on_negative): the dna_fwd file keeps every sequence as fetched,
    # the rna file is written with rc_on_negative=True
    fasta_jobs = (
        (f'selected/variants_dna_fwd_{seq_len}bp.fa', False),
        (f'selected/variants_rna_{seq_len}bp.fa', True),
    )

    for rel_path, rc_on_negative in fasta_jobs:
        N_pairs = write_fasta(data_dir + rel_path, seq_len, rc_on_negative)
        print(f'{N_pairs} pairs written')


100%|██████████| 51000/51000 [00:45<00:00, 1123.89it/s]


51000 pairs written


100%|██████████| 51000/51000 [00:21<00:00, 2387.94it/s]

51000 pairs written



