In [20]:
import pandas as pd
import numpy as np
import re
import os
from textwrap import wrap
from tqdm import tqdm

In [21]:
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/variants/'

## Collect variants from all sources

In [22]:
clinvar_variants = pd.read_csv(data_dir + 'prefiltered/clinvar/clinvar.3utr.pathogenic.bed', usecols=[0,1,3],names=['chrom','pos','info'], sep='\t') #clinvar (Likely) pathogenic

flt = (clinvar_variants['info'].str.contains('no_assertion'))|(clinvar_variants['info'].str.contains('no_interpretation'))
clinvar_variants = clinvar_variants[~flt]

clinvar_variants['source'] = 'clinvar'
clinvar_variants['label'] = 1

In [23]:
eQTL_susie = pd.read_csv(data_dir + 'prefiltered/eQTL-susie/eQTL.3utr.bed', usecols=[0,1,3],names=['chrom','pos','info'], sep='\t') #eQTLs with pvalue<1e-30

eQTL_susie['pvalue'] = eQTL_susie['info'].apply(lambda x: re.search('pvalue=([0-9\.\-eE]+)',x).groups()[0]).astype(float)

eQTL_susie['source'] = 'eQTL-susie'
eQTL_susie['label'] = 1

In [24]:
gnomAD_variants = pd.read_csv(data_dir + 'prefiltered/gnomAD/gnomAD_GRCh38.utr3.bed', usecols=[0,1,3],names=['chrom','pos','info'], sep='\t')

is_low_frequency = gnomAD_variants['info'].apply(lambda x: re.search('AC=([0-9]+)',x).groups()[0]=='1') #putative functional variants with AC=1
gnomAD_variants['AF']=gnomAD_variants['info'].apply(lambda x: re.search('AF=([-e\.0-9]+)',x).groups()[0]).astype(float)

gnomAD_variants.loc[is_low_frequency,'label'] = 1 
gnomAD_variants.loc[~is_low_frequency,'label'] = 0 #putative non-functional, must be only with AF>5% (see dataprep pipeline)

gnomAD_variants['source'] = 'gnomAD'

In [25]:
variants_df = pd.concat([clinvar_variants,eQTL_susie,gnomAD_variants])

In [26]:
for vartype in ('snps',):
    CADD_pos = pd.read_csv(data_dir + f'prefiltered/CADD/CADD.3utr.{vartype}.pos.bed', usecols=[0,1,3],names=['chrom','pos','info'], sep='\t')
    CADD_pos['source'] = 'CADD'
    CADD_pos['label'] = 1
    variants_df = pd.concat([variants_df,CADD_pos])

for vartype in ('snps',):
    CADD_neg = pd.read_csv(data_dir + f'prefiltered/CADD/CADD.3utr.{vartype}.neg.bed', usecols=[0,1,3],names=['chrom','pos','info'], sep='\t')
    CADD_neg['source'] = 'CADD'
    CADD_neg['label'] = 0
    variants_df = pd.concat([variants_df,CADD_neg])

In [27]:
variants_df['ref'] = variants_df['info'].apply(lambda x: re.search('GT=([ACTG/]+)',x).groups()[0].split('/')[0]) 
variants_df['alt'] = variants_df['info'].apply(lambda x: re.search('GT=([ACTG/]+)',x).groups()[0].split('/')[1]) 

In [28]:
len_ref = variants_df.ref.apply(len)
len_alt = variants_df.alt.apply(len)

variants_df.loc[(len_ref==1)&(len_alt==1),'vartype'] = 'SNP'
variants_df.loc[(len_ref>1)&(len_alt==1),'vartype'] = 'DEL'
variants_df.loc[(len_ref==1)&(len_alt>1),'vartype'] = 'INS'

In [29]:
variants_df = variants_df[variants_df.vartype=='SNP'] #take only SNPs
#variants_df = variants_df[(variants_df.ref.isin(list('ACGT')))&(variants_df.alt.isin(list('ACGT')))] #only correct SNPs

In [30]:
variants_df.loc[:,'var_id'] = variants_df.chrom + '_' + variants_df.pos.astype(str) + '_' + variants_df.ref + '_' + variants_df.alt

In [31]:
variants_df.reset_index(inplace=True,drop=True)

gnomad_neg = variants_df.loc[(variants_df.source=='gnomAD')&(variants_df.label==0),['chrom','pos','ref','alt','AF']]

variants_df = variants_df.drop(columns='AF').merge(gnomad_neg, how='left')

In [32]:
#drop gnomAD negative variants that overlap with positive variants (excluding CADD since that's a separate dataset)
pos_vars = variants_df.loc[(variants_df.source!='CADD')&(variants_df.label==1),['chrom','pos','ref','alt']].drop_duplicates()

idx_to_drop = gnomad_neg.reset_index().merge(pos_vars,how='inner')['index'].values

variants_df.drop(idx_to_drop,inplace=True)

In [33]:
variants_df = variants_df[(variants_df.source!='eQTL-susie')|(variants_df.pvalue<=1e-25)]

In [34]:
variants_df['seq_name'] = variants_df['info'].apply(lambda x: re.search('seq_name=([^,;]+)',x).groups()[0])

In [35]:
variants_df.drop(columns=['info','pvalue'], inplace=True) # we don't need info column anymore

In [36]:
#3'UTR table

utr_table = pd.read_csv(data_dir + '../UTR_coords/GRCh38_3_prime_UTR_clean-sorted.bed', sep='\t',
                       header = None, names=['seq_start','seq_end','seq_name','strand'], usecols=[1,2,3,5])

In [37]:
variants_df = variants_df.merge(utr_table, how='left')

In [38]:
#if there're more than one 3'UTRs for a given variant, take the longest
variants_df['seq_len'] = variants_df.seq_end-variants_df.seq_start
variants_df = variants_df.sort_values(by='seq_len',ascending=False).drop_duplicates(['var_id','source'],keep='first')

In [68]:
variants_df['pos_rel'] = variants_df.pos-variants_df.seq_start #relative variant position w.r.t. 3'UTR sequence

In [69]:
# take at most N_vars positive variants for each split
# for each split, choose N_vars negative variants
# negative variants for different splits will not overlap

N_vars = 10_000

df = []

for source in ('CADD', 'gnomAD','clinvar','eQTL-susie'):
    
    pos_df = variants_df[(variants_df.label==1)&(variants_df.source==source)]
    
    pos_source = pos_df.sample(n=min(len(pos_df),N_vars), replace=False, random_state=1)
    pos_source['split'] = source
    df.append(pos_source)
    
    if source!='CADD':
        #for clinvar,gnomAD, and eQTL-, we use negative variants from gnomAD
        neg_df = variants_df[(variants_df.label==0)&(variants_df.source=='gnomAD')]
    else:
        #for CADD, we use negative variants from CADD
        neg_df = variants_df[(variants_df.label==0)&(variants_df.source=='CADD')]
    
    neg_source = neg_df.sample(n=min(len(neg_df),N_vars), replace=False, random_state=1) 
    neg_source['split'] = source
    df.append(neg_source)
    
df = pd.concat(df)

variants_df = df[~df.split.isna()] #remove variants without a pair

variants_df.groupby('split').label.value_counts()

split       label
CADD        0.0      10000
            1.0      10000
clinvar     0.0      10000
            1.0        261
eQTL-susie  0.0      10000
            1.0      10000
gnomAD      0.0      10000
            1.0      10000
Name: count, dtype: int64

In [77]:
(variants_df[variants_df.source=='eQTL-susie'].AF>0.05).mean()

0.9525

In [76]:
output_name= data_dir+'selected/variants_snp.tsv'

#POSITIONS are 0-based!
variants_df.to_csv(output_name, sep='\t', index=None) #POSITIONS are 0-based!

In [29]:
variants_df = pd.read_csv(data_dir+'selected/variants_snp.tsv', sep='\t') #POSITIONS are 0-based!

In [30]:
#create a smaller subset since computing variant influence score is too time-consuming
# we will compute vif on this subset

subset = variants_df.groupby(['split','label']).apply(lambda x:x.sample(n=min(3000,len(x)),replace=False,random_state=1),include_groups=False)

subset = subset.droplevel(-1).reset_index().drop_duplicates()

vis_whitelist = pd.concat([subset.var_id+'_ref',subset.var_id+'_alt'])

In [31]:
vis_whitelist.to_csv(data_dir+'selected/vis_whitelist.tsv', index=None,header=None)

In [32]:
ref_whitelist = variants_df.var_id + '_ref'
ref_whitelist.to_csv(data_dir+'selected/reference_allele.tsv', index=None,header=None)

# Seed variants to sequences

In [13]:
WINDOW_WIDTH = 4096
WINDOW_WIDTH_VIS = 2048

In [4]:
variants_df = pd.read_csv(data_dir+'selected/variants_snp.tsv', sep='\t') 

In [5]:
variants_df.drop_duplicates(subset='var_id', inplace=True)

In [6]:
import pysam

human_fasta = pysam.FastaFile(data_dir + '../fasta/hg38.fa')

In [7]:
def reverse_complement(seq):
    '''
    Take sequence reverse complement
    '''
    compl_dict = {'A':'T', 'C':'G', 'G':'C', 'T':'A','a':'t', 'c':'g', 'g':'c', 't':'a'}
    compl_seq = ''.join([compl_dict.get(x,x) for x in seq])
    rev_seq = compl_seq[::-1]
    return rev_seq

In [8]:
def extract_utr_seq(var, seq_len, 
                    rc_on_negative=False, mask_beyond_3utr=False,seq_clip_3utr=False,var_to_lower=False):

    left = var.pos-seq_len//2
    right = var.pos+seq_len//2

    if seq_clip_3utr:
        left = max(left,var.seq_start)
        right = min(right,var.seq_end)

    refseq = human_fasta.fetch(var.chrom, left, right).upper()
    
    varpos = var.pos-left#relative variant position in the sequence

    refseq = np.array(list(refseq))

    if mask_beyond_3utr:
        assert seq_clip_3utr==False
        refseq[:max(var.seq_start-(var.pos-seq_len//2),0)] = 'N'
        if var.pos+seq_len//2>var.seq_end:
            refseq[var.seq_end-(var.pos+seq_len//2):] = 'N'

    assert len(refseq)==right-left
            
    altseq = refseq.copy()
     
    if var.vartype!='DEL':
        if refseq[varpos] != var.ref:
            return (None, None)
        altseq[varpos] = var.alt
    else:
        if refseq[varpos:varpos+len(var.ref)] != var.ref:
            return (None, None)
        altseq[varpos+1:varpos+len(var.ref)] = ''

    if var_to_lower:
        refseq[varpos] = refseq[varpos].lower()
        altseq[varpos] = altseq[varpos].lower()

    altseq = ''.join(altseq)
    refseq = ''.join(refseq)
    
    #for genes on the negative strand, take reverse complement
    if rc_on_negative and var.strand=='-':
        altseq = reverse_complement(altseq)
        refseq = reverse_complement(refseq)

    return refseq, altseq

In [9]:
def write_fasta(fasta_path, **kwargs):

    c = 0
    
    with open(fasta_path, 'w') as f:
        for idx, var in tqdm(variants_df.iterrows(), total=len(variants_df)):
            refseq, altseq = extract_utr_seq(var, **kwargs)
            assert refseq!=None, f'{var}'
            f.write(f'>{var.var_id}_ref\n')
            for line in wrap(refseq, 80): #wrap reference sequence with standard FASTA width
                f.write(line+'\n')
            f.write(f'>{var.var_id}_alt\n')
            for line in wrap(altseq, 80): #wrap alternative sequence with standard FASTA width
                f.write(line+'\n')
            c += 1
    return c

### Create FASTA for variants

In [None]:
N_pairs=write_fasta(data_dir + f'selected/variants_dna_fwd.fa',seq_len=WINDOW_WIDTH,rc_on_negative=False,mask_beyond_3utr=False,
                   seq_clip_3utr=False,var_to_lower=True)

print(f'{N_pairs} pairs written')

N_pairs=write_fasta(data_dir + f'selected/variants_rna.fa',seq_len=WINDOW_WIDTH,rc_on_negative=True,mask_beyond_3utr=False,
                   seq_clip_3utr=True,var_to_lower=True)

print(f'{N_pairs} pairs written')

In [14]:
! samtools faidx {data_dir}/selected/variants_rna.fa

In [14]:
! samtools faidx {data_dir}/selected/variants_dna_fwd.fa

### Create FASTA files for VIS computation

We create smaller FASTA files for variant influence score since computing predictions for large sequences is time consuming

In [14]:
N_pairs=write_fasta(data_dir + f'selected/variants_VIS_dna_fwd.fa',seq_len=WINDOW_WIDTH_VIS,rc_on_negative=False,mask_beyond_3utr=False,
                   seq_clip_3utr=False,var_to_lower=True)

print(f'{N_pairs} pairs written')

N_pairs=write_fasta(data_dir + f'selected/variants_VIS_rna.fa',seq_len=WINDOW_WIDTH_VIS,rc_on_negative=True,mask_beyond_3utr=False,
                   seq_clip_3utr=True,var_to_lower=True)

print(f'{N_pairs} pairs written')

100%|██████████| 50228/50228 [10:47<00:00, 77.52it/s] 


50228 pairs written


100%|██████████| 50228/50228 [01:49<00:00, 460.74it/s]

50228 pairs written





In [15]:
! samtools faidx {data_dir}/selected/variants_VIS_rna.fa

In [16]:
! samtools faidx {data_dir}/selected/variants_VIS_dna_fwd.fa