## Create variant tensors

We construct variant tensors for each variant in a given VCF file.

In [1]:
import pickle
import os
import sys
import time
from typing import *

import numpy as np
import pandas as pd

import pysam #library for reading VCF files

sys.path.append("python/")

from variant_to_tensor import variant_to_tensor #function to form a tensor out of a variant

## Auxiliary functions

In [2]:
class dotdict(dict):
    """
    Dictionary with dot.notation access to its items.

    ``d.key`` reads ``d['key']``, ``d.key = v`` writes ``d['key']`` and
    ``del d.key`` removes ``d['key']``.

    A missing key raises AttributeError (not KeyError) on attribute access,
    so that ``hasattr``, ``getattr(d, name, default)`` and the protocol
    lookups done by ``copy``/``pickle`` behave as for any other object.
    """

    def __getattr__(self, name):
        #only called when normal attribute lookup fails, so dict methods are unaffected
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

In [3]:
def dump_batch(batch, info, batch_path):
    '''
    Pickle a batch of tensors together with its annotations.

    The file at batch_path receives a dictionary with two keys:
    'images' (the batch) and 'info' (the annotations).
    Nothing is written when the global SIMULATE is truthy.
    '''
    if SIMULATE:
        return

    payload = {'images':batch, 'info':info}

    with open(batch_path, 'wb') as batch_file:
        pickle.dump(payload, batch_file)

In [9]:
def gen_tensors(vcf :str,                             #full path to a VCF file with the variants
               bam_dir: str,                          #directory with corresponding BAM files
               output_dir :str,                       #output dir for tensor batches
               refgen_fa :str,                        #reference genome FASTA file
               tensor_opts :Dict,                     #options for variant tensor encoding
               Lbatch :Optional[int] = 1,             #how many tensors put in each batch
               chrom :Optional[str] = None,           #chromosome name
               max_variants :Optional[int] = None,    #stop when this number of variants is reached
               bam_matching_csv :Optional[str] = '',  #matching table between BAM sample name and BAM file name
             ):
    '''
    Create a pileup tensor for each variant in the given VCF file.

    For each variant a sample BAM file is required.
    BAM file name can be encoded directly as a record BAM=bam_file_name.bam in the VCF INFO field (without the path).
    Otherwise, it is inferred from the sample names in the VCF file using bam_matching_csv:
    one tensor is attempted for every sample whose genotype is not (None, None).

    Tensors are packed in batches of size Lbatch (the last batch may be smaller).
    Batches are written with dump_batch unless the global SIMULATE is truthy;
    in that case no folders are created and nothing is written.
    To avoid file system issues, batches are distributed into numbered subfolders of output_dir.

    A batch is named after the VCF record index of its first variant: "<idx>.imgb".
    When several consecutive batches start at the same VCF record (e.g. several samples
    carry the variant and Lbatch is small), the following ones are named "<idx>_1.imgb",
    "<idx>_2.imgb", ... so that they do not overwrite each other.

    Variants for which variant_to_tensor raises an exception are reported on stdout and skipped.

    Tensor options (width, height etc...) are defined in the tensor_opts dictionary.
    See the variant_to_tensor function to learn more about tensor options.

    Returns a DataFrame with one row of annotations (position, alleles, BAM, VAF, DP,
    tensor height, batch name, subdir...) per created tensor. The "vcf" column is left empty.
    '''

    tensors_per_subdir = 100*Lbatch #a new subdir is started every tensors_per_subdir VCF records

    df_columns = ["vcf", "vcf_record_idx", "chrom", "pos", "ref", "alt", "ref_support", "BAM", "VAF", "DP", "tensor_height", "batch_name", "subdir"] #columns of the returned annotations DataFrame

    if not SIMULATE:
        os.makedirs(output_dir, exist_ok=True)

    if bam_matching_csv:
        #matching table between BAM sample name and BAM file name
        #otherwise, the INFO field of the VCF file should have the BAM=bam_file_name.bam record
        #(DataFrame.squeeze replaces the squeeze=True option removed from read_csv in pandas 2.0)
        bam_matching = pd.read_csv(bam_matching_csv, names=['BAM_sample', 'BAM_file'], index_col=0).squeeze("columns")
        bam_matching = bam_matching.apply(lambda x:x.replace('.bam','')+'.bam')

    all_records = []    #annotations of all variants that are already assigned to a batch

    batch_tensors = []  #tensors of the current (not yet saved) batch
    batch_records = []  #annotations of the current batch, parallel to batch_tensors

    N_variants_added = 0 #total number of variants added

    batch_subdir = None  #name of the current subdir, set on the first VCF record

    last_first_idx = None #VCF record index of the first variant in the previously saved batch
    n_same_first_idx = 0  #how many earlier batches already started at last_first_idx

    def flush_batch():
        '''
        Name the current batch, save it (unless SIMULATE) and move its annotations to all_records
        '''
        nonlocal batch_tensors, batch_records, last_first_idx, n_same_first_idx

        first_idx = batch_records[0]['vcf_record_idx'] #VCF record index of the 1st variant in the batch

        if first_idx == last_first_idx:
            #a previous batch already uses this record index: add a suffix to avoid overwriting it
            n_same_first_idx += 1
            batch_name = f'{first_idx}_{n_same_first_idx}.imgb'
        else:
            last_first_idx = first_idx
            n_same_first_idx = 0
            batch_name = f'{first_idx}.imgb'

        for record in batch_records:
            record['batch_name'] = batch_name
            record['subdir'] = batch_subdir #the folder the batch is actually written to

        if not SIMULATE:
            #save batch to the disk
            dump_batch(batch_tensors, batch_records, os.path.join(output_dir, batch_subdir, batch_name))

        all_records.extend(batch_records)

        batch_tensors = []
        batch_records = []

    with pysam.VariantFile(vcf) as vcf_in: #the VCF file is closed when we are done or on error

        all_samples = list(vcf_in.header.samples) #extract BAM sample names from the VCF header

        #iterate over the records of the vcf file
        for vcf_record_idx, rec in enumerate(vcf_in.fetch(contig = chrom)):

            if max_variants and N_variants_added >= max_variants:
                break

            if vcf_record_idx%tensors_per_subdir==0:
                #switch to a new subdir every tensors_per_subdir VCF records
                batch_subdir = str(vcf_record_idx//tensors_per_subdir)
                if not SIMULATE:
                    os.makedirs(os.path.join(output_dir, batch_subdir), exist_ok = True)

            #in a VCF file we have BAM sample names and we need the names of corresponding BAM files
            if bam_matching_csv:
                #get the file names from the matching table
                bam_sample_names = [s for s in all_samples if rec.samples[s]['GT']!=(None,None)]
                bam_file_names = bam_matching.loc[bam_sample_names]
            else:
                #otherwise, use BAM file name from the VCF record
                bam_file_name = rec.info.get('BAM')[0].replace('.bam','')+'.bam' #when the BAM file name is defined in the INFO field
                bam_file_names = [bam_file_name] #for compatibility

            #loop over all BAM files that have this variant
            for bam_file_name in bam_file_names:

                bam_path = os.path.join(bam_dir, bam_file_name) #full path to the BAM file

                variant = {'pos':rec.pos, 'refpos':rec.pos, 'chrom':rec.chrom, 'ref':rec.ref, 'alt':rec.alts[0]}

                try:

                    #get a variant tensor for the current variant
                    variant_tensor, ref_support, VAF, DP = variant_to_tensor(variant, refgen_fa, bam_path, check_variant="snps",
                         **tensor_opts)

                except Exception as exc:

                    #best effort: report the failed variant and go on with the next one
                    print('-------------------------------------------------')
                    print('Exception occured while creating a variant tensor')
                    print('Variant:\n', variant)
                    print('Reference FASTA file:\n', refgen_fa)
                    print('BAM file:\n', bam_path)
                    print('Error message:\n', exc)

                    continue

                batch_tensors.append(variant_tensor) #add current variant to the batch

                batch_records.append({
                     'vcf_record_idx':vcf_record_idx,
                     'chrom': variant['chrom'],
                     'pos': variant['pos'],
                     'ref': variant['ref'],
                     'alt': variant['alt'],
                     'subdir': batch_subdir, #updated in flush_batch when the batch is saved
                     'BAM': bam_file_name,
                     'VAF': VAF,
                     'DP': DP,
                     'tensor_height':variant_tensor['p_hot_reads'].shape[0],
                     'ref_support': ref_support,
                    })

                N_variants_added += 1

                if len(batch_tensors) >= Lbatch:
                    #save the batch to the disk when it is full
                    flush_batch()

                if max_variants and N_variants_added >= max_variants:
                    #do not exceed max_variants when a record has several BAM files
                    break

    if batch_tensors:
        #save the last, incomplete batch
        flush_batch()

    variants_df = pd.DataFrame(all_records, columns=df_columns) # DataFrame for variant annotations

    return variants_df

## Main workflow

In [10]:
#All input parameters of the notebook, collected in a single constructor call.
#Parameters starting with 'tensor_' are later passed to variant_to_tensor, the others to gen_tensors.
input_params = dotdict(
    #vcf file with variants
    #(alternative: '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/GACA-CN/gnomAD_thr_0/vcfs/negative_train_nn.vcf.gz')
    vcf = '/storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/germline/filtered/vqsr/99.9/t.vcf.gz',
    #output dir name
    #(alternative: '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/GACA-CN/gnomAD_thr_0/images/')
    output_dir = '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/test',
    #folder with BAM files
    #(alternative: '/storage/groups/epigenereg01/workspace/projects/vale/data/icgc/GACA-CN/bam/')
    bam_dir = '/storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/',
    #Reference genome FASTA file
    refgen_fa = '/storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/resources_GRCh37/GRCh37.fa',
    #size of tensors batches (alternative: 4)
    Lbatch = 1,
    #chromosome name, to limit tensors generation to a particular contig (alternative: '1')
    chrom = None,
    #matching table between BAM sample name and BAM file name (see gen_tensors)
    bam_matching_csv = '/storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/bam_matching.csv',
    #maximum number of variants from the VCF to consider
    max_variants = None,
    #tensor width: 2x the most probable read length
    tensor_width = 150,
    #max tensor height, the probability to have a read depth above this value should be small
    tensor_max_height = 50,
    #how to crop variant tensor when read depth>tensor_max_height
    tensor_crop_strategy = 'topbottom',
    #sort reads by base in the variant column
    tensor_sort_by_variant = True,
    #check if the variant is present in actual pileup
    tensor_check_variant_column = True,
)

In [11]:
SIMULATE = 0 #simulation switch: when truthy, dump_batch writes nothing and the explicit output_dir creation is skipped

In [12]:
#create the output root up front, unless we only simulate the run
if not SIMULATE:
    os.makedirs(input_params.output_dir, exist_ok = True)

In [13]:
tensor_opts = dict() #parameters for the variant_to_tensor function

gen_params = dict() #parameters for the gen_tensors function

for param,value in input_params.items():
    #from input parameters, separate parameters for gen_tensors and variant_to_tensor functions
    if not param.startswith('tensor_'):
        gen_params[param] = value
    else:
        tensor_opts[param] = value

if gen_params['chrom'] is not None:
    #if we are limited to a particular contig, put generated tensors in a dedicated folder
    gen_params['output_dir'] = os.path.join(gen_params['output_dir'], gen_params['chrom'])

t0 = time.time()

variants_df = gen_tensors(tensor_opts = tensor_opts, **gen_params) #dataframe with annotations of processed variants

vcf_name = os.path.basename(input_params.vcf).replace('.vcf.gz', '') #VCF base name without extension

variants_df['vcf'] = vcf_name

if SIMULATE and not os.path.isdir(gen_params['output_dir']):
    #in simulation mode the output dir may not exist: skip the annotations table instead of failing
    print(f"SIMULATE is set and {gen_params['output_dir']} does not exist: variants.csv.gz is not written.")
else:
    variants_df.to_csv(os.path.join(gen_params['output_dir'], "variants.csv.gz"))

t_exec = time.time() - t0 #total execution time

print(f"{gen_params['output_dir']}\nFinished successfully. Execution time: {t_exec//60:.0f}m {t_exec%60:.1f}s.")
print(f'{len(variants_df)} variants is created, distributed over {len(variants_df.batch_name.unique())} batches')

-------------------------------------------------
Exception occured while creating a variant tensor
Variant:
 {'pos': 10583, 'refpos': 10583, 'chrom': '1', 'ref': 'G', 'alt': 'A'}
Reference FASTA file:
 /storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/resources_GRCh37/GRCh37.fa
BAM file:
 /storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/p_0_55.tumor.bam
Error message:
 '>' not supported between instances of 'NoneType' and 'int'
-------------------------------------------------
Exception occured while creating a variant tensor
Variant:
 {'pos': 66355, 'refpos': 66355, 'chrom': '1', 'ref': 'A', 'alt': 'T'}
Reference FASTA file:
 /storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/resources_GRCh37/GRCh37.fa
BAM file:
 /storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/p_0_2.tumor.bam
Error message:
 No reads with the alternative allele found in the variant column!
-------------------------------------------------
E

KeyboardInterrupt: 