## Create variant tensor images

We construct variant tensor images for each variant in a given VCF file.

When required, data augmentation can be applied by exchanging mutation signatures 
between piled-up reads in a random pair of variants.

Output images are grouped in batches of size Lbatch.

In [1]:
import pickle
import os
import sys
import time
from typing import *

import numpy as np
import pandas as pd

import pysam #library for reading VCF files

sys.path.append("python/")

from variant_to_image import variant_to_image #function to form image out of a variant

## Auxiliary functions

In [2]:
class dotdict(dict):
    """
    Dictionary with dot.notation access to attributes.

    Attribute reads, writes and deletions are mapped onto dictionary items.
    A missing key raises AttributeError (not KeyError) on attribute access,
    so that hasattr(), getattr(obj, name, default) and the copy/pickle
    protocols, which all rely on AttributeError, keep working.
    """

    def __getattr__(self, name):
        #only called when normal attribute lookup fails
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

In [4]:
def dump_batch(batch, info, batch_path): 
    '''
    Pickle a batch of images together with its annotations.

    The file at batch_path receives a dictionary with two keys:
    'images' (the batch) and 'info' (the annotations).
    Nothing is written when the global SIMULATE flag is set.
    '''

    if SIMULATE:
        #simulation mode: leave the disk untouched
        return

    payload = {'images':batch, 'info':info}

    with open(batch_path, 'wb') as f:
        pickle.dump(payload, f)

In [16]:
def _flush_batch(variants_batch, batch_records, output_dir, batch_subdir):
    '''
    Name a batch, mark its records with that name and save the batch to the disk.

    batch_records are the annotation dictionaries of the images in variants_batch;
    they are modified in place.
    '''

    #batch name: VCF record index of the 1st variant in the batch
    #NOTE(review): two batches starting at the same VCF record (several BAM files per record)
    #would get the same name - confirm this cannot happen with the VCFs in use
    batch_name = f'{batch_records[0]["vcf_record_idx"]}.imgb'

    for record in batch_records:
        record['batch_name'] = batch_name #mark batch name in the variants list
        record['subdir'] = batch_subdir   #the batch is written to the current subdir, keep the record in sync

    if not SIMULATE:
        #save batch to the disk
        dump_batch(variants_batch, batch_records, os.path.join(output_dir, batch_subdir, batch_name))


def _extend_variants_df(variants_df, variants_list):
    '''
    Return variants_df with the accumulated variant records appended.
    '''

    if not variants_list:
        return variants_df

    #pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    return pd.concat([variants_df, pd.DataFrame(variants_list)], ignore_index=True)


def make_images(vcf :str,                             #full path to a VCF file with the variants
               bam_dir: str,                          #directory with corresponding BAM files
               output_dir :str,                       #output dir for images batches
               refgen_fa :str,                        #reference genome FASTA file
               image_opts :Dict,                      #options for variant tensor encoding
               chrom :Optional[str] = None,           #chromosome name
               start :Optional[int] = None,           #min position in the chromosome
               stop :Optional[int] = None,            #max position in the chromosome
               Lbatch :int = 32,                      #how many images put in each batch
               bam_matching_csv :str = '',            #matching table between BAM sample name and BAM file name
               max_variants :Optional[int] = None,    #stop when this number of variants is reached
               data_augmentation :bool = False,         #use data augmentation
               data_augmentation_random_state :int = 0  #random state for data augmentation
             ):         
    '''
    Create a pileup image for each variant in the given VCF file.
    
    For each variant a sample BAM file is required.
    BAM file name can be encoded directly as a record BAM=bam_file_name.bam in the VCF INFO field (without the path).
    Otherwise, it is inferred from the sample name in the VCF file using bam_matching_csv.
    
    Images are packed in batches of size Lbatch; the last batch may be smaller.
    Depending on the global SIMULATE value the batches are saved to the disk;
    when SIMULATE is set, no folders are created and no batches are written.
    To avoid file system issues, we distribute batches into subfolders in the output_dir.
    A batch is named after the VCF record index of its first variant.

    To keep record of variants created, variant annotations (DP, VAF etc...) are added to the variants_df dataframe.
    To speed up processing, they are first accumulated in variants_list and added to the variants_df only when
    more than 1000 variants are accumulated.
    
    Image options (width, height etc...) are defined in the image_opts dictionary.
    See the variant_to_image function to learn more about image options.
    
    Data augmentation is possible within variants from the given VCF by exchanging mutational context between different pileup images.

    Variants for which variant_to_image raises an exception are reported on stdout and skipped.

    Returns the variants_df dataframe with one row per image created.

    Raises ValueError when a VCF record has no BAM entry in its INFO field and no bam_matching_csv was given.
    '''
    
    images_per_subdir = 100*Lbatch #maximum images per subdir
    
    variants_df = pd.DataFrame(columns=["vcf", "vcf_record_idx","chrom","pos","refpos","ref","alt","ref_support","old_chrom","old_pos","old_refpos","old_ref","old_alt","gnomAD_AF", "GERMLINE", "BAM","VAF","DP","image_height", "batch_name", "subdir"]) # DataFrame for variant annotations 
    
    if not SIMULATE:
        os.makedirs(output_dir, exist_ok=True)

    bam_matching = None

    if bam_matching_csv:
        #matching table between BAM sample name and BAM file name
        #otherwise, the INFO field of the VCF file should have the BAM=bam_file_name.bam record
        #(.squeeze replaces the squeeze=True option of read_csv, which was removed in pandas 2.0)
        bam_matching = pd.read_csv(bam_matching_csv, names=['BAM_sample', 'BAM_file'], index_col=0).squeeze('columns')
 
    if data_augmentation:
        #replacement_df is a shuffled version of the original VCF
        #only the 8 fixed VCF columns are read, so that optional FORMAT/sample columns do not shift the names
        vcf_df = pd.read_csv(vcf, sep = '\t', comment = '#', usecols = list(range(8)),
                             names = ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info'])
        #use different random state for each chromosome (chrom may be None when all contigs are processed)
        random_state = data_augmentation_random_state + sum(ord(x) for x in (chrom or ''))
        replacement_df = vcf_df[['chrom', 'pos', 'ref', 'alt']].sample(frac = 1, random_state = random_state)
        replacement_df.chrom = replacement_df.chrom.astype(str)

    variants_batch = [] #current batch of images
        
    variants_list = []  #we will first accumulate variant annotations in a list and then add this list to the data frame
    
    N_variants_added = 0 #total number of variants added

    with pysam.VariantFile(vcf) as vcf_in: #open the VCF file, close it when done

        all_samples = list(vcf_in.header.samples) #extract BAM sample names from the VCF header

        #iterate over the records of the vcf file
        for vcf_record_idx, rec in enumerate(vcf_in.fetch(contig = chrom, start = start, stop = stop)):

            if max_variants and vcf_record_idx >= max_variants:
                #max_variants records have already been processed
                break

            if vcf_record_idx%images_per_subdir==0:
                #switch to a new subdir if the current one already has enough batches
                batch_subdir = str(vcf_record_idx//images_per_subdir)
                if not SIMULATE:
                    os.makedirs(os.path.join(output_dir, batch_subdir), exist_ok = True)

            #in a VCF file we have BAM sample names and we need the names of corresponding BAM files
            if 'BAM' in rec.info.keys():
                #if the BAM file name is included in the VCF record
                bam_file_name = rec.info.get('BAM')[0].replace('.bam','')+'.bam' #when the BAM file name is defined in the INFO field
                bam_file_names = [bam_file_name] #for compatibility
            else:
                #otherwise, get the file name from the matching table
                if bam_matching is None:
                    raise ValueError('VCF record has no BAM entry in the INFO field and bam_matching_csv is not given')
                bam_sample_names = [s for s in all_samples if rec.samples[s]['GT']!=(None,None)]
                bam_file_names = bam_matching.loc[bam_sample_names]

            #extra annotations from the VCF INFO field, the same for all BAM files of this record
            variant_annotations = {}

            for ann_name in ['gnomAD_AF', 'GERMLINE']:
                if ann_name in rec.info.keys():
                    variant_annotations[ann_name] = rec.info.get(ann_name)
                else:
                    variant_annotations[ann_name] = None

            #loop over all BAM files that have this variant
            for bam_file_name in bam_file_names:

                bam_path = os.path.join(bam_dir, bam_file_name) #full path to the BAM file

                variant = {'pos':rec.pos, 'refpos':rec.pos, 'chrom':rec.chrom, 'ref':rec.ref, 'alt':rec.alts[0]}

                if data_augmentation:
                    variant_to_replace = replacement_df.iloc[vcf_record_idx].to_dict()
                    variant_to_replace['refpos'] = variant_to_replace['pos']
                else:
                    variant_to_replace = None

                try:

                    #get a tensor variant image for the current variant
                    image, ref_support, VAF, DP = variant_to_image(variant, refgen_fa, bam_path,
                        new_variant = variant_to_replace, **image_opts)

                except Exception as exc:

                    #best effort: report the failed variant and go on with the next one
                    print('Exception occured while creating a variant image')
                    print('Variant:\n', variant)
                    print('Varinat to replace:\n', variant_to_replace)
                    print('Reference FASTA file:\n', refgen_fa)
                    print('BAM file:\n', bam_path)
                    print('Error message:\n', exc)

                    continue

                variants_batch.append(image) #add current variant to the batch

                variant_record = {
                     'vcf_record_idx':vcf_record_idx,
                     'subdir': batch_subdir,
                     'BAM': bam_file_name,
                     'VAF': VAF,
                     'DP': DP,
                     'image_height':image['p_hot_reads'].shape[0],
                     'ref_support': ref_support,
                     'data_augmentation_random_state': data_augmentation_random_state,
                    }

                variant_record.update(variant_annotations) #add extra annotations from the VCF INFO field

                if data_augmentation:
                    #the record describes the new variant, the original one is kept in the old_* fields
                    variant_record.update(variant_to_replace)
                    variant_record.update({'old_'+ann_name:ann_value for ann_name,ann_value in variant.items()})
                else:
                    variant_record.update(variant)
                    variant_record.update({'old_'+ann_name:None for ann_name in variant.keys()})

                variants_list.append(variant_record)

                N_variants_added += 1

                if N_variants_added%Lbatch == 0:

                    #save the batch to the disk when it is full
                    #variants_list is only emptied right after a batch is saved,
                    #so its last Lbatch records always belong to the current batch
                    _flush_batch(variants_batch, variants_list[-Lbatch:], output_dir, batch_subdir)

                    variants_batch = [] #empty current batch

                    if len(variants_list)>1000:
                        #add variants_list to variants_df every 1000 images
                        variants_df = _extend_variants_df(variants_df, variants_list)
                        variants_list = []

    N_batch = len(variants_batch)
        
    if N_batch:
        #save the last, incomplete batch
        _flush_batch(variants_batch, variants_list[-N_batch:], output_dir, batch_subdir)
    
    variants_df = _extend_variants_df(variants_df, variants_list)

    return variants_df

## Main workflow

In [17]:
#all user-defined parameters of the workflow, accessible as input_params.name
#parameters starting with image_ are the options for variant tensor encoding
input_params = dotdict({
    #vcf file with variants
    #alternative: '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/GACA-CN/gnomAD_thr_0/vcfs/negative_train_nn.vcf.gz'
    'vcf': '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/MLL/MLL_variants_list_20200804/vcfs/somatic_test.vcf.gz',
    #output dir name
    #alternative: '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/GACA-CN/gnomAD_thr_0/images/'
    'output_dir': '/storage/groups/epigenereg01/workspace/projects/vale/datasets/snvs/MLL/MLL_variants_list_20200804/images/somatic_test2',
    #chromosome name, to limit images generation to a particular contig (e.g. '1')
    'chrom': None,
    #folder with BAM files
    #alternative: '/storage/groups/epigenereg01/workspace/projects/vale/data/icgc/GACA-CN/bam/'
    'bam_dir': '/storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/',
    #Reference genome FASTA file
    'refgen_fa': '/storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/resources_GRCh37/GRCh37.fa',
    #use data augmentation by exchanging mutation context between reads
    'data_augmentation': 0,
    #random state for data augmentation
    'data_augmentation_random_state': 0,
    #size of images batches (alternative: 4)
    'Lbatch': 1,
    #matching table between BAM sample name and BAM file name (see make_images)
    'bam_matching_csv': '',
    #image width: 2x the most probable read length
    'image_width': 150,
    #max image height, the probability to have a read depth above this value should be small
    'image_max_height': 50,
    #how to crop variant image when read depth>image_max_height
    'image_crop_strategy': 'topbottom',
    #sort reads by base in the variant column
    'image_sort_by_variant': True,
    #check if the variant is present in actual pileup
    'image_check_variant_column': 0,
})

In [18]:
SIMULATE = 0 #simulation mode: when set, don't create any folders or write any images to disk

In [19]:
#create the output directory, unless we are only simulating
if not SIMULATE:
    os.makedirs(input_params.output_dir, exist_ok = True)

In [20]:
#how to treat input VCF: parameters for the make_images function
gen_params =  {
'start': None, #min position in the chromosome (see make_images)
'stop': None,  #max position in the chromosome (see make_images)
'max_variants': 10, #maximum number of variants from this VCF to consider
}

image_opts = dict() #parameters for the variant_to_image function

for param,value in input_params.items():
    #from input parameters, separate parameters for make_images and variant_to_image functions
    if param.startswith('image_'):
        image_opts[param] = value
    else:
        gen_params[param] = value
        
if gen_params['chrom'] is not None:
    #if we are limited to a particular contig, put generated images in a dedicated folder
    gen_params['output_dir'] = os.path.join(gen_params['output_dir'], gen_params['chrom'])
    
t0 = time.time()

variants_df = make_images(image_opts = image_opts, **gen_params) #dataframe with annotations of processed variants

vcf_name = os.path.basename(input_params.vcf).replace('.vcf.gz', '') #VCF base name without extension

variants_df['vcf'] = vcf_name

if not SIMULATE:
    #in simulation mode nothing is written to the disk, and the output dir may not even exist
    variants_df.to_csv(os.path.join(gen_params['output_dir'], "variants.csv.gz"))

t_exec = time.time() - t0 #total execution time

print(f"{gen_params['output_dir']}\nFinished successfully. Execution time: {t_exec//60:.0f}m {t_exec%60:.1f}s.")

Exception occured while creating a variant image
Variant:
 {'pos': 115258744, 'refpos': 115258744, 'chrom': '1', 'ref': 'C', 'alt': 'T'}
Varinat to replace:
 None
Reference FASTA file:
 /storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/resources_GRCh37/GRCh37.fa
BAM file:
 /storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/p_0_59.tumor.bam
Error message:
 BAM file not found
Exception occured while creating a variant image
Variant:
 {'pos': 115258748, 'refpos': 115258748, 'chrom': '1', 'ref': 'C', 'alt': 'T'}
Varinat to replace:
 None
Reference FASTA file:
 /storage/groups/epigenereg01/workspace/projects/vale/calling/MLL/resources_GRCh37/GRCh37.fa
BAM file:
 /storage/groups/epigenereg01/datasets/MLL-5000-genomes/matched_pairs/BAM/p_0_52.tumor.bam
Error message:
 BAM file not found
Exception occured while creating a variant image
Variant:
 {'pos': 73565732, 'refpos': 73565732, 'chrom': '10', 'ref': 'G', 'alt': 'T'}
Varinat to replace:
 None
Reference 