In [6]:
### Function used on 'large_svs.vcf' files to extract all information as dictionaries ###

'''
Input: 

vcfreader (from the pyvcf library) of the file

    - The vcfreader basically reads the vcf files line by line

Outputs:

1. temp_SV: all the large SVs marked as DEL, DUP, or INV regardless of filtering status from the vcf file
2. temp_BND: all the large SVs marked as BND regardless of filtering status from the vcf file

    - Note that the outputs exclude SVs marked with an UNK status
    
''' 

def extract_SV_info_wo_UNK(vcfreader):
    """Collect large-SV records from a VCF reader into column-wise dictionaries.

    Parameters
    ----------
    vcfreader : iterable of records
        Each record is read through its ``CHROM``, ``POS``, ``ID``, ``REF``,
        ``ALT``, ``QUAL``, ``FILTER``, ``INFO`` and ``samples`` attributes
        (only the ``'GT'`` entry of the first sample is used).

    Returns
    -------
    (temp_SV, temp_BND) : tuple of dict
        ``temp_SV`` holds the records whose ``INFO['SVTYPE']`` is DEL, DUP or
        INV; ``temp_BND`` holds the BND records. Each dictionary maps a column
        name to a list with one entry per kept record. Records with any other
        SVTYPE (e.g. UNK) are skipped without a message.
    """
    sv_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'QUAL', 'END',
                 'SVLEN', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC', 'FILTER')
    bnd_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'SVTYPE2',
                  'MATEID', 'QUAL', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC',
                  'FILTER')

    temp_SV = {field: [] for field in sv_fields}
    temp_BND = {field: [] for field in bnd_fields}

    def single_alt(rec):
        # Exactly one ALT allele is expected. Anything else is reported on
        # stdout, and the first allele is still the one that gets stored.
        if len(rec.ALT) != 1:
            print('ERROR')
            print('ALT: ' + str(rec.ALT))
        return str(rec.ALT[0])

    for record in vcfreader:
        sv_type = str(record.INFO['SVTYPE'])

        if sv_type in ('DEL', 'DUP', 'INV'):
            destination = temp_SV
            row = {
                'CHROM': record.CHROM,
                'POS': record.POS,
                'ID': record.ID,
                'REF': record.REF,
                'ALT': single_alt(record),
                'SVTYPE': sv_type,
                'QUAL': record.QUAL,
                'END': record.INFO['END'],
                'SVLEN': record.INFO['SVLEN'],
                'GT': str(record.samples[0]['GT']),
                'HAP_ALLELIC_FRAC': record.INFO['HAP_ALLELIC_FRAC'],
                'ALLELIC_FRAC': record.INFO['ALLELIC_FRAC'],
                'FILTER': record.FILTER,
            }
        elif sv_type == 'BND':
            destination = temp_BND
            row = {
                'CHROM': record.CHROM,
                'POS': record.POS,
                'ID': record.ID,
                'REF': record.REF,
                'ALT': single_alt(record),
                'SVTYPE': sv_type,
                'SVTYPE2': str(record.INFO['SVTYPE2']),
                'MATEID': str(record.INFO['MATEID']),
                'QUAL': record.QUAL,
                'GT': str(record.samples[0]['GT']),
                'HAP_ALLELIC_FRAC': record.INFO['HAP_ALLELIC_FRAC'],
                'ALLELIC_FRAC': record.INFO['ALLELIC_FRAC'],
                'FILTER': record.FILTER,
            }
        else:
            # Any other SVTYPE is intentionally left out of both outputs.
            continue

        for field, value in row.items():
            destination[field].append(value)

    return temp_SV, temp_BND

In [7]:
### Debugging function, prints all contents from the SV dictionary line by line ###

def debug_sv(sv_dict):
    """Print every entry of an SV dictionary, one ``FIELD: value`` line per column.

    Parameters
    ----------
    sv_dict : dict
        Column-wise SV dictionary; the number of entries is taken from the
        length of its ``'CHROM'`` list.

    Returns
    -------
    None
        The function only writes to stdout.
    """
    printed_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'QUAL',
                      'END', 'SVLEN', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC',
                      'FILTER')

    for entry in range(len(sv_dict['CHROM'])):
        print('index: ' + str(entry))
        for field in printed_fields:
            print(field + ': ' + str(sv_dict[field][entry]))

In [8]:
### Function used on 'dels.vcf' files to extract all information as dictionaries ###

'''
Input: 

vcfreader (from the pyvcf library) of the file

    - The vcfreader basically reads the vcf files line by line

Outputs:

1. temp_del: all the small dels regardless of filtering status from the vcf file
2. temp_BND: all the dels marked as BND regardless of filtering status from the vcf file

    - Note that the outputs exclude SVs marked with an UNK status
    
''' 


def extract_small_del_info(vcfreader):
    """Collect deletion records from a VCF reader into column-wise dictionaries.

    Parameters
    ----------
    vcfreader : iterable of records
        Each record is read through its ``CHROM``, ``POS``, ``ID``, ``REF``,
        ``ALT``, ``QUAL``, ``FILTER``, ``INFO`` and ``samples`` attributes
        (only the ``'GT'`` entry of the first sample is used).

    Returns
    -------
    (temp_del, temp_BND) : tuple of dict
        ``temp_del`` holds the records whose ``INFO['SVTYPE']`` is DEL and
        ``temp_BND`` holds the BND records, each as a mapping from column name
        to a list with one entry per kept record. A record with any other
        SVTYPE is not stored; ``ERROR`` and the offending type are printed.
    """
    shared_head = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE')
    shared_tail = ('GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC', 'FILTER')

    temp_del = {name: [] for name in
                shared_head + ('QUAL', 'END', 'SVLEN') + shared_tail}
    temp_BND = {name: [] for name in
                shared_head + ('SVTYPE2', 'MATEID', 'QUAL') + shared_tail}

    def first_alt_as_text(rec):
        # A single ALT allele is expected; deviations are reported on stdout
        # and the first allele is used anyway.
        if len(rec.ALT) != 1:
            print('ERROR')
            print('ALT: ' + str(rec.ALT))
        return str(rec.ALT[0])

    for record in vcfreader:
        info = record.INFO
        kind = str(info['SVTYPE'])

        if kind == 'DEL':
            sink = temp_del
            values = {
                'CHROM': record.CHROM,
                'POS': record.POS,
                'ID': record.ID,
                'REF': record.REF,
                'ALT': first_alt_as_text(record),
                'SVTYPE': kind,
                'QUAL': record.QUAL,
                'END': info['END'],
                'SVLEN': info['SVLEN'],
                'GT': str(record.samples[0]['GT']),
                'HAP_ALLELIC_FRAC': info['HAP_ALLELIC_FRAC'],
                'ALLELIC_FRAC': info['ALLELIC_FRAC'],
                'FILTER': record.FILTER,
            }
        elif kind == 'BND':
            sink = temp_BND
            values = {
                'CHROM': record.CHROM,
                'POS': record.POS,
                'ID': record.ID,
                'REF': record.REF,
                'ALT': first_alt_as_text(record),
                'SVTYPE': kind,
                'SVTYPE2': str(info['SVTYPE2']),
                'MATEID': str(info['MATEID']),
                'QUAL': record.QUAL,
                'GT': str(record.samples[0]['GT']),
                'HAP_ALLELIC_FRAC': info['HAP_ALLELIC_FRAC'],
                'ALLELIC_FRAC': info['ALLELIC_FRAC'],
                'FILTER': record.FILTER,
            }
        else:
            # Unexpected type for this file: report it and move on.
            print('ERROR')
            print(kind)
            continue

        for name, value in values.items():
            sink[name].append(value)

    return temp_del, temp_BND

In [9]:
### Function used on SV dictionaries to pick out SVs tagged with Long Ranger's built-in filters ###

## len(dict['FILTER'][i]) == 0 indicates that there is no tag on the SV
## SVs without any tag are SVs that passed all built-in filters from Long Ranger

def split_sv_by_filter(sample_dict):
    """Partition an SV dictionary by whether each entry carries any FILTER tag.

    Parameters
    ----------
    sample_dict : dict
        Column-wise SV dictionary. The number of entries is taken from its
        ``'CHROM'`` list, and each ``'FILTER'`` entry must support ``len()``.

    Returns
    -------
    (sample_filter_pass, sample_filter_fail) : tuple of dict
        Entries whose FILTER value has length 0 go to the first dictionary,
        all others to the second. Only the thirteen SV columns listed below
        are copied.
    """
    sv_columns = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'QUAL', 'END',
                  'SVLEN', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC', 'FILTER')

    sample_filter_pass = {column: [] for column in sv_columns}
    sample_filter_fail = {column: [] for column in sv_columns}

    for entry in range(len(sample_dict['CHROM'])):
        # An empty FILTER value means no filter tag was attached to this SV.
        untagged = len(sample_dict['FILTER'][entry]) == 0
        destination = sample_filter_pass if untagged else sample_filter_fail

        for column in sv_columns:
            destination[column].append(sample_dict[column][entry])

    return sample_filter_pass, sample_filter_fail

In [14]:
### Function to reconcile BNDs on the same chromosome and with only 1 MATEID to their appropriate SV types ###

def combine_BNDs_on_same_chr(BND_dict):
    """Merge paired BND calls that sit on one chromosome into single SV entries.

    Parameters
    ----------
    BND_dict : dict
        Column-wise BND dictionary with at least the columns ``CHROM``,
        ``POS``, ``ID``, ``REF``, ``SVTYPE2``, ``MATEID``, ``QUAL``, ``GT``,
        ``HAP_ALLELIC_FRAC``, ``ALLELIC_FRAC`` and ``FILTER``.

    Returns
    -------
    dict
        Column-wise SV dictionary with one entry per merged pair. A pair is
        merged only when the call's ID ends in ``'1'``, its mate is present in
        ``BND_dict``, its ``SVTYPE2`` is not ``'BND'`` and the mate lies on
        the same chromosome. The entry's ``END`` is the mate's ``POS`` and
        its ``FILTER`` is the mate's FILTER followed by the call's own FILTER.
    """
    merged_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'QUAL',
                     'END', 'SVLEN', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC',
                     'FILTER')
    output_dict = {field: [] for field in merged_fields}

    bnd_table = pd.DataFrame.from_dict(BND_dict)

    for _, call in bnd_table.iterrows():

        # Only the first half of a pair (ID ending in '1') starts a merge.
        if call['ID'][-1] != '1':
            continue

        # MATEID is stored as text like "['call_3160_2']"; strip the "['" and "']".
        mate_id = call['MATEID'][2:-2]
        mate_rows = bnd_table[bnd_table['ID'] == mate_id]

        if len(mate_rows) == 0:
            continue

        resolved_type = call['SVTYPE2']
        if resolved_type == 'BND':
            continue

        # Pairs spanning two chromosomes are left alone.
        if mate_rows['CHROM'].tolist()[0] != call['CHROM']:
            continue

        mate_pos = mate_rows['POS'].tolist()[0]

        # Deletions get start minus end as their length; every other type
        # gets end minus start.
        if resolved_type == 'DEL':
            sv_length = call['POS'] - mate_pos
        else:
            sv_length = mate_pos - call['POS']

        output_dict['CHROM'].append(call['CHROM'])
        output_dict['POS'].append(call['POS'])
        output_dict['ID'].append(call['ID'][:-2])
        output_dict['REF'].append(call['REF'])
        output_dict['ALT'].append('<' + resolved_type + '>')
        output_dict['SVTYPE'].append(resolved_type)
        output_dict['QUAL'].append(call['QUAL'])
        output_dict['END'].append(mate_pos)
        output_dict['SVLEN'].append(sv_length)
        output_dict['GT'].append(call['GT'])
        output_dict['HAP_ALLELIC_FRAC'].append(call['HAP_ALLELIC_FRAC'])
        output_dict['ALLELIC_FRAC'].append(call['ALLELIC_FRAC'])
        output_dict['FILTER'].append(mate_rows['FILTER'].tolist()[0] + call['FILTER'])

    return output_dict

In [10]:
### Utility function for sorting of variable IDs into bins with corresponding patient IDs ###

def sort_sample_by_patient_and_type(patients, var_name_list):
    """Group variable names by patient and by normal/tumour status.

    Parameters
    ----------
    patients : iterable of str
        Patient identifiers; a name belongs to a patient when it starts with
        that identifier.
    var_name_list : iterable of str
        Names to distribute.

    Returns
    -------
    dict
        ``{patient: {'NORM': [...], 'TUMOUR': [...]}}``. A name containing the
        substring ``'norm'`` is filed under ``'NORM'``; every other matching
        name is filed under ``'TUMOUR'``. Input order is kept within each list.
    """
    sample_dict = {}

    for patient in patients:
        own_names = [name for name in var_name_list if name.startswith(patient)]

        sample_dict[patient] = {
            'NORM': [name for name in own_names if 'norm' in name],
            'TUMOUR': [name for name in own_names if 'norm' not in name],
        }

    return sample_dict

In [None]:
### Function to split SV dictionaries into type-specific dictionaries ###

def split_DEL_INV_DUP(combined_svs):
    """Separate an SV dictionary into deletion, inversion and duplication dictionaries.

    Parameters
    ----------
    combined_svs : dict
        Column-wise SV dictionary; the number of entries is taken from its
        ``'CHROM'`` list and the routing is decided by ``'SVTYPE'``.

    Returns
    -------
    (temp_DEL, temp_INV, temp_DUP) : tuple of dict
        Entries with SVTYPE ``'DEL'``, ``'INV'`` and ``'DUP'`` respectively.
        Entries with any other SVTYPE appear in none of the outputs.
    """
    sv_columns = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'QUAL', 'END',
                  'SVLEN', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC', 'FILTER')

    temp_DEL = {column: [] for column in sv_columns}
    temp_INV = {column: [] for column in sv_columns}
    temp_DUP = {column: [] for column in sv_columns}

    routing = (('DEL', temp_DEL), ('INV', temp_INV), ('DUP', temp_DUP))

    for entry in range(len(combined_svs['CHROM'])):
        for type_label, bucket in routing:
            if combined_svs['SVTYPE'][entry] == type_label:
                for column in sv_columns:
                    bucket[column].append(combined_svs[column][entry])

    return temp_DEL, temp_INV, temp_DUP

In [15]:
### Utility function given an array and a value ###
### find the elements in the array that are the closest to the value ###
### returns the element and the array of indices of the elements ###

def find_nearest(array, value):
    """Locate the element(s) of ``array`` closest to ``value``.

    Parameters
    ----------
    array : array-like
        Values to search; converted with ``np.asarray``.
    value : scalar
        Target value.

    Returns
    -------
    (nearest, indices)
        ``indices`` is the array of positions whose absolute difference from
        ``value`` equals the minimum difference; ``nearest`` is the element at
        the first of those positions.
    """
    candidates = np.asarray(array)
    distances = np.absolute(candidates - value)
    smallest_distance = distances.min()

    # Keep every position that ties for the minimum, not only the first one.
    tied_positions = np.where(distances == smallest_distance)[0]
    first_position = tied_positions[0]

    return candidates[first_position], tied_positions

In [31]:
# Tie case: the elements at indices 2, 3 and 4 (6, 4, 6) are all at distance 1 from 5.
find_nearest([1, 2, 6, 4, 6], 5)

(6, array([2, 3, 4]))

In [30]:
# Exact-match case: only index 3 holds the value 5 itself, so a single index is returned.
find_nearest([1, 2, 6, 5, 4, 6], 5)

(5, array([3]))

In [28]:
### Function that computes the distance of each tumour SV to their closest counterparts in the normal sample ###

### Note that the function only focuses on the starting position (POS) to look for the closest counterpart ###

'''
Input: 

1. sv_normal_dict: dictionary of SVs from the normal sample
2. sv_tumour_dict: dictionary of SVs from the tumour sample

Outputs:

output_dict = {'NEAREST_POS':[], 'NEAREST_POS_DIST':[], \
               'NEAREST_END':[], 'NEAREST_END_DIST':[], \
               'NEAREST_TOTAL_DIST':[], 'NEAREST_NORMAL_IDX':[]}
- NEAREST_POS: where on the chromosome the nearest POS (start of SV) is in the normal sample
- NEAREST_POS_DIST: the absolute difference between the tumour SV POS and the nearest normal SV POS
- NEAREST_END: where on the chromosome the nearest END (end of SV) is in the normal sample
- NEAREST_END_DIST: the absolute difference between the tumour SV END and the nearest normal SV END
- NEAREST_TOTAL_DIST: NEAREST_POS_DIST + NEAREST_END_DIST
- NEAREST_NORMAL_IDX: index of the nearest SV from the normal dictionary

'''

def get_sv_distance_by_start(sv_normal_dict, sv_tumour_dict):
    """For every tumour SV, find the nearest normal SV on the same chromosome.

    The nearest normal SV is chosen by start position (POS) first. When
    several normal SVs tie on POS distance, the one among them whose END is
    closest to the tumour END is taken (the first one on a further tie).

    Parameters
    ----------
    sv_normal_dict : dict
        Column-wise SV dictionary of the normal sample; the ``'CHROM'``,
        ``'POS'`` and ``'END'`` columns are read.
    sv_tumour_dict : dict
        Column-wise SV dictionary of the tumour sample; the same three
        columns are read, and one output entry is produced per ``'CHROM'``
        entry.

    Returns
    -------
    dict
        Six parallel lists, one entry per tumour SV:

        - ``NEAREST_POS``: POS of the chosen normal SV
        - ``NEAREST_POS_DIST``: ``abs(NEAREST_POS - tumour POS)``
        - ``NEAREST_END``: END of the chosen normal SV
        - ``NEAREST_END_DIST``: ``abs(NEAREST_END - tumour END)``
        - ``NEAREST_TOTAL_DIST``: sum of the two distances
        - ``NEAREST_NORMAL_IDX``: index of the chosen SV in ``sv_normal_dict``

        When the normal sample has no SV on the tumour SV's chromosome, all
        six entries are the string ``'N/A'``.
    """
    output_dict = {'NEAREST_POS': [], 'NEAREST_POS_DIST': [],
                   'NEAREST_END': [], 'NEAREST_END_DIST': [],
                   'NEAREST_TOTAL_DIST': [], 'NEAREST_NORMAL_IDX': []}

    # Loop-invariant: convert the normal columns to arrays once, instead of
    # rebuilding them for every tumour SV.
    normal_chrom_array = np.array(sv_normal_dict['CHROM'])
    normal_pos_array = np.array(sv_normal_dict['POS'])
    normal_end_array = np.array(sv_normal_dict['END'])

    for i in range(len(sv_tumour_dict['CHROM'])):

        # Indices of the normal SVs that share this tumour SV's chromosome
        temp_normal_idx_array_w_chrom = \
            np.where(normal_chrom_array == np.array(sv_tumour_dict['CHROM'][i]))

        if temp_normal_idx_array_w_chrom[0].shape[0] == 0:

            # Nothing to compare against on this chromosome
            for column in output_dict:
                output_dict[column].append('N/A')
            continue

        tumour_pos = sv_tumour_dict['POS'][i]
        tumour_end = sv_tumour_dict['END'][i]

        # Find POS with smallest distance (may return several tied candidates)
        temp_normal_pos_array_w_chrom = normal_pos_array[temp_normal_idx_array_w_chrom]
        temp_closest_pos, temp_idx_array = find_nearest(temp_normal_pos_array_w_chrom, tumour_pos)

        # Find END with smallest distance, restricted to the POS-tied candidates
        temp_normal_end_array_w_chrom = normal_end_array[temp_normal_idx_array_w_chrom]
        temp_closest_end, temp_idx_end = find_nearest(temp_normal_end_array_w_chrom[temp_idx_array],
                                                      tumour_end)

        # Map the winner back to an index into the full normal dictionary:
        # END-candidate position -> same-chromosome position -> global index
        temp_normal_idx = temp_normal_idx_array_w_chrom[0][temp_idx_array[temp_idx_end[0]]]

        pos_dist = abs(temp_closest_pos - tumour_pos)
        end_dist = abs(temp_closest_end - tumour_end)

        # Sanity checks: the values reported by find_nearest must match the
        # normal SV that the recovered index points at.
        if pos_dist != abs(sv_normal_dict['POS'][temp_normal_idx] - tumour_pos):

            print('POS')
            print(i)
            # str() so a non-string chromosome label cannot break the report
            print('CHROM: ' + str(sv_normal_dict['CHROM'][temp_normal_idx]))
            print('POS: ' + str(sv_normal_dict['POS'][temp_normal_idx]))
            print('END: ' + str(sv_normal_dict['END'][temp_normal_idx]))
            print(temp_normal_idx)
            print('ERROR')

        if end_dist != abs(sv_normal_dict['END'][temp_normal_idx] - tumour_end):

            print('END')
            print(i)
            print('CHROM: ' + str(sv_normal_dict['CHROM'][temp_normal_idx]))
            print('POS: ' + str(sv_normal_dict['POS'][temp_normal_idx]))
            print('END: ' + str(sv_normal_dict['END'][temp_normal_idx]))
            # The index is an integer; without str() this line raised
            # TypeError exactly when the diagnostic was needed.
            print('normal_idx: ' + str(temp_normal_idx))
            print('ERROR')

        output_dict['NEAREST_POS'].append(temp_closest_pos)
        output_dict['NEAREST_POS_DIST'].append(pos_dist)
        output_dict['NEAREST_END'].append(temp_closest_end)
        output_dict['NEAREST_END_DIST'].append(end_dist)
        output_dict['NEAREST_TOTAL_DIST'].append(pos_dist + end_dist)
        output_dict['NEAREST_NORMAL_IDX'].append(temp_normal_idx)

    return output_dict

In [15]:
### Function that splits the tumour SVs into somatic SVs and germline SVs using get_sv_distance_by_start ###

def somatic_sv_call_by_nearest_normal(normal_filter_pass, sample_filter_pass, dist_cutoff):
    """Label each tumour SV as somatic or germline by its distance to the nearest normal SV.

    Parameters
    ----------
    normal_filter_pass : dict
        Column-wise SV dictionary of the normal sample.
    sample_filter_pass : dict
        Column-wise SV dictionary of the tumour sample.
    dist_cutoff : number
        A tumour SV whose ``NEAREST_TOTAL_DIST`` is strictly greater than this
        value is called somatic; otherwise it is called germline.

    Returns
    -------
    (somatic_dict, germline_dict, dist_dict) : tuple of dict
        The first two are SV dictionaries with an extra ``'DIST'`` column
        holding the total distance used for the call. A tumour SV whose
        distance is ``'N/A'`` is always placed in ``somatic_dict`` with
        ``'DIST'`` set to ``'N/A'``. ``dist_dict`` is the unmodified result of
        ``get_sv_distance_by_start``.
    """
    sv_columns = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'SVTYPE', 'QUAL', 'END',
                  'SVLEN', 'GT', 'HAP_ALLELIC_FRAC', 'ALLELIC_FRAC', 'FILTER')

    somatic_dict = {column: [] for column in sv_columns + ('DIST',)}
    germline_dict = {column: [] for column in sv_columns + ('DIST',)}

    dist_dict = get_sv_distance_by_start(normal_filter_pass, sample_filter_pass)

    # Every tumour SV should have received exactly one distance entry.
    if len(dist_dict['NEAREST_TOTAL_DIST']) != len(sample_filter_pass['CHROM']):
        print('ERROR, SKIPPING SAMPLES WHEN CALCULATING DISTANCES.')

    for entry in range(len(dist_dict['NEAREST_POS'])):
        total_dist = dist_dict['NEAREST_TOTAL_DIST'][entry]

        if total_dist == 'N/A':
            # No normal SV to compare against, so it is treated as somatic
            bucket, recorded_dist = somatic_dict, 'N/A'
        elif total_dist > dist_cutoff:
            bucket, recorded_dist = somatic_dict, total_dist
        else:
            bucket, recorded_dist = germline_dict, total_dist

        for column in sv_columns:
            bucket[column].append(sample_filter_pass[column][entry])
        bucket['DIST'].append(recorded_dist)

    return somatic_dict, germline_dict, dist_dict