In [1]:
%load_ext autoreload
%autoreload 2

In [37]:
import akita_utils_forplotting as utils
import numpy as np
import pandas as pd
import sys
import os, psutil, io
import gzip
import time 

from optparse import OptionParser


nt = ['A', 'T', 'C', 'G']

In [39]:
def read_vcf(path):
    
    '''
    Read  vcf files into dataframe.
    Adapted from: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744.
    
    '''
    
    with open(path, 'r') as f:
        
        lines =[l for l in f if not l.startswith('#')]
        
    vcf_file = io.StringIO(''.join(lines))
    
    return vcf_file



def read_vcf_gz(path):
    
    '''
    Read  gzipped vcf files into dataframe.
    Adapted from: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744.
    
    '''
    
    with io.TextIOWrapper(gzip.open(path,'r')) as f:

        lines =[l for l in f if not l.startswith('#')]
        
    vcf_file = io.StringIO(''.join(lines))
    
    return vcf_file

def read_vcf_gz(path):
    
    '''
    Read  gzipped vcf files into dataframe.
    Adapted from: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744.
    
    '''
    
    with io.TextIOWrapper(gzip.open(path,'r')) as f:

        lines =[l for l in f if not l.startswith('#')]
        
    vcf_file = io.StringIO(''.join(lines))
    
    return vcf_file


var_set_size=10000
def read_input(in_file, var_set):

    '''
    Read and reformat variant dataset. Accepted formats are .vcf .vcf.gz from 4.1 version, 
    .bed file with the following columns: [CHROM, POS, REF, ALT, END, SVTYPE, SVLEN], 
    and .tsv from ANNOVAR annotSV.
    
    '''
    
    
    if 'vcf' in in_file:
        
        # For gzipped files
        if in_file.endswith('.gz'):
            vcf_file = read_vcf_gz(in_file)
        else:
            vcf_file = read_vcf(in_file)
            
            
        variants = pd.read_csv(
                vcf_file,
                skiprows = var_set*var_set_size, nrows = var_set_size,
                sep='\t',
                names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'INFO2', 'INFO3', 'INFO4']
            )
            
        # Read SVs
        if any(['SVTYPE' in x for x in variants.INFO]):
            
            if any(['END' in x and 'SVLEN' in x for x in variants.INFO]): # BNDs don't have 'END'
                variants['END'] = variants.INFO.str.split('END=').str[1].str.split(';').str[0] # this SVLEN (END-POS) would be 0 for SNPs
                variants.loc[~pd.isnull(variants.END), 'END'] = variants.loc[~pd.isnull(variants.END), 'END'].astype('int')
                variants['SVLEN'] = variants.INFO.str.split('SVLEN=').str[1].str.split(';').str[0]
            else:
                variants['END'] = [np.nan]*len(variants)
                variants['SVLEN'] = [np.nan]*len(variants)
            variants['SVTYPE'] = variants.INFO.str.split('SVTYPE=').str[1].str.split(';').str[0]
            
            variants = variants[['CHROM', 'POS', 'END', 'REF', 'ALT', 'SVTYPE', 'SVLEN']]

        # Read simple variants 
        else:
            
            variants = variants[['CHROM', 'POS', 'REF', 'ALT']]       
            
 
        
    elif 'bed' in in_file:
        
        colnames = ['CHROM', 'POS', 'REF', 'ALT', 'END', 'SVTYPE', 'SVLEN']
        ncols = len(pd.read_csv(in_file, sep = '\t', nrows = 0, low_memory=False).columns)

        variants = pd.read_csv(in_file, sep = '\t', names = colnames[:ncols], low_memory=False,
                               skiprows = var_set*var_set_size, nrows = var_set_size)
        
        
    elif 'tsv' in in_file:
        
        with (gzip.open if in_file.endswith(".gz") else open)(in_file, "rt", encoding="utf-8") as variants:
            variants = (pd.read_csv(in_file, sep = '\t', low_memory=False,
                                   skiprows = var_set*var_set_size, nrows = var_set_size)
                        .rename(columns = {'SV_chrom':'CHROM', 
                                           'SV_start':'POS',
                                           'SV_end':'END', 
                                           'SV_type':'SVTYPE',
                                           'SV_length':'SVLEN'})
                       [['CHROM', 'POS', 'END', 'REF', 'ALT', 'SVTYPE', 'SVLEN']])
            variants['CHROM'] = ['chr' + str(x) for x in variants['CHROM']]
            variants.loc[~pd.isnull(variants.END), 'END'] = variants.loc[~pd.isnull(variants.END), 'END'].astype('int')

            
    else:
        raise ValueError('Input file type not accepted. Make sure it has the right extension.')
        
        
    variants.reset_index(inplace = True, drop = True)
    
    
    return variants




In [7]:
file_list

['c420e97a-0c92-4d86-a8f2-39977628649e.manta.PASS.vcf.gz',
 'c71f4771-d8e9-47bf-9715-0309e18a20b3.manta.PASS.vcf.gz.tbi',
 '37bd5c7e-0c8c-495c-afbd-9e59538c72c8.manta.PASS.vcf.gz',
 '3beb0d15-3a28-41b4-95ee-21e3c70b0304.manta.PASS.vcf.gz',
 'c4a1a12c-2afc-4c77-825e-aaa60ff4f5be.manta.PASS.vcf.gz',
 '6fadce13-ea16-4b47-9851-ab5d54b8a776.manta.PASS.vcf.gz',
 '923bd63c-efd5-40c0-aebd-3809d7578ad2.manta.PASS.vcf.gz',
 'f6db0e5c-56de-4ca0-9eab-888d75613c73.manta.PASS.vcf.gz',
 '694e5e67-db2a-44f9-abac-857ba93fc51e.manta.PASS.vcf.gz',
 'manifest_20230620_202949.csv',
 'c420e97a-0c92-4d86-a8f2-39977628649e.manta.PASS.vcf.gz.tbi',
 'a0848714-f663-4568-8e7f-7395536a3e38.manta.PASS.vcf.gz.tbi',
 'ddd9b416-b520-44e5-bcde-5967d9d74c94.manta.PASS.vcf.gz',
 'f86d79ef-10e7-4bff-8970-c254dba3d553.manta.PASS.vcf.gz.tbi',
 'a5a8fff5-5c63-4166-bbf9-6b8a21a1a616.manta.PASS.vcf.gz.tbi',
 '0890fcf2-11f4-4469-859f-eb8b94d791d3.manta.PASS.vcf.gz.tbi',
 'ae112689-45f1-4ffe-8b88-b80f1f14c45a.manta.PASS.vcf.gz.t

In [3]:
file_path='/pollard/data/projects/shzhang_rotation/somatic_variants/somatic_structural_variants/'

vcf_path= file_path+ '02dc6e6c-1baf-4f43-8360-d4950838a63a.somaticSV.vcf.gz'

# #input_dir=options.in_dir
input_dir='/pollard/home/shzhang/akita/data/somatic_SV/'


#vcf_name='02dc6e6c-1baf-4f43-8360-d4950838a63a.somaticSV.vcf.gz'
vcf_name='1cf7fb86-a0c1-42e1-aedd-7bcaa01ee808.somaticSV.vcf.gz'
vcf_path=input_dir + vcf_name
print(vcf_path)


/pollard/home/shzhang/akita/data/somatic_SV/1cf7fb86-a0c1-42e1-aedd-7bcaa01ee808.somaticSV.vcf.gz


In [71]:
from os.path import isfile, join

#idk having problems extracting SV ends and such here 
file_path='/pollard/data/projects/kgjoni/CBTN_collab/CBTN_data/structural-variants_230620/'
#file_path='/pollard/data/projects/shzhang_rotation/somatic_variants/somatic_structural_variants/'

file_list= [f for f in os.listdir(file_path) if isfile(join(file_path, f))]

In [58]:
dfs = []
columns_to_keep=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']

# Read each file and append its DataFrame to the list
for file in file_list:
    if file.endswith('.gz'):
        #df = read_input(file_path + file, 0)  
        df=utils.read_vcf(file_path+file, is_gzip=True)
        df = df[df.CHROM != 'chrM']
        df = df[df.FILTER == 'PASS']
        
        print('Sample {} has {} SVs'.format(file, str(len(df))))
        dfs.append(df[columns_to_keep])

# Concatenate all DataFrames into one large DataFrame
variants = pd.concat(dfs)

# If you want to reset the index of the combined DataFrame
variants = variants.reset_index(drop=True)

Sample 27f05b52-ce06-4c66-9954-259f182c1b9d.somaticSV.vcf.gz has 18 SVs
Sample 7c1a7deb-d63a-4631-8e12-e76a2989beb3.somaticSV.vcf.gz has 80 SVs
Sample 35c87b1a-eea4-4841-bed3-4c9591f5f417.somaticSV.vcf.gz has 329 SVs
Sample 2026b231-3964-42e0-a961-6e83660a2b5a.somaticSV.vcf.gz has 226 SVs
Sample 888a780c-8ec8-4e54-b1f6-d26945d49b76.somaticSV.vcf.gz has 40 SVs
Sample 3c553037-bbba-4e5a-a454-b75f2f94994c.somaticSV.vcf.gz has 298 SVs
Sample bed27ad0-69ac-4642-9171-11c91959a041.somaticSV.vcf.gz has 179 SVs
Sample 95ae222c-1827-4cad-90f7-bfaa73c8772c.somaticSV.vcf.gz has 386 SVs
Sample affb8185-5890-4c6d-b71c-301bc1669a7f.somaticSV.vcf.gz has 80 SVs
Sample c779efea-c4c5-41e7-93dd-c51d89085a04.somaticSV.vcf.gz has 85 SVs
Sample 2bfaddaa-8566-41e1-9ed2-0eff807aa55c.somaticSV.vcf.gz has 208 SVs
Sample 7fe54e61-fb14-44dc-aeb0-bb41affd842c.somaticSV.vcf.gz has 10 SVs
Sample 238bbfd6-f4f4-4385-9e40-37082cfbe72f.somaticSV.vcf.gz has 39 SVs
Sample 68a4d419-c9ef-4d77-80bb-00f205178a70.somaticSV.vcf.

In [59]:
#add END column
#becomes NaN for BND since they don't have end in INFO
variants['END']=variants['INFO'].str.extract('END=(.*?)\;')

#add SVTYPE column
variants['SVTYPE']=variants['INFO'].str.extract('SVTYPE=(.*?)\;')


#subset to non BND since BND are taken care of 
variants_bnd=variants[variants.SVTYPE == 'BND']
variants_bnd = variants_bnd.reset_index(drop=True)


variants_del=variants[variants.SVTYPE == 'DEL']
variants_del['END']=variants_del['END'].astype(int)
variants_del['SPAN']=abs(variants_del['END']-variants_del['POS'])
variants_del=variants_del[(variants_del.SPAN>1024) & (variants_del.SPAN<700e5)]
variants_del = variants_del.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variants_del['END']=variants_del['END'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variants_del['SPAN']=abs(variants_del['END']-variants_del['POS'])


In [56]:
variants_del

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,END,SVTYPE,SPAN
0,chr3,116093211,MantaDEL:1:57871:57957:0:0:0,A,<DEL>,.,PASS,"END=116099856;SVTYPE=DEL;SVLEN=-6645;CIPOS=0,1...",PR:SR,116099856,DEL,6645
1,chr3,116203886,MantaDEL:215599:0:1:0:0:0,C,<DEL>,.,PASS,END=116371856;SVTYPE=DEL;SVLEN=-167970;SVINSLE...,PR:SR,116371856,DEL,167970
2,chr3,116447397,MantaDEL:215627:0:1:0:0:0,G,<DEL>,.,PASS,END=116974644;SVTYPE=DEL;SVLEN=-527247;CIPOS=0...,PR:SR,116974644,DEL,527247
3,chr3,116994189,MantaDEL:215665:0:1:0:0:0,A,<DEL>,.,PASS,END=116998120;SVTYPE=DEL;SVLEN=-3931;SVINSLEN=...,PR:SR,116998120,DEL,3931
4,chr3,117095931,MantaDEL:215656:0:1:0:1:0,C,<DEL>,.,PASS,END=117149510;SVTYPE=DEL;SVLEN=-53579;SVINSLEN...,PR:SR,117149510,DEL,53579
...,...,...,...,...,...,...,...,...,...,...,...,...
496,chr8,112418944,MantaDEL:251474:0:1:0:0:0,G,<DEL>,.,PASS,END=112631910;SVTYPE=DEL;SVLEN=-212966;SOMATIC...,PR:SR,112631910,DEL,212966
497,chr13,18769972,MantaDEL:11925:0:1:0:0:0,T,<DEL>,.,PASS,END=18789193;SVTYPE=DEL;SVLEN=-19221;IMPRECISE...,PR,18789193,DEL,19221
498,chr6,32655602,MantaDEL:1:40534:40630:1:0:0,A,<DEL>,.,PASS,END=32754221;SVTYPE=DEL;SVLEN=-98619;IMPRECISE...,PR,32754221,DEL,98619
499,chr7,63472092,MantaDEL:1:36222:36263:0:0:0,G,<DEL>,.,PASS,END=63482181;SVTYPE=DEL;SVLEN=-10089;IMPRECISE...,PR,63482181,DEL,10089


In [61]:
#RUN FOR DELETIONS FIRST 

start=time.time()


# Run prediction on a loop
#for i in variants.index: 
for i in range(0,2):
    print(i)
    # Get info on variant
    #variant = variants.loc[i]
    variant=variants_del.loc[i]

    CHR = variant.CHROM
    POS = variant.POS
    REF = variant.REF
    ALT = variant.ALT
    SVTYPE = variant.SVTYPE
    END= (int(variant.END)) if SVTYPE!='BND' else np.nan

        
    print(variant.ID)
        
    # Get disruption scores
        
    #this usually occurs with deletions it seems
    
    try:
        
        if abs(END-POS)>7e5:
            print('Span is >700kb for SV with index ' + str(i))

        else:
            MSE, CORR, ref_strip = utils.get_scores_SV(CHR, POS, ALT, END, SVTYPE, shift='none', plot=False)

            
            
        variants.loc[i, 'MSE'] = MSE
        variants.loc[i, 'CORR'] = CORR
    except:
        print('error in index: ' + str(i))
        pass
        

#see how much memory for one vcf 
print('memory in MB: ' + str(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2))

end=time.time()
print('seconds_elapsed: ', str(end-start))




0
MantaDEL:1:57871:57957:0:0:0
getting SCORES
getting scores
masking matrices
DEL
error in index: 0
1
MantaDEL:215599:0:1:0:0:0
getting SCORES
getting scores


  MSE = np.nansum([x**2 for x in sub_vec])/non_nan_values


masking matrices
DEL
error in index: 1
memory in MB: 1022.05078125
seconds_elapsed:  8.445969820022583


In [None]:
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_rows', None)



In [69]:
#RUN FOR DELETIONS FIRST 

start=time.time()


# Run prediction on a loop
#for i in variants.index: 
for i in range(1,3):
    print(i)
    # Get info on variant
    #variant = variants.loc[i]
    variant=variants_del.loc[i]

    CHR = variant.CHROM
    POS = variant.POS
    REF = variant.REF
    ALT = variant.ALT
    SVTYPE = variant.SVTYPE
    END= (int(variant.END)) if SVTYPE!='BND' else np.nan

        
    print(variant.ID)
        
    # Get disruption scores
        
    #this usually occurs with deletions it seems
    
    



    MSE, CORR, ref_strip, alt_strip = utils.get_scores_SV(CHR, POS, ALT, END, SVTYPE, shift='none', plot=False)

            
            
    variants.loc[i, 'MSE'] = MSE
    variants.loc[i, 'CORR'] = CORR


#see how much memory for one vcf 
print('memory in MB: ' + str(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2))

end=time.time()
print('seconds_elapsed: ', str(end-start))




Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sequence (InputLayer)          [(None, 1048576, 4)  0           []                               
                                ]                                                                 
                                                                                                  
 stochastic_reverse_complement_  ((None, 1048576, 4)  0          ['sequence[0][0]']               
 7 (StochasticReverseComplement  , ())                                                            
 )                                                                                                
                                                                                                  
 stochastic_shift_7 (Stochastic  (None, 1048576, 4)  0           ['stochastic_reverse_compl

 re_lu_301 (ReLU)               (None, 8192, 96)     0           ['max_pooling1d_83[0][0]']       
                                                                                                  
 conv1d_203 (Conv1D)            (None, 8192, 96)     46080       ['re_lu_301[0][0]']              
                                                                                                  
 batch_normalization_294 (Batch  (None, 8192, 96)    384         ['conv1d_203[0][0]']             
 Normalization)                                                                                   
                                                                                                  
 max_pooling1d_84 (MaxPooling1D  (None, 4096, 96)    0           ['batch_normalization_294[0][0]']
 )                                                                                                
                                                                                                  
 re_lu_302

 Normalization)                                                                                   
                                                                                                  
 re_lu_310 (ReLU)               (None, 512, 48)      0           ['batch_normalization_302[0][0]']
                                                                                                  
 conv1d_212 (Conv1D)            (None, 512, 96)      4608        ['re_lu_310[0][0]']              
                                                                                                  
 batch_normalization_303 (Batch  (None, 512, 96)     384         ['conv1d_212[0][0]']             
 Normalization)                                                                                   
                                                                                                  
 dropout_100 (Dropout)          (None, 512, 96)      0           ['batch_normalization_303[0][0]']
          

 Normalization)                                                                                   
                                                                                                  
 dropout_104 (Dropout)          (None, 512, 96)      0           ['batch_normalization_311[0][0]']
                                                                                                  
 add_104 (Add)                  (None, 512, 96)      0           ['add_103[0][0]',                
                                                                  'dropout_104[0][0]']            
                                                                                                  
 re_lu_319 (ReLU)               (None, 512, 96)      0           ['add_104[0][0]']                
                                                                                                  
 conv1d_221 (Conv1D)            (None, 512, 48)      13824       ['re_lu_319[0][0]']              
          

 conv2d_94 (Conv2D)             (None, 512, 512, 24  10368       ['re_lu_326[0][0]']              
                                )                                                                 
                                                                                                  
 batch_normalization_318 (Batch  (None, 512, 512, 24  96         ['conv2d_94[0][0]']              
 Normalization)                 )                                                                 
                                                                                                  
 re_lu_327 (ReLU)               (None, 512, 512, 24  0           ['batch_normalization_318[0][0]']
                                )                                                                 
                                                                                                  
 conv2d_95 (Conv2D)             (None, 512, 512, 48  1152        ['re_lu_327[0][0]']              
          

                                                                                                  
 batch_normalization_324 (Batch  (None, 512, 512, 24  96         ['conv2d_100[0][0]']             
 Normalization)                 )                                                                 
                                                                                                  
 re_lu_333 (ReLU)               (None, 512, 512, 24  0           ['batch_normalization_324[0][0]']
                                )                                                                 
                                                                                                  
 conv2d_101 (Conv2D)            (None, 512, 512, 48  1152        ['re_lu_333[0][0]']              
                                )                                                                 
                                                                                                  
 batch_nor



masking matrices
DEL
2
MantaDEL:215627:0:1:0:0:0
getting SCORES
getting scores


  MSE = np.nansum([x**2 for x in sub_vec])/non_nan_values


masking matrices
DEL
memory in MB: 1648.2265625
seconds_elapsed:  7.178581237792969


In [70]:
ref_strip

[]