In [1]:
# import necessary libraries for the analysis
import vcf
import pysam
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import itertools
import ast
from natsort import index_natsorted

In [2]:
## Functions used below are defined in two other notebooks; %run them here, before any os.chdir call, because the notebook file names are relative paths

%run filtering_for_somatic_SVs_functions.ipynb
%run overlap_and_range_functions.ipynb

In [3]:
### Load filtered somatic and germline small deletions ###
# Every file in the two folders is read into a global DataFrame named
# '<file name without its last four characters>_panel', and that name is
# recorded in the matching *_panel_df_names list.

somatic_small_del_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel/small_dels'
germline_small_del_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/germline_panel/small_dels'

somatic_small_del_panel_df_names = []
germline_small_del_panel_df_names = []

# Somatic folder is processed first, germline second
for panel_dir, panel_df_names in (
    (somatic_small_del_path, somatic_small_del_panel_df_names),
    (germline_small_del_path, germline_small_del_panel_df_names),
):

    # The bare file names passed to read_csv are resolved against this folder
    os.chdir(panel_dir)

    # Entries whose name contains 'DS' are skipped (presumably macOS .DS_Store files)
    temp_files = [i for i in os.listdir(panel_dir) if 'DS' not in i]
    temp_files.sort()

    for file_name in temp_files:

        panel_df_name = file_name[:-4] + '_panel'
        globals()[panel_df_name] = pd.read_csv(file_name)
        panel_df_names.append(panel_df_name)

In [4]:
### Load filtered somatic and germline large SVs ###
# Every file in the six folders is read into a global DataFrame named
# '<file name without its last four characters>_panel', and that name is
# recorded in the matching *_panel_df_names list.

## DELs
somatic_large_DEL_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel/large_svs/DEL'
germline_large_DEL_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/germline_panel/large_svs/DEL'

## DUPs
somatic_large_DUP_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel/large_svs/DUP'
germline_large_DUP_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/germline_panel/large_svs/DUP'

## INVs
somatic_large_INV_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel/large_svs/INV'
germline_large_INV_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/germline_panel/large_svs/INV'

somatic_large_DEL_panel_df_names = []
germline_large_DEL_panel_df_names = []
somatic_large_DUP_panel_df_names = []
germline_large_DUP_panel_df_names = []
somatic_large_INV_panel_df_names = []
germline_large_INV_panel_df_names = []

# Folders are processed in this order: DEL, DUP, INV, each somatic then germline
for panel_dir, panel_df_names in (
    (somatic_large_DEL_path, somatic_large_DEL_panel_df_names),
    (germline_large_DEL_path, germline_large_DEL_panel_df_names),
    (somatic_large_DUP_path, somatic_large_DUP_panel_df_names),
    (germline_large_DUP_path, germline_large_DUP_panel_df_names),
    (somatic_large_INV_path, somatic_large_INV_panel_df_names),
    (germline_large_INV_path, germline_large_INV_panel_df_names),
):

    # The bare file names passed to read_csv are resolved against this folder
    os.chdir(panel_dir)

    # Entries whose name contains 'DS' are skipped (presumably macOS .DS_Store files)
    temp_files = [i for i in os.listdir(panel_dir) if 'DS' not in i]
    temp_files.sort()

    for file_name in temp_files:

        panel_df_name = file_name[:-4] + '_panel'
        globals()[panel_df_name] = pd.read_csv(file_name)
        panel_df_names.append(panel_df_name)

In [None]:
'''
VARIABLE NAMES

somatic_small_del_panel_df_names
germline_small_del_panel_df_names

somatic_large_DEL_panel_df_names
germline_large_DEL_panel_df_names

somatic_large_DUP_panel_df_names
germline_large_DUP_panel_df_names

somatic_large_INV_panel_df_names
germline_large_INV_panel_df_names
'''

# Load in hg38 gap track then filter out all SVs overlapping the gap regions

In [5]:
# Tab-separated gap track file
gap_file_path = '/Users/ryanyutian/Desktop/annotation_dataset/gap.txt'

# Column names are supplied here, so the first line of the file is read as data
gap_df = pd.read_csv(
    gap_file_path,
    sep='\t',
    names=['bin', 'chr', 'start', 'end', 'ix', 'n', 'size', 'type', 'bridge'],
)

In [6]:
# Show the loaded gap table as this cell's output
gap_df

Unnamed: 0,bin,chr,start,end,ix,n,size,type,bridge
0,585,chr1,0,10000,1,N,10000,telomere,no
1,586,chr1,207666,257666,5,N,50000,contig,no
2,587,chr1,297968,347968,7,N,50000,contig,no
3,589,chr1,535988,585988,10,N,50000,contig,no
4,605,chr1,2702781,2746290,48,N,43509,scaffold,yes
...,...,...,...,...,...,...,...,...,...
822,585,chr3_KN196476v1_fix,34453,35012,8,N,559,scaffold,yes
823,585,chr3_KN196476v1_fix,43347,43496,10,N,149,scaffold,yes
824,593,chr15_KN538374v1_fix,1076278,1079711,9,N,3433,scaffold,yes
825,619,chr15_KN538374v1_fix,4501706,4535264,31,N,33558,scaffold,yes


In [7]:
### Somatic small deletions
# For every somatic small-deletion DataFrame, remove the rows that overlap a
# region in gap_df and store the result as a new global DataFrame named
# '<original name>_wo_gap'. The new names are collected in
# somatic_small_del_panel_wo_gap_df_names.

somatic_small_del_panel_wo_gap_df_names = []

for df_name in somatic_small_del_panel_df_names:

    temp_df = globals()[df_name]

    # Gather the index labels of all overlapping rows first and drop them in a
    # single call. Dropping inside the loop rebinds the frame that is being
    # iterated and copies the whole frame once per overlapping row.
    gap_overlap_indices = [
        index
        for index, row in temp_df.iterrows()
        if len(overlap_func_wo_typeandid(row['CHROM'], row['POS'], row['END'], index, gap_df)) != 0
    ]

    # drop() returns a new DataFrame, so the unfiltered frame is left untouched
    temp_df = temp_df.drop(gap_overlap_indices).reset_index(drop=True)
    temp_new_df_name = df_name + '_wo_gap'
    globals()[temp_new_df_name] = temp_df

    somatic_small_del_panel_wo_gap_df_names.append(temp_new_df_name)

In [8]:
### Somatic large deletions
# For every somatic large-deletion DataFrame, remove the rows that overlap a
# region in gap_df and store the result as a new global DataFrame named
# '<original name>_wo_gap'. The new names are collected in
# somatic_large_DEL_panel_wo_gap_df_names.

somatic_large_DEL_panel_wo_gap_df_names = []

for df_name in somatic_large_DEL_panel_df_names:

    temp_df = globals()[df_name]

    # Gather the index labels of all overlapping rows first and drop them in a
    # single call. Dropping inside the loop rebinds the frame that is being
    # iterated and copies the whole frame once per overlapping row.
    gap_overlap_indices = [
        index
        for index, row in temp_df.iterrows()
        if len(overlap_func_wo_typeandid(row['CHROM'], row['POS'], row['END'], index, gap_df)) != 0
    ]

    # drop() returns a new DataFrame, so the unfiltered frame is left untouched
    temp_df = temp_df.drop(gap_overlap_indices).reset_index(drop=True)
    temp_new_df_name = df_name + '_wo_gap'
    globals()[temp_new_df_name] = temp_df

    somatic_large_DEL_panel_wo_gap_df_names.append(temp_new_df_name)

In [9]:
### Somatic large duplications
# For every somatic large-duplication DataFrame, remove the rows that overlap a
# region in gap_df and store the result as a new global DataFrame named
# '<original name>_wo_gap'. The new names are collected in
# somatic_large_DUP_panel_wo_gap_df_names.

somatic_large_DUP_panel_wo_gap_df_names = []

for df_name in somatic_large_DUP_panel_df_names:

    temp_df = globals()[df_name]

    # Gather the index labels of all overlapping rows first and drop them in a
    # single call. Dropping inside the loop rebinds the frame that is being
    # iterated and copies the whole frame once per overlapping row.
    gap_overlap_indices = [
        index
        for index, row in temp_df.iterrows()
        if len(overlap_func_wo_typeandid(row['CHROM'], row['POS'], row['END'], index, gap_df)) != 0
    ]

    # drop() returns a new DataFrame, so the unfiltered frame is left untouched
    temp_df = temp_df.drop(gap_overlap_indices).reset_index(drop=True)
    temp_new_df_name = df_name + '_wo_gap'
    globals()[temp_new_df_name] = temp_df

    somatic_large_DUP_panel_wo_gap_df_names.append(temp_new_df_name)

In [10]:
### Somatic large inversions
# For every somatic large-inversion DataFrame, remove the rows that overlap a
# region in gap_df and store the result as a new global DataFrame named
# '<original name>_wo_gap'. The new names are collected in
# somatic_large_INV_panel_wo_gap_df_names.

somatic_large_INV_panel_wo_gap_df_names = []

for df_name in somatic_large_INV_panel_df_names:

    temp_df = globals()[df_name]

    # Gather the index labels of all overlapping rows first and drop them in a
    # single call. Dropping inside the loop rebinds the frame that is being
    # iterated and copies the whole frame once per overlapping row.
    gap_overlap_indices = [
        index
        for index, row in temp_df.iterrows()
        if len(overlap_func_wo_typeandid(row['CHROM'], row['POS'], row['END'], index, gap_df)) != 0
    ]

    # drop() returns a new DataFrame, so the unfiltered frame is left untouched
    temp_df = temp_df.drop(gap_overlap_indices).reset_index(drop=True)
    temp_new_df_name = df_name + '_wo_gap'
    globals()[temp_new_df_name] = temp_df

    somatic_large_INV_panel_wo_gap_df_names.append(temp_new_df_name)

In [11]:
### Validation: small deletions
# Print, for each table whose row count changed in the gap filter, how many
# rows were removed and how many remain.

print('SMALL DELETIONS')

for df_name in somatic_small_del_panel_df_names:

    count_before = len(globals()[df_name])
    count_after = len(globals()[df_name + '_wo_gap'])

    # Nothing to report when the filter removed no rows
    if count_before == count_after:
        continue

    # Cut the last 19 characters off the DataFrame name, leaving the leading label
    print(df_name[:-19])
    print('Difference = ' + str(count_before - count_after))
    print('New count = ' + str(count_after))

SMALL DELETIONS
A_RR_GBM809
Difference = 7
New count = 607
A_R_GBM607
Difference = 2
New count = 523
B_P_GBM593
Difference = 6
New count = 678
B_R_GBM898
Difference = 5
New count = 593
C_P_GBM577
Difference = 2
New count = 812
C_R_GBM625
Difference = 3
New count = 647
E_RR_GBM937
Difference = 4
New count = 549
E_R_GBM781
Difference = 5
New count = 742
F_P_GBM620
Difference = 3
New count = 714
F_R_GBM691
Difference = 3
New count = 756
G_P_GBM454
Difference = 1
New count = 577
G_R_GBM833
Difference = 6
New count = 607
H_P_GBM460
Difference = 2
New count = 597
H_R_GBM492
Difference = 1
New count = 654
I_P_GBM440
Difference = 4
New count = 878
I_R_GBM532
Difference = 3
New count = 664
J_P_GBM401
Difference = 5
New count = 711
J_RR_GBM551
Difference = 4
New count = 718
J_R_GBM498
Difference = 2
New count = 745
K_P_GBM529
Difference = 3
New count = 837
K_R_GBM832
Difference = 3
New count = 750
L_P_GBM618
Difference = 5
New count = 667
L_R_SMTB152
Difference = 3
New count = 762
M_P_GBM672
Dif

In [12]:
### Validation: large deletions
# Print, for each table whose row count changed in the gap filter, how many
# rows were removed and how many remain.

print('LARGE DELETIONS')

for df_name in somatic_large_DEL_panel_df_names:

    count_before = len(globals()[df_name])
    count_after = len(globals()[df_name + '_wo_gap'])

    # Nothing to report when the filter removed no rows
    if count_before == count_after:
        continue

    # Cut the last 22 characters off the DataFrame name, leaving the leading label
    print(df_name[:-22])
    print('Difference = ' + str(count_before - count_after))
    print('New count = ' + str(count_after))

LARGE DELETIONS
A_RR_GBM809
Difference = 25
New count = 37
A_R_GBM607
Difference = 38
New count = 25
B_P_GBM593
Difference = 30
New count = 19
B_R_GBM898
Difference = 21
New count = 13
C_P_GBM577
Difference = 25
New count = 18
C_R_GBM625
Difference = 16
New count = 15
E_RR_GBM937
Difference = 54
New count = 53
E_R_GBM781
Difference = 21
New count = 21
F_P_GBM620
Difference = 28
New count = 33
F_R_GBM691
Difference = 36
New count = 29
G_P_GBM454
Difference = 33
New count = 21
G_R_GBM833
Difference = 38
New count = 19
H_P_GBM460
Difference = 48
New count = 36
H_R_GBM492
Difference = 44
New count = 31
I_P_GBM440
Difference = 46
New count = 58
I_R_GBM532
Difference = 24
New count = 15
J_P_GBM401
Difference = 32
New count = 48
J_RR_GBM551
Difference = 37
New count = 61
J_R_GBM498
Difference = 64
New count = 83
K_P_GBM529
Difference = 27
New count = 28
K_R_GBM832
Difference = 70
New count = 128
L_P_GBM618
Difference = 56
New count = 66
L_R_SMTB152
Difference = 72
New count = 146
M_P_GBM672
D

In [13]:
### Validation: large duplications
# Print, for each table whose row count changed in the gap filter, how many
# rows were removed and how many remain.

print('LARGE DUPLICATIONS')

for df_name in somatic_large_DUP_panel_df_names:

    count_before = len(globals()[df_name])
    count_after = len(globals()[df_name + '_wo_gap'])

    # Nothing to report when the filter removed no rows
    if count_before == count_after:
        continue

    # Cut the last 22 characters off the DataFrame name, leaving the leading label
    print(df_name[:-22])
    print('Difference = ' + str(count_before - count_after))
    print('New count = ' + str(count_after))

LARGE DUPLICATIONS
A_RR_GBM809
Difference = 5
New count = 80
A_R_GBM607
Difference = 11
New count = 53
B_P_GBM593
Difference = 9
New count = 48
B_R_GBM898
Difference = 13
New count = 49
C_P_GBM577
Difference = 43
New count = 69
C_R_GBM625
Difference = 13
New count = 41
E_RR_GBM937
Difference = 18
New count = 111
E_R_GBM781
Difference = 16
New count = 43
F_P_GBM620
Difference = 10
New count = 42
F_R_GBM691
Difference = 12
New count = 77
G_P_GBM454
Difference = 9
New count = 41
G_R_GBM833
Difference = 12
New count = 38
H_P_GBM460
Difference = 22
New count = 132
H_R_GBM492
Difference = 20
New count = 132
I_P_GBM440
Difference = 6
New count = 45
I_R_GBM532
Difference = 17
New count = 47
J_P_GBM401
Difference = 11
New count = 61
J_RR_GBM551
Difference = 12
New count = 95
J_R_GBM498
Difference = 9
New count = 79
K_P_GBM529
Difference = 11
New count = 41
K_R_GBM832
Difference = 13
New count = 222
L_P_GBM618
Difference = 8
New count = 77
L_R_SMTB152
Difference = 6
New count = 74
M_P_GBM672
Dif

In [14]:
### Validation: large inversions
# Print, for each table whose row count changed in the gap filter, how many
# rows were removed and how many remain.

print('LARGE INVERSIONS')

for df_name in somatic_large_INV_panel_df_names:

    count_before = len(globals()[df_name])
    count_after = len(globals()[df_name + '_wo_gap'])

    # Nothing to report when the filter removed no rows
    if count_before == count_after:
        continue

    # Cut the last 22 characters off the DataFrame name, leaving the leading label
    print(df_name[:-22])
    print('Difference = ' + str(count_before - count_after))
    print('New count = ' + str(count_after))

LARGE INVERSIONS
A_R_GBM607
Difference = 1
New count = 16
B_R_GBM898
Difference = 1
New count = 5
C_R_GBM625
Difference = 1
New count = 14
E_RR_GBM937
Difference = 1
New count = 20
F_P_GBM620
Difference = 2
New count = 7
F_R_GBM691
Difference = 1
New count = 28
J_P_GBM401
Difference = 1
New count = 14
N_R_GBM745
Difference = 1
New count = 20


In [None]:
'''
VARIABLE NAMES

somatic_small_del_panel_df_names
somatic_small_del_panel_wo_gap_df_names

somatic_large_DEL_panel_df_names
somatic_large_DEL_panel_wo_gap_df_names

somatic_large_DUP_panel_df_names
somatic_large_DUP_panel_wo_gap_df_names

somatic_large_INV_panel_df_names
somatic_large_INV_panel_wo_gap_df_names
'''

In [15]:
### Save updated somatic small deletions and large SVs ###
# Each gap-filtered DataFrame is written, without its index, to
# '<output folder>/<DataFrame name>.csv'. Large SVs go into the DEL, DUP and
# INV sub-folders of somatic_large_sv_path.

somatic_small_del_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel_wo_gap/small_dels'
somatic_large_sv_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel_wo_gap/large_svs'

# Create the output folders if they are missing. Without this, os.chdir and
# to_csv raise when the folder tree has not been created by hand beforehand.
os.makedirs(somatic_small_del_path, exist_ok=True)
for sv_type in ('DEL', 'DUP', 'INV'):
    os.makedirs(os.path.join(somatic_large_sv_path, sv_type), exist_ok=True)


# The working directory is still changed as before, although every write
# below uses a full path and does not depend on it.
os.chdir(somatic_small_del_path)

for df_name in somatic_small_del_panel_wo_gap_df_names:

    globals()[df_name].to_csv(os.path.join(somatic_small_del_path, df_name + '.csv'), index=False, sep=',')


os.chdir(somatic_large_sv_path)

# One pass per SV type, in the order DEL, DUP, INV
for sv_type, wo_gap_df_names in (
    ('DEL', somatic_large_DEL_panel_wo_gap_df_names),
    ('DUP', somatic_large_DUP_panel_wo_gap_df_names),
    ('INV', somatic_large_INV_panel_wo_gap_df_names),
):

    for df_name in wo_gap_df_names:

        globals()[df_name].to_csv(os.path.join(somatic_large_sv_path, sv_type, df_name + '.csv'), index=False, sep=',')