In [2]:
## Python version 3.6, some packages are not forward compatible

import vcf
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [152]:
## Functions used below are imported from another notebook, run before changing directory

os.chdir('/Users/ryanyutian/Documents/GitHub/TRI_Brain_LRWGS')
%run filtering_for_somatic_SVs_functions.ipynb

In [4]:
## Move into the directory that holds the raw glioma VCF output

path = '/Users/ryanyutian/Desktop/TRI_Brain_glioma_vcf_raw/'

print("Original working directory: {0}".format(os.getcwd()))

# One message per expected failure; a failure is reported but does not stop the notebook.
chdir_failure_messages = (
    (FileNotFoundError, "Directory: {0} does not exist"),
    (NotADirectoryError, "{0} is not a directory"),
    (PermissionError, "You do not have permissions to change to {0}"),
)

try:
    os.chdir(path)
    print("Current working directory: {0}".format(os.getcwd()))
except tuple(failure for failure, _template in chdir_failure_messages) as chdir_error:
    for failure, template in chdir_failure_messages:
        if isinstance(chdir_error, failure):
            print(template.format(path))
            break

Original working directory: /Users/ryanyutian/Documents/GitHub/TRI_Brain_LRWGS
Current working directory: /Users/ryanyutian/Desktop/TRI_Brain_glioma_vcf_raw


# 1. Filtering for somatic SVs (small dels + large SVs)

In [5]:
### Load vcf files into variables ###

## Patient directories are named with a bare ID (e.g., A/B/C/D...), i.e. without
## an underscore. Requiring a real directory keeps stray files (for example a
## macOS '.DS_Store') from being treated as a patient and crashing the loop.
patients = sorted(
    entry for entry in os.listdir(path)
    if '_' not in entry and os.path.isdir(os.path.join(path, entry))
)

## Names of the module-level variables that hold each sample's vcf.Reader
vcfread_names = {'small_dels':[], 'other_svs':[]}

for patient_id in patients:

    patient_dir = os.path.join(path, patient_id)

    ## Sample directories (e.g., P_GBM618) are the entries without a '.' in their name
    sample_dirs = [
        entry for entry in os.listdir(patient_dir)
        if '.' not in entry and os.path.isdir(os.path.join(patient_dir, entry))
    ]

    for sample in sample_dirs:

        sample_dir = os.path.join(patient_dir, sample)

        vcf_reader_name_dels = patient_id + '_' + sample + '_vcf_dels'
        vcf_reader_name_svs = patient_id + '_' + sample + '_vcf_svs'

        ## Open by full path so loading neither depends on nor changes the
        ## working directory. The handles stay open on purpose: the readers
        ## are consumed by later cells.
        globals()[vcf_reader_name_dels] = vcf.Reader(open(os.path.join(sample_dir, 'dels.vcf.gz'), 'rb'))
        globals()[vcf_reader_name_svs] = vcf.Reader(open(os.path.join(sample_dir, 'large_svs.vcf.gz'), 'rb'))

        vcfread_names['small_dels'].append(vcf_reader_name_dels)
        vcfread_names['other_svs'].append(vcf_reader_name_svs)

## As in the original cell, finish with the raw-data directory as working directory
os.chdir(path)

In [6]:
### Pull small deletion records and small-deletion BND records out of every small-del vcf reader ###

## Each result is stored in a module-level variable named after the sample;
## the lists below remember those variable names for the later cells.
del_dict_names = {'dels': [], 'del_BNDs': []}

for reader_name in vcfread_names['small_dels']:

    ## Drop the trailing '_vcf_dels' (9 characters) to recover the sample prefix
    sample_prefix = reader_name[:-len('_vcf_dels')]
    del_info = sample_prefix + '_dels'
    del_BND_info = sample_prefix + '_del_BNDs'

    extracted_dels, extracted_del_BNDs = extract_small_del_info(globals()[reader_name])
    globals()[del_info] = extracted_dels
    globals()[del_BND_info] = extracted_del_BNDs

    del_dict_names['dels'].append(del_info)
    del_dict_names['del_BNDs'].append(del_BND_info)

In [7]:
### Append the same-chromosome small-deletion BNDs to each sample's small deletion dictionary ###

## The deletion dictionaries are extended in place, so running this cell a
## second time appends the same records again.
for bnd_dict_name in del_dict_names['del_BNDs']:

    same_chr_bnds = combine_BNDs_on_same_chr(globals()[bnd_dict_name])

    ## '<sample>_del_BNDs' -> '<sample>_dels'
    sample_dels = globals()[bnd_dict_name[:-len('_del_BNDs')] + '_dels']

    for key, values in sample_dels.items():
        values.extend(same_chr_bnds[key])

In [8]:
### Pull large SV records and large BND records out of every large-SV vcf reader ###

## Results live in module-level variables named after the sample; the lists
## below remember those variable names for the later cells.
sv_dict_names = {'SVs': [], 'BNDs': []}

for reader_name in vcfread_names['other_svs']:

    ## Drop the trailing '_vcf_svs' (8 characters) to recover the sample prefix
    sample_prefix = reader_name[:-len('_vcf_svs')]
    SV_info = sample_prefix + '_SVs'
    BND_info = sample_prefix + '_BNDs'

    extracted_SVs, extracted_BNDs = extract_SV_info_wo_UNK(globals()[reader_name])
    globals()[SV_info] = extracted_SVs
    globals()[BND_info] = extracted_BNDs

    sv_dict_names['SVs'].append(SV_info)
    sv_dict_names['BNDs'].append(BND_info)

In [9]:
### Append the same-chromosome large BNDs to each sample's large SV dictionary ###

## The SV dictionaries are extended in place, so running this cell a second
## time appends the same records again.
for bnd_dict_name in sv_dict_names['BNDs']:

    same_chr_bnds = combine_BNDs_on_same_chr(globals()[bnd_dict_name])

    ## '<sample>_BNDs' -> '<sample>_SVs'
    sample_svs = globals()[bnd_dict_name[:-len('_BNDs')] + '_SVs']

    for key, values in sample_svs.items():
        values.extend(same_chr_bnds[key])

In [10]:
### Separate the SVs tagged by the built-in filters of Long Ranger from those that pass ###

small_del_passed_dict_names, small_del_failed_dict_names = [], []
large_sv_passed_dict_names, large_sv_failed_dict_names = [], []

## (names of the dicts to split, list collecting '_pass' names, list collecting '_fail' names)
filter_split_jobs = (
    (del_dict_names['dels'], small_del_passed_dict_names, small_del_failed_dict_names),
    (sv_dict_names['SVs'], large_sv_passed_dict_names, large_sv_failed_dict_names),
)

for source_names, passed_names, failed_names in filter_split_jobs:

    for source_name in source_names:

        temp_pass_name = source_name + '_pass'
        temp_fail_name = source_name + '_fail'

        ## split_sv_by_filter returns the pair (passed, failed)
        passed_records, failed_records = split_sv_by_filter(globals()[source_name])
        globals()[temp_pass_name] = passed_records
        globals()[temp_fail_name] = failed_records

        passed_names.append(temp_pass_name)
        failed_names.append(temp_fail_name)

In [11]:
# The exclusion list is empty, so every patient is kept for the analysis

excluded_patient_IDs = []
analysis_patient_IDs = list(
    filter(lambda patient_id: patient_id not in excluded_patient_IDs, patients)
)

In [12]:
# Group the filter-passing small-deletion and large-SV dict names by patient ID
# (each patient maps to its 'NORM' and 'TUMOUR' dict names)

del_sample_dict, sv_sample_dict = (
    sort_sample_by_patient_and_type(analysis_patient_IDs, passed_dict_names)
    for passed_dict_names in (small_del_passed_dict_names, large_sv_passed_dict_names)
)

In [13]:
# Inspect the small-deletion grouping: patient ID -> {'NORM': [...], 'TUMOUR': [...]} dict names
del_sample_dict

{'A': {'NORM': ['A_norm_G809_dels_pass'],
  'TUMOUR': ['A_R_GBM607_dels_pass', 'A_RR_GBM809_dels_pass']},
 'B': {'NORM': [], 'TUMOUR': ['B_P_GBM593_dels_pass', 'B_R_GBM898_dels_pass']},
 'C': {'NORM': [], 'TUMOUR': ['C_P_GBM577_dels_pass', 'C_R_GBM625_dels_pass']},
 'E': {'NORM': ['E_norm_SMTB211_dels_pass'],
  'TUMOUR': ['E_RR_GBM937_dels_pass', 'E_R_GBM781_dels_pass']},
 'F': {'NORM': [], 'TUMOUR': ['F_P_GBM620_dels_pass', 'F_R_GBM691_dels_pass']},
 'G': {'NORM': ['G_norm_BT_2009038_dels_pass'],
  'TUMOUR': ['G_R_GBM833_dels_pass', 'G_P_GBM454_dels_pass']},
 'H': {'NORM': ['H_norm_BT_2010140_dels_pass'],
  'TUMOUR': ['H_R_GBM492_dels_pass', 'H_P_GBM460_dels_pass']},
 'I': {'NORM': [], 'TUMOUR': ['I_R_GBM532_dels_pass', 'I_P_GBM440_dels_pass']},
 'J': {'NORM': [],
  'TUMOUR': ['J_R_GBM498_dels_pass',
   'J_RR_GBM551_dels_pass',
   'J_P_GBM401_dels_pass']},
 'K': {'NORM': [], 'TUMOUR': ['K_R_GBM832_dels_pass', 'K_P_GBM529_dels_pass']},
 'L': {'NORM': ['L_norm_SMTB152_blood_dels_pass'],

In [14]:
# Print the first listed normal-sample dict name of every patient that has one
for norm_names in (sample_groups['NORM'] for sample_groups in del_sample_dict.values()):

    if norm_names:

        print(norm_names[0])

A_norm_G809_dels_pass
E_norm_SMTB211_dels_pass
G_norm_BT_2009038_dels_pass
H_norm_BT_2010140_dels_pass
L_norm_SMTB152_blood_dels_pass
O_norm_SMTB781_blood_dels_pass
P_norm_SMTB123_blood_dels_pass
Q_norm_SMTB665_blood_dels_pass


In [15]:
# Inspect the large-SV grouping: patient ID -> {'NORM': [...], 'TUMOUR': [...]} dict names
sv_sample_dict

{'A': {'NORM': ['A_norm_G809_SVs_pass'],
  'TUMOUR': ['A_R_GBM607_SVs_pass', 'A_RR_GBM809_SVs_pass']},
 'B': {'NORM': [], 'TUMOUR': ['B_P_GBM593_SVs_pass', 'B_R_GBM898_SVs_pass']},
 'C': {'NORM': [], 'TUMOUR': ['C_P_GBM577_SVs_pass', 'C_R_GBM625_SVs_pass']},
 'E': {'NORM': ['E_norm_SMTB211_SVs_pass'],
  'TUMOUR': ['E_RR_GBM937_SVs_pass', 'E_R_GBM781_SVs_pass']},
 'F': {'NORM': [], 'TUMOUR': ['F_P_GBM620_SVs_pass', 'F_R_GBM691_SVs_pass']},
 'G': {'NORM': ['G_norm_BT_2009038_SVs_pass'],
  'TUMOUR': ['G_R_GBM833_SVs_pass', 'G_P_GBM454_SVs_pass']},
 'H': {'NORM': ['H_norm_BT_2010140_SVs_pass'],
  'TUMOUR': ['H_R_GBM492_SVs_pass', 'H_P_GBM460_SVs_pass']},
 'I': {'NORM': [], 'TUMOUR': ['I_R_GBM532_SVs_pass', 'I_P_GBM440_SVs_pass']},
 'J': {'NORM': [],
  'TUMOUR': ['J_R_GBM498_SVs_pass',
   'J_RR_GBM551_SVs_pass',
   'J_P_GBM401_SVs_pass']},
 'K': {'NORM': [], 'TUMOUR': ['K_R_GBM832_SVs_pass', 'K_P_GBM529_SVs_pass']},
 'L': {'NORM': ['L_norm_SMTB152_blood_SVs_pass'],
  'TUMOUR': ['L_P_GBM618_

# 2. New normal samples

In [16]:
## Move into the directory that holds the new normal samples.
## Note that this rebinds `path`, which the cells below use as the sample root.

path = '/Users/ryanyutian/Desktop/McGill_Normal_Samples/'

print("Original working directory: {0}".format(os.getcwd()))

# One message per expected failure; a failure is reported but does not stop the notebook.
chdir_failure_messages = (
    (FileNotFoundError, "Directory: {0} does not exist"),
    (NotADirectoryError, "{0} is not a directory"),
    (PermissionError, "You do not have permissions to change to {0}"),
)

try:
    os.chdir(path)
    print("Current working directory: {0}".format(os.getcwd()))
except tuple(failure for failure, _template in chdir_failure_messages) as chdir_error:
    for failure, template in chdir_failure_messages:
        if isinstance(chdir_error, failure):
            print(template.format(path))
            break

Original working directory: /Users/ryanyutian/Desktop/TRI_Brain_glioma_vcf_raw
Current working directory: /Users/ryanyutian/Desktop/McGill_Normal_Samples


In [19]:
### Load vcf files into variables ###

## Every sub-directory of `path` is one sample, named by its sample ID.
## Requiring a real directory (in addition to skipping 'DS_Store') keeps any
## other stray file from being treated as a sample and crashing the loop.
new_samples = sorted(
    entry for entry in os.listdir(path)
    if 'DS_Store' not in entry and os.path.isdir(os.path.join(path, entry))
)

## Names of the module-level variables that hold each sample's vcf.Reader
new_vcfread_names = {'small_dels':[], 'other_svs':[]}

for sample_id in new_samples:

    sample_dir = os.path.join(path, sample_id)

    vcf_reader_name_dels = sample_id + '_vcf_dels'
    vcf_reader_name_svs = sample_id + '_vcf_svs'

    ## Open by full path so loading neither depends on nor changes the working
    ## directory. The handles stay open on purpose: the readers are consumed
    ## by later cells.
    globals()[vcf_reader_name_dels] = vcf.Reader(open(os.path.join(sample_dir, 'dels.vcf.gz'), 'rb'))
    globals()[vcf_reader_name_svs] = vcf.Reader(open(os.path.join(sample_dir, 'large_svs.vcf.gz'), 'rb'))

    new_vcfread_names['small_dels'].append(vcf_reader_name_dels)
    new_vcfread_names['other_svs'].append(vcf_reader_name_svs)

## As in the original cell, finish with the sample root as working directory
os.chdir(path)

In [26]:
# New normal samples grouped under the 'noalert' and 'onealert' labels
noalert_samples = [
    'EPT2115_Blood',
    'HSJ-051R1_blood',
    'HSJ-075_blood',
    'HSJ-076_blood',
    'HSJ-078_blood',
    'HSJ-130_Blood',
    'HSJ-142_Blood',
    'HSJ-173_Blood',
    'HSJ-184_Blood',
    'HSJ-192_Blood',
    'HSJ-200_Normal',
    'HSJ-21_blood',
    'JN-28_Normal',
    'MDT-AP-2130_Blood',
    'MDT-AP-2673_Blood',
    'MDT-AP-2859_Blood',
    'pHGG-02_Normal',
]

onealert_samples = [
    'BL521',
    'BL74',
    'EPT2105_Blood',
    'MDT-AP-1206_Blood',
]

# The usable set is every 'noalert' sample followed by every 'onealert' sample
usable_samples = noalert_samples + onealert_samples

In [28]:
# Keep only the readers whose sample ID (the reader name minus its '_vcf_dels'
# or '_vcf_svs' suffix) is in the usable sample list
usable_vcfread_names = {
    'small_dels': [
        reader_name for reader_name in new_vcfread_names['small_dels']
        if reader_name[:-9] in usable_samples
    ],
    'other_svs': [
        reader_name for reader_name in new_vcfread_names['other_svs']
        if reader_name[:-8] in usable_samples
    ],
}


In [29]:
# Inspect which reader names were kept for the usable new normal samples
usable_vcfread_names

{'small_dels': ['BL521_vcf_dels',
  'BL74_vcf_dels',
  'EPT2105_Blood_vcf_dels',
  'EPT2115_Blood_vcf_dels',
  'HSJ-051R1_blood_vcf_dels',
  'HSJ-075_blood_vcf_dels',
  'HSJ-076_blood_vcf_dels',
  'HSJ-078_blood_vcf_dels',
  'HSJ-130_Blood_vcf_dels',
  'HSJ-142_Blood_vcf_dels',
  'HSJ-173_Blood_vcf_dels',
  'HSJ-184_Blood_vcf_dels',
  'HSJ-192_Blood_vcf_dels',
  'HSJ-200_Normal_vcf_dels',
  'HSJ-21_blood_vcf_dels',
  'JN-28_Normal_vcf_dels',
  'MDT-AP-1206_Blood_vcf_dels',
  'MDT-AP-2130_Blood_vcf_dels',
  'MDT-AP-2673_Blood_vcf_dels',
  'MDT-AP-2859_Blood_vcf_dels',
  'pHGG-02_Normal_vcf_dels'],
 'other_svs': ['BL521_vcf_svs',
  'BL74_vcf_svs',
  'EPT2105_Blood_vcf_svs',
  'EPT2115_Blood_vcf_svs',
  'HSJ-051R1_blood_vcf_svs',
  'HSJ-075_blood_vcf_svs',
  'HSJ-076_blood_vcf_svs',
  'HSJ-078_blood_vcf_svs',
  'HSJ-130_Blood_vcf_svs',
  'HSJ-142_Blood_vcf_svs',
  'HSJ-173_Blood_vcf_svs',
  'HSJ-184_Blood_vcf_svs',
  'HSJ-192_Blood_vcf_svs',
  'HSJ-200_Normal_vcf_svs',
  'HSJ-21_blood_vcf

In [32]:
### Pull small deletion records and small-deletion BND records out of the new normal samples' vcf readers ###

## Results live in module-level variables named after the sample; the lists
## below remember those variable names for the later cells.
new_normal_del_dict_names = {'dels': [], 'del_BNDs': []}

for reader_name in usable_vcfread_names['small_dels']:

    ## Drop the trailing '_vcf_dels' (9 characters) to recover the sample ID
    sample_prefix = reader_name[:-len('_vcf_dels')]
    del_info = sample_prefix + '_dels'
    del_BND_info = sample_prefix + '_del_BNDs'

    extracted_dels, extracted_del_BNDs = extract_small_del_info(globals()[reader_name])
    globals()[del_info] = extracted_dels
    globals()[del_BND_info] = extracted_del_BNDs

    new_normal_del_dict_names['dels'].append(del_info)
    new_normal_del_dict_names['del_BNDs'].append(del_BND_info)

In [34]:
### Append the same-chromosome small-deletion BNDs to each new normal sample's small deletion dictionary ###

## The deletion dictionaries are extended in place, so running this cell a
## second time appends the same records again.
for bnd_dict_name in new_normal_del_dict_names['del_BNDs']:

    same_chr_bnds = combine_BNDs_on_same_chr(globals()[bnd_dict_name])

    ## '<sample>_del_BNDs' -> '<sample>_dels'
    sample_dels = globals()[bnd_dict_name[:-len('_del_BNDs')] + '_dels']

    for key, values in sample_dels.items():
        values.extend(same_chr_bnds[key])

In [35]:
### Pull large SV records and large BND records out of the new normal samples' vcf readers ###

## Results live in module-level variables named after the sample; the lists
## below remember those variable names for the later cells.
new_normal_sv_dict_names = {'SVs': [], 'BNDs': []}

for reader_name in usable_vcfread_names['other_svs']:

    ## Drop the trailing '_vcf_svs' (8 characters) to recover the sample ID
    sample_prefix = reader_name[:-len('_vcf_svs')]
    SV_info = sample_prefix + '_SVs'
    BND_info = sample_prefix + '_BNDs'

    extracted_SVs, extracted_BNDs = extract_SV_info_wo_UNK(globals()[reader_name])
    globals()[SV_info] = extracted_SVs
    globals()[BND_info] = extracted_BNDs

    new_normal_sv_dict_names['SVs'].append(SV_info)
    new_normal_sv_dict_names['BNDs'].append(BND_info)

In [36]:
### Append the same-chromosome large BNDs to each new normal sample's large SV dictionary ###

## The SV dictionaries are extended in place, so running this cell a second
## time appends the same records again.
for bnd_dict_name in new_normal_sv_dict_names['BNDs']:

    same_chr_bnds = combine_BNDs_on_same_chr(globals()[bnd_dict_name])

    ## '<sample>_BNDs' -> '<sample>_SVs'
    sample_svs = globals()[bnd_dict_name[:-len('_BNDs')] + '_SVs']

    for key, values in sample_svs.items():
        values.extend(same_chr_bnds[key])

In [38]:
### Separate the new normal SVs tagged by the built-in filters of Long Ranger from those that pass ###

new_normal_small_del_passed_dict_names, new_normal_small_del_failed_dict_names = [], []
new_normal_large_sv_passed_dict_names, new_normal_large_sv_failed_dict_names = [], []

## (names of the dicts to split, list collecting '_pass' names, list collecting '_fail' names)
new_normal_filter_split_jobs = (
    (new_normal_del_dict_names['dels'],
     new_normal_small_del_passed_dict_names,
     new_normal_small_del_failed_dict_names),
    (new_normal_sv_dict_names['SVs'],
     new_normal_large_sv_passed_dict_names,
     new_normal_large_sv_failed_dict_names),
)

for source_names, passed_names, failed_names in new_normal_filter_split_jobs:

    for source_name in source_names:

        temp_pass_name = source_name + '_pass'
        temp_fail_name = source_name + '_fail'

        ## split_sv_by_filter returns the pair (passed, failed)
        passed_records, failed_records = split_sv_by_filter(globals()[source_name])
        globals()[temp_pass_name] = passed_records
        globals()[temp_fail_name] = failed_records

        passed_names.append(temp_pass_name)
        failed_names.append(temp_fail_name)

In [42]:
### Save new normal SVs based on their status of the Long Ranger built-in filters ###

new_normal_small_del_filter_pass_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/filter_pass/small_dels'
new_normal_small_del_filter_fail_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/filter_fail/small_dels'
new_normal_large_sv_filter_pass_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/filter_pass/large_svs'
new_normal_large_sv_filter_fail_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/filter_fail/large_svs'

# (output directory, names of the dicts written there as '<dict name>.csv')
new_normal_filter_outputs = (
    (new_normal_small_del_filter_pass_path, new_normal_small_del_passed_dict_names),
    (new_normal_small_del_filter_fail_path, new_normal_small_del_failed_dict_names),
    (new_normal_large_sv_filter_pass_path, new_normal_large_sv_passed_dict_names),
    (new_normal_large_sv_filter_fail_path, new_normal_large_sv_failed_dict_names),
)

for out_dir, dict_names in new_normal_filter_outputs:

    # Neither os.chdir nor to_csv creates a missing directory, so create it up front
    os.makedirs(out_dir, exist_ok=True)

    # Kept from the original cell: the working directory follows the output directory
    os.chdir(out_dir)

    for dict_name in dict_names:

        temp_df = pd.DataFrame.from_dict(globals()[dict_name])
        temp_df.to_csv(os.path.join(out_dir, dict_name + '.csv'), index=False, sep=',')

# 3. Filtering

In [45]:
### Create new panel of normals ###

# Small deletions

small_del_norm_panel = {}

# Patients with a matched normal contribute their first listed normal sample
matched_norm_del_names = [
    sample_groups['NORM'][0]
    for sample_groups in del_sample_dict.values()
    if len(sample_groups['NORM']) != 0
]

# Matched normals first, then the new normal samples; records are pooled per key
for norm_dict_name in matched_norm_del_names + new_normal_small_del_passed_dict_names:

    for key, records in globals()[norm_dict_name].items():

        small_del_norm_panel.setdefault(key, []).extend(records)

In [48]:
# Large SVs

large_sv_norm_panel = {}

# Patients with a matched normal contribute their first listed normal sample
matched_norm_sv_names = [
    sample_groups['NORM'][0]
    for sample_groups in sv_sample_dict.values()
    if len(sample_groups['NORM']) != 0
]

# Matched normals first, then the new normal samples; records are pooled per key
for norm_dict_name in matched_norm_sv_names + new_normal_large_sv_passed_dict_names:

    for key, records in globals()[norm_dict_name].items():

        large_sv_norm_panel.setdefault(key, []).extend(records)

In [50]:
### Write the pooled panel of normals to CSV: small deletions, then large SVs split by type

panel_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/panel'
os.chdir(panel_path)

temp_df = pd.DataFrame.from_dict(small_del_norm_panel)
temp_df.to_csv(panel_path + '/small_del_norm_panel.csv', index=False, sep=',')

# split_DEL_INV_DUP returns the three per-type dicts in the order DEL, INV, DUP
DEL_norm, INV_norm, DUP_norm = split_DEL_INV_DUP(large_sv_norm_panel)

for type_panel, csv_suffix in (
    (DEL_norm, '/large_del_norm_panel.csv'),
    (DUP_norm, '/large_dup_norm_panel.csv'),
    (INV_norm, '/large_inv_norm_panel.csv'),
):
    temp_df = pd.DataFrame.from_dict(type_panel)
    temp_df.to_csv(panel_path + csv_suffix, index=False, sep=',')

In [168]:
### Save new panel of normals separately ###

del_panel_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/panel_separated/small_dels'

# to_csv does not create missing directories, so make sure the target exists
os.makedirs(del_panel_path, exist_ok=True)

# Small deletions

# Patients with a matched normal contribute their first listed normal sample
matched_norm_del_names = [
    sample_groups['NORM'][0]
    for sample_groups in del_sample_dict.values()
    if len(sample_groups['NORM']) != 0
]

# One CSV per normal sample, named after the dict with its '_pass' suffix removed
for norm_dict_name in matched_norm_del_names + new_normal_small_del_passed_dict_names:

    temp_df = pd.DataFrame.from_dict(globals()[norm_dict_name])
    temp_df.to_csv(os.path.join(del_panel_path, norm_dict_name[:-len('_pass')] + '.csv'),
                   index=False, sep=',')

In [169]:
### Save new panel of normals separately ###

sv_panel_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/panel_separated/large_svs'


# Large SVs

# Patients with a matched normal contribute their first listed normal sample
matched_norm_sv_names = [
    sample_groups['NORM'][0]
    for sample_groups in sv_sample_dict.values()
    if len(sample_groups['NORM']) != 0
]

for norm_dict_name in matched_norm_sv_names + new_normal_large_sv_passed_dict_names:

    # split_DEL_INV_DUP returns the three per-type dicts in the order DEL, INV, DUP
    temp_DEL_norm, temp_INV_norm, temp_DUP_norm = split_DEL_INV_DUP(globals()[norm_dict_name])

    # Output files are named after the dict with its '_pass' suffix removed
    sample_prefix = norm_dict_name[:-len('_pass')]

    for sv_type, type_dict in (('DEL', temp_DEL_norm), ('DUP', temp_DUP_norm), ('INV', temp_INV_norm)):

        # Each SV type goes to its own sub-directory; to_csv does not create
        # missing directories, so make sure it exists
        type_dir = os.path.join(sv_panel_path, sv_type)
        os.makedirs(type_dir, exist_ok=True)

        temp_df = pd.DataFrame.from_dict(type_dict)
        temp_df.to_csv(os.path.join(type_dir, sample_prefix + '_' + sv_type + '.csv'),
                       index=False, sep=',')


In [170]:
# Distance cutoffs handed to somatic_sv_call_by_nearest_normal by the filtering
# cells: one for small deletions, one for large SVs
dist_cutoff_del, dist_cutoff_sv = 20, 1000

# Three independent registries of result-variable names (somatic calls,
# germline calls, distance dicts), each split into large SVs by type plus
# small deletions
somatic_dict_names, germline_dict_names, dist_dict_names = (
    {'large_svs': {sv_type: [] for sv_type in ('DEL', 'DUP', 'INV')}, 'dels': []}
    for _registry in range(3)
)

In [154]:
### Call somatic vs germline small deletions in every tumour sample against the panel of normals ###

## The name lists are appended to on every run, so re-running this cell
## without re-running the cell that creates them records each name twice.
for sample_groups in del_sample_dict.values():

    for sample in sample_groups['TUMOUR']:

        print('Currently Filtering: ' + sample)

        ## '<sample>_dels_pass' -> '<sample>'
        sample_prefix = sample[:-len('_dels_pass')]
        temp_somatic_dict = sample_prefix + '_somatic_dels'
        temp_germline_dict = sample_prefix + '_germline_dels'
        temp_dist_dict = sample_prefix + '_dels_dist_dict'

        somatic_calls, germline_calls, nearest_dists = somatic_sv_call_by_nearest_normal(
            small_del_norm_panel, globals()[sample], dist_cutoff_del)

        globals()[temp_somatic_dict] = somatic_calls
        globals()[temp_germline_dict] = germline_calls
        globals()[temp_dist_dict] = nearest_dists

        somatic_dict_names['dels'].append(temp_somatic_dict)
        germline_dict_names['dels'].append(temp_germline_dict)
        dist_dict_names['dels'].append(temp_dist_dict)

Currently Filtering: A_R_GBM607_dels_pass
Currently Filtering: A_RR_GBM809_dels_pass
Currently Filtering: B_P_GBM593_dels_pass
Currently Filtering: B_R_GBM898_dels_pass
Currently Filtering: C_P_GBM577_dels_pass
Currently Filtering: C_R_GBM625_dels_pass
Currently Filtering: E_RR_GBM937_dels_pass
Currently Filtering: E_R_GBM781_dels_pass
Currently Filtering: F_P_GBM620_dels_pass
Currently Filtering: F_R_GBM691_dels_pass
Currently Filtering: G_R_GBM833_dels_pass
Currently Filtering: G_P_GBM454_dels_pass
Currently Filtering: H_R_GBM492_dels_pass
Currently Filtering: H_P_GBM460_dels_pass
Currently Filtering: I_R_GBM532_dels_pass
Currently Filtering: I_P_GBM440_dels_pass
Currently Filtering: J_R_GBM498_dels_pass
Currently Filtering: J_RR_GBM551_dels_pass
Currently Filtering: J_P_GBM401_dels_pass
Currently Filtering: K_R_GBM832_dels_pass
Currently Filtering: K_P_GBM529_dels_pass
Currently Filtering: L_P_GBM618_dels_pass
Currently Filtering: L_R_SMTB152_dels_pass
Currently Filtering: M_P_GBM67

In [171]:
### Filtering for somatic large SVs using panel of normals ###

# The panel of normals is the same for every sample, so split it by SV type
# once up front instead of once per tumour sample.
# NOTE(review): this assumes somatic_sv_call_by_nearest_normal does not modify
# the panel dict it is given - confirm in filtering_for_somatic_SVs_functions.ipynb.
temp_DEL_norm, temp_INV_norm, temp_DUP_norm = split_DEL_INV_DUP(large_sv_norm_panel)
norm_panel_by_type = {'DEL': temp_DEL_norm, 'INV': temp_INV_norm, 'DUP': temp_DUP_norm}

# The name lists are appended to on every run, so re-running this cell without
# re-running the cell that creates them records each name twice.
for p_id in sv_sample_dict:

    for sample in sv_sample_dict[p_id]['TUMOUR']:

        print('Currently Filtering: ' + sample)

        temp_DEL, temp_INV, temp_DUP = split_DEL_INV_DUP(globals()[sample])

        # '<sample>_SVs_pass' -> '<sample>'
        sample_prefix = sample[:-len('_SVs_pass')]

        for sv_type, sample_svs in (('DEL', temp_DEL), ('INV', temp_INV), ('DUP', temp_DUP)):

            # e.g. '<sample>_somatic_SV_DELs', '<sample>_germline_SV_DELs', '<sample>_SV_DELs_dist_dict'
            temp_somatic_dict = sample_prefix + '_somatic_SV_' + sv_type + 's'
            temp_germline_dict = sample_prefix + '_germline_SV_' + sv_type + 's'
            temp_dist_dict = sample_prefix + '_SV_' + sv_type + 's_dist_dict'

            globals()[temp_somatic_dict], globals()[temp_germline_dict], globals()[temp_dist_dict] = \
            somatic_sv_call_by_nearest_normal(norm_panel_by_type[sv_type], sample_svs, dist_cutoff_sv)

            somatic_dict_names['large_svs'][sv_type].append(temp_somatic_dict)
            germline_dict_names['large_svs'][sv_type].append(temp_germline_dict)
            dist_dict_names['large_svs'][sv_type].append(temp_dist_dict)

Currently Filtering: A_R_GBM607_SVs_pass
Currently Filtering: A_RR_GBM809_SVs_pass
Currently Filtering: B_P_GBM593_SVs_pass
Currently Filtering: B_R_GBM898_SVs_pass
Currently Filtering: C_P_GBM577_SVs_pass
Currently Filtering: C_R_GBM625_SVs_pass
Currently Filtering: E_RR_GBM937_SVs_pass
Currently Filtering: E_R_GBM781_SVs_pass
Currently Filtering: F_P_GBM620_SVs_pass
Currently Filtering: F_R_GBM691_SVs_pass
Currently Filtering: G_R_GBM833_SVs_pass
Currently Filtering: G_P_GBM454_SVs_pass
Currently Filtering: H_R_GBM492_SVs_pass
Currently Filtering: H_P_GBM460_SVs_pass
Currently Filtering: I_R_GBM532_SVs_pass
Currently Filtering: I_P_GBM440_SVs_pass
Currently Filtering: J_R_GBM498_SVs_pass
Currently Filtering: J_RR_GBM551_SVs_pass
Currently Filtering: J_P_GBM401_SVs_pass
Currently Filtering: K_R_GBM832_SVs_pass
Currently Filtering: K_P_GBM529_SVs_pass
Currently Filtering: L_P_GBM618_SVs_pass
Currently Filtering: L_R_SMTB152_SVs_pass
Currently Filtering: M_P_GBM672_SVs_pass
Currently Fi

In [156]:
### Save filtered somatic and germline small deletions and large SVs ###

somatic_small_del_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/somatic_panel/small_dels'
germline_small_del_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/germline_panel/small_dels'

somatic_large_sv_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/somatic_panel/large_svs'
germline_large_sv_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/germline_panel/large_svs'


# Small deletions: one '<dict name>.csv' per recorded result dict
for out_dir, dict_names in (
    (somatic_small_del_path, somatic_dict_names['dels']),
    (germline_small_del_path, germline_dict_names['dels']),
):
    # Neither os.chdir nor to_csv creates a missing directory, so create it up front
    os.makedirs(out_dir, exist_ok=True)

    # Kept from the original cell: the working directory follows the output directory
    os.chdir(out_dir)

    for dict_name in dict_names:

        temp_df = pd.DataFrame.from_dict(globals()[dict_name])
        temp_df.to_csv(os.path.join(out_dir, dict_name + '.csv'), index=False, sep=',')


# Large SVs: each SV type is written to its own DEL/DUP/INV sub-directory.
# The per-type name lists recorded by the filtering cell are used directly
# rather than rebuilding the DUP/INV names from the DEL names.
for out_dir, names_by_type in (
    (somatic_large_sv_path, somatic_dict_names['large_svs']),
    (germline_large_sv_path, germline_dict_names['large_svs']),
):
    os.makedirs(out_dir, exist_ok=True)
    os.chdir(out_dir)

    for sv_type in ('DEL', 'DUP', 'INV'):

        type_dir = os.path.join(out_dir, sv_type)
        os.makedirs(type_dir, exist_ok=True)

        for dict_name in names_by_type[sv_type]:

            temp_df = pd.DataFrame.from_dict(globals()[dict_name])
            temp_df.to_csv(os.path.join(type_dir, dict_name + '.csv'), index=False, sep=',')

# 4. Control

In [93]:
# Distance cutoffs for the control run. This rebinds the same two names the
# tumour filtering used; the large-SV cutoff here is 300.
dist_cutoff_del, dist_cutoff_sv = 20, 300

# Three independent registries of result-variable names for the control run
# (somatic calls, germline calls, distance dicts), each split into large SVs
# by type plus small deletions
control_somatic_dict_names, control_germline_dict_names, control_dist_dict_names = (
    {'large_svs': {sv_type: [] for sv_type in ('DEL', 'DUP', 'INV')}, 'dels': []}
    for _registry in range(3)
)

In [99]:
### Control: run the new normal samples' small deletions through the same panel-of-normals filter ###

## NOTE(review): small_del_norm_panel is built from these same new normal
## samples, so each sample's own records are presumably present in the panel
## it is compared against - confirm whether a leave-one-out panel was intended.
for sample in new_normal_small_del_passed_dict_names:

    print('Currently Filtering: ' + sample)

    ## '<sample>_dels_pass' -> '<sample>'
    sample_prefix = sample[:-len('_dels_pass')]
    temp_somatic_dict = sample_prefix + '_somatic_dels'
    temp_germline_dict = sample_prefix + '_germline_dels'
    temp_dist_dict = sample_prefix + '_dels_dist_dict'

    somatic_calls, germline_calls, nearest_dists = somatic_sv_call_by_nearest_normal(
        small_del_norm_panel, globals()[sample], dist_cutoff_del)

    globals()[temp_somatic_dict] = somatic_calls
    globals()[temp_germline_dict] = germline_calls
    globals()[temp_dist_dict] = nearest_dists

    control_somatic_dict_names['dels'].append(temp_somatic_dict)
    control_germline_dict_names['dels'].append(temp_germline_dict)
    control_dist_dict_names['dels'].append(temp_dist_dict)

Currently Filtering: BL521_dels_pass
Currently Filtering: BL74_dels_pass
Currently Filtering: EPT2105_Blood_dels_pass
Currently Filtering: EPT2115_Blood_dels_pass
Currently Filtering: HSJ-051R1_blood_dels_pass
Currently Filtering: HSJ-075_blood_dels_pass
Currently Filtering: HSJ-076_blood_dels_pass
Currently Filtering: HSJ-078_blood_dels_pass
Currently Filtering: HSJ-130_Blood_dels_pass
Currently Filtering: HSJ-142_Blood_dels_pass
Currently Filtering: HSJ-173_Blood_dels_pass
Currently Filtering: HSJ-184_Blood_dels_pass
Currently Filtering: HSJ-192_Blood_dels_pass
Currently Filtering: HSJ-200_Normal_dels_pass
Currently Filtering: HSJ-21_blood_dels_pass
Currently Filtering: JN-28_Normal_dels_pass
Currently Filtering: MDT-AP-1206_Blood_dels_pass
Currently Filtering: MDT-AP-2130_Blood_dels_pass
Currently Filtering: MDT-AP-2673_Blood_dels_pass
Currently Filtering: MDT-AP-2859_Blood_dels_pass
Currently Filtering: pHGG-02_Normal_dels_pass


In [100]:
### Filtering for somatic large SVs using panel of normals ###

# The panel of normals is the same for every sample, so split it by SV type once
# up front instead of once per sample inside the loop.
temp_DEL_norm, temp_INV_norm, temp_DUP_norm = split_DEL_INV_DUP(large_sv_norm_panel)

for sample in new_normal_large_sv_passed_dict_names:

    print('Currently Filtering: ' + sample)

    temp_DEL, temp_INV, temp_DUP = split_DEL_INV_DUP(globals()[sample])

    # Drop the trailing 9 characters ('_SVs_pass') to recover the sample prefix
    temp_prefix = sample[:-9]

    # Same procedure for each SV type: compare the sample's calls against the
    # matching slice of the panel of normals, store the three results as globals
    # named after the sample, and record those names in the registries.
    for temp_sv_type, temp_sample_svs, temp_norm_svs in (('DEL', temp_DEL, temp_DEL_norm),
                                                         ('INV', temp_INV, temp_INV_norm),
                                                         ('DUP', temp_DUP, temp_DUP_norm)):

        temp_somatic_name = temp_prefix + '_somatic_SV_' + temp_sv_type + 's'
        temp_germline_name = temp_prefix + '_germline_SV_' + temp_sv_type + 's'
        temp_dist_name = temp_prefix + '_SV_' + temp_sv_type + 's_dist_dict'

        globals()[temp_somatic_name], globals()[temp_germline_name], globals()[temp_dist_name] = \
        somatic_sv_call_by_nearest_normal(temp_norm_svs, temp_sample_svs, dist_cutoff_sv)

        control_somatic_dict_names['large_svs'][temp_sv_type].append(temp_somatic_name)
        control_germline_dict_names['large_svs'][temp_sv_type].append(temp_germline_name)
        control_dist_dict_names['large_svs'][temp_sv_type].append(temp_dist_name)

Currently Filtering: BL521_SVs_pass
Currently Filtering: BL74_SVs_pass
Currently Filtering: EPT2105_Blood_SVs_pass
Currently Filtering: EPT2115_Blood_SVs_pass
Currently Filtering: HSJ-051R1_blood_SVs_pass
Currently Filtering: HSJ-075_blood_SVs_pass
Currently Filtering: HSJ-076_blood_SVs_pass
Currently Filtering: HSJ-078_blood_SVs_pass
Currently Filtering: HSJ-130_Blood_SVs_pass
Currently Filtering: HSJ-142_Blood_SVs_pass
Currently Filtering: HSJ-173_Blood_SVs_pass
Currently Filtering: HSJ-184_Blood_SVs_pass
Currently Filtering: HSJ-192_Blood_SVs_pass
Currently Filtering: HSJ-200_Normal_SVs_pass
Currently Filtering: HSJ-21_blood_SVs_pass
Currently Filtering: JN-28_Normal_SVs_pass
Currently Filtering: MDT-AP-1206_Blood_SVs_pass
Currently Filtering: MDT-AP-2130_Blood_SVs_pass
Currently Filtering: MDT-AP-2673_Blood_SVs_pass
Currently Filtering: MDT-AP-2859_Blood_SVs_pass
Currently Filtering: pHGG-02_Normal_SVs_pass


In [101]:
### Save filtered controls ###

somatic_small_del_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/control_somatic/small_dels'
germline_small_del_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/control_germline/small_dels'

somatic_large_sv_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/control_somatic/large_svs'
germline_large_sv_path = '/Users/ryanyutian/Desktop/McGill_Normal_sv_processed/control_germline/large_svs'


def _save_named_dicts_as_csv(dict_names, out_dir):
    """Write each named global dict to ``<out_dir>/<name>.csv``.

    Parameters
    ----------
    dict_names : iterable of str
        Names of notebook-level variables. Each one is looked up through
        ``globals()`` and converted with ``pd.DataFrame.from_dict``.
    out_dir : str
        Destination directory. It is created (with parents) if it does not
        exist, so a missing sub-folder no longer makes ``to_csv`` fail.

    Raises
    ------
    KeyError
        If a name in ``dict_names`` is not defined as a global variable.
    """
    os.makedirs(out_dir, exist_ok=True)

    for dict_name in dict_names:
        temp_df = pd.DataFrame.from_dict(globals()[dict_name])
        temp_df.to_csv(os.path.join(out_dir, dict_name + '.csv'), index=False, sep=',')


# Small deletions: one flat folder per call class (somatic, then germline).
for temp_out_dir, temp_registry in ((somatic_small_del_path, control_somatic_dict_names),
                                    (germline_small_del_path, control_germline_dict_names)):

    os.makedirs(temp_out_dir, exist_ok=True)
    # All writes use absolute paths; the chdir is kept only in case later cells
    # rely on the working directory this cell leaves behind.
    os.chdir(temp_out_dir)

    _save_named_dicts_as_csv(temp_registry['dels'], temp_out_dir)


# Large SVs: one sub-folder per SV type under each call-class folder.
# Names are read from each type's own registry list rather than being derived
# by slicing the DEL names.
for temp_out_dir, temp_registry in ((somatic_large_sv_path, control_somatic_dict_names),
                                    (germline_large_sv_path, control_germline_dict_names)):

    os.makedirs(temp_out_dir, exist_ok=True)
    os.chdir(temp_out_dir)

    for temp_sv_type in ('DEL', 'DUP', 'INV'):
        _save_named_dicts_as_csv(temp_registry['large_svs'][temp_sv_type],
                                 os.path.join(temp_out_dir, temp_sv_type))