In [1]:
# import necessary libraries for the analysis
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

In [2]:
## Functions used below are imported from another notebook, run before changing directory
## (later cells call os.chdir, after which this relative notebook path would no longer resolve).
## NOTE(review): overlap_func is called below but never defined in this notebook — presumably it comes from here.

%run overlap_and_range_functions.ipynb

In [3]:
# Names assigned to the nine tab-separated columns of the annotation table.
gff3_columns = [
    'seqname', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attribute',
]

# Load the GENCODE v43 GFF3 annotation; everything after a '#' is treated as a comment.
gencode = pd.read_table(
    "/Users/ryanyutian/Desktop/annotation_dataset/gencode.v43.annotation.gff3",
    comment="#",
    sep="\t",
    names=gff3_columns,
)
gencode.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,chr1,HAVANA,gene,11869,14409,.,+,.,ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000290825.1;...
2,chr1,HAVANA,exon,11869,12227,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...
3,chr1,HAVANA,exon,12613,12721,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...
4,chr1,HAVANA,exon,13221,14409,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...


In [4]:
# Sorted list of the distinct sequence (chromosome) names present in the annotation.
np.unique(gencode['seqname'].values)

array(['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21',
       'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
       'chrM', 'chrX', 'chrY'], dtype=object)

In [5]:
# Sorted list of the distinct feature types present in the annotation.
np.unique(gencode['feature'].values)

array(['CDS', 'exon', 'five_prime_UTR', 'gene', 'start_codon',
       'stop_codon', 'stop_codon_redefined_as_selenocysteine',
       'three_prime_UTR', 'transcript'], dtype=object)

In [6]:
# Show one raw attribute string (sixth row) to see the key=value layout that gene_info parses.
gencode['attribute'].iloc[5]

'ID=ENSG00000223972.6;gene_id=ENSG00000223972.6;gene_type=transcribed_unprocessed_pseudogene;gene_name=DDX11L1;level=2;hgnc_id=HGNC:37102;havana_gene=OTTHUMG00000000961.2'

In [7]:
def gene_info(x):
    """Extract parent and gene-level fields from a GFF3 attribute string.

    The attribute string is a ';'-separated list of ``key=value`` pairs.
    Keys are matched exactly, so an attribute whose key or value merely
    contains the text of another key (for example
    ``transcript_support_level`` versus ``level``) is never picked up by
    mistake. Only the first '=' in a pair separates key from value, so
    values that themselves contain '=' are kept whole.

    Parameters
    ----------
    x : str
        Attribute string, e.g.
        ``'ID=...;gene_id=...;gene_type=...;gene_name=...;level=2'``.

    Returns
    -------
    tuple
        ``(parent, g_id, g_name, g_type, g_level)`` where ``parent`` is the
        value of ``Parent`` or the string ``'N/A'`` when that key is absent,
        and ``g_level`` is an ``int``.

    Raises
    ------
    KeyError
        If ``gene_id``, ``gene_name``, ``gene_type`` or ``level`` is missing.
    ValueError
        If the ``level`` value is not an integer.
    """
    attributes = {}

    for pair in x.split(";"):

        key, separator, value = pair.partition("=")

        if separator:
            # Keep the first occurrence of a key, as the original lookup did.
            attributes.setdefault(key.strip(), value)

    parent = attributes.get('Parent', 'N/A')

    g_id = attributes['gene_id']
    g_name = attributes['gene_name']
    g_type = attributes['gene_type']
    g_level = int(attributes['level'])

    return (parent, g_id, g_name, g_type, g_level)

In [8]:
# Parse every attribute string with gene_info and transpose the resulting
# 5-tuples into five new columns on the annotation table.
(gencode['parent'],
 gencode['gene_id'],
 gencode['gene_name'],
 gencode['gene_type'],
 gencode['gene_level']) = zip(*gencode['attribute'].apply(gene_info))

gencode.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute,parent,gene_id,gene_name,gene_type,gene_level
0,chr1,HAVANA,gene,11869,14409,.,+,.,ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...,,ENSG00000290825.1,DDX11L2,lncRNA,2
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000290825.1;...,ENSG00000290825.1,ENSG00000290825.1,DDX11L2,lncRNA,2
2,chr1,HAVANA,exon,11869,12227,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...,ENST00000456328.2,ENSG00000290825.1,DDX11L2,lncRNA,2
3,chr1,HAVANA,exon,12613,12721,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...,ENST00000456328.2,ENSG00000290825.1,DDX11L2,lncRNA,2
4,chr1,HAVANA,exon,13221,14409,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...,ENST00000456328.2,ENSG00000290825.1,DDX11L2,lncRNA,2


In [9]:
# Rename the sequence-name column to 'chr'; all other columns are left as they are.
gencode = gencode.rename(columns=dict(seqname='chr'))

In [10]:
# Sorted list of the distinct gene types extracted from the attribute strings.
np.unique(gencode['gene_type'].values)

array(['IG_C_gene', 'IG_C_pseudogene', 'IG_D_gene', 'IG_J_gene',
       'IG_J_pseudogene', 'IG_V_gene', 'IG_V_pseudogene', 'IG_pseudogene',
       'Mt_rRNA', 'Mt_tRNA', 'TEC', 'TR_C_gene', 'TR_D_gene', 'TR_J_gene',
       'TR_J_pseudogene', 'TR_V_gene', 'TR_V_pseudogene', 'artifact',
       'lncRNA', 'miRNA', 'misc_RNA', 'processed_pseudogene',
       'protein_coding', 'pseudogene', 'rRNA', 'rRNA_pseudogene',
       'ribozyme', 'sRNA', 'scRNA', 'scaRNA', 'snRNA', 'snoRNA',
       'transcribed_processed_pseudogene',
       'transcribed_unitary_pseudogene',
       'transcribed_unprocessed_pseudogene',
       'translated_processed_pseudogene',
       'translated_unprocessed_pseudogene', 'unitary_pseudogene',
       'unprocessed_pseudogene', 'vault_RNA'], dtype=object)

In [11]:
# Sorted list of the distinct feature types (same check as earlier in the notebook).
np.unique(gencode['feature'].values)

array(['CDS', 'exon', 'five_prime_UTR', 'gene', 'start_codon',
       'stop_codon', 'stop_codon_redefined_as_selenocysteine',
       'three_prime_UTR', 'transcript'], dtype=object)

In [12]:
# Preview the first five rows of the annotation table after the column rename.
gencode.head(n=5)

Unnamed: 0,chr,source,feature,start,end,score,strand,frame,attribute,parent,gene_id,gene_name,gene_type,gene_level
0,chr1,HAVANA,gene,11869,14409,.,+,.,ID=ENSG00000290825.1;gene_id=ENSG00000290825.1...,,ENSG00000290825.1,DDX11L2,lncRNA,2
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000290825.1;...,ENSG00000290825.1,ENSG00000290825.1,DDX11L2,lncRNA,2
2,chr1,HAVANA,exon,11869,12227,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...,ENST00000456328.2,ENSG00000290825.1,DDX11L2,lncRNA,2
3,chr1,HAVANA,exon,12613,12721,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...,ENST00000456328.2,ENSG00000290825.1,DDX11L2,lncRNA,2
4,chr1,HAVANA,exon,13221,14409,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...,ENST00000456328.2,ENSG00000290825.1,DDX11L2,lncRNA,2


In [13]:
# Example call: annotation rows overlapping chr1:1,200,000-1,300,000, tagged with
# input index 1, SV type 'del' and sample ID 'sample'.
overlap_func('chr1', 1_200_000, 1_300_000, 1, 'del', 'sample', gencode)

Unnamed: 0,chr,source,feature,start,end,score,strand,frame,attribute,parent,gene_id,gene_name,gene_type,gene_level,percent_overlap,input_start,input_end,input_ind,input_sv_type,input_sample_id
2558,chr1,HAVANA,gene,1203508,1206592,.,-,.,ID=ENSG00000186891.14;gene_id=ENSG00000186891....,,ENSG00000186891.14,TNFRSF18,protein_coding,2,100.00,1200000,1300000,1,del,sample
2559,chr1,HAVANA,transcript,1203508,1206571,.,-,.,ID=ENST00000328596.10;Parent=ENSG00000186891.1...,ENSG00000186891.14,ENSG00000186891.14,TNFRSF18,protein_coding,2,100.00,1200000,1300000,1,del,sample
2560,chr1,HAVANA,exon,1206385,1206571,.,-,.,ID=exon:ENST00000328596.10:1;Parent=ENST000003...,ENST00000328596.10,ENSG00000186891.14,TNFRSF18,protein_coding,2,100.00,1200000,1300000,1,del,sample
2561,chr1,HAVANA,CDS,1206385,1206571,.,-,0,ID=CDS:ENST00000328596.10;Parent=ENST000003285...,ENST00000328596.10,ENSG00000186891.14,TNFRSF18,protein_coding,2,100.00,1200000,1300000,1,del,sample
2562,chr1,HAVANA,start_codon,1206569,1206571,.,-,0,ID=start_codon:ENST00000328596.10;Parent=ENST0...,ENST00000328596.10,ENSG00000186891.14,TNFRSF18,protein_coding,2,100.00,1200000,1300000,1,del,sample
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,chr1,HAVANA,exon,1299973,1300068,.,-,.,ID=exon:ENST00000472541.5:8;Parent=ENST0000047...,ENST00000472541.5,ENSG00000131584.19,ACAP3,protein_coding,2,29.17,1200000,1300000,1,del,sample
3610,chr1,HAVANA,exon,1298744,1299905,.,-,.,ID=exon:ENST00000472541.5:9;Parent=ENST0000047...,ENST00000472541.5,ENSG00000131584.19,ACAP3,protein_coding,2,100.00,1200000,1300000,1,del,sample
3647,chr1,ENSEMBL,gene,1296110,1296170,.,-,.,ID=ENSG00000278073.1;gene_id=ENSG00000278073.1...,,ENSG00000278073.1,MIR6726,miRNA,3,100.00,1200000,1300000,1,del,sample
3648,chr1,ENSEMBL,transcript,1296110,1296170,.,-,.,ID=ENST00000613751.1;Parent=ENSG00000278073.1;...,ENSG00000278073.1,ENSG00000278073.1,MIR6726,miRNA,3,100.00,1200000,1300000,1,del,sample


# Small dels

In [14]:
### Load filtered somatic small deletions ###

##
somatic_small_del_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel_final/small_dels'

# The files are read by bare name below, so the working directory must be the data folder.
os.chdir(somatic_small_del_path)

somatic_small_del_wo_highoverlap_df_names = []

# NOTE(review): the 'DS' substring filter presumably exists to skip .DS_Store;
# it would also skip any data file whose name contains 'DS' — confirm.
temp_files = sorted(name for name in os.listdir(somatic_small_del_path) if 'DS' not in name)

for file_name in temp_files:

    # Drop the last four characters of the file name to get the table name,
    # and expose the table as a global variable under that name.
    table_name = file_name[:-4]
    globals()[table_name] = pd.read_csv(file_name)
    somatic_small_del_wo_highoverlap_df_names.append(table_name)

In [15]:
small_del_gene_list_names = []

for small_del_df_name in somatic_small_del_wo_highoverlap_df_names:

    # The sample ID is the table name minus its fixed 41-character suffix.
    sample_id = small_del_df_name[:-41]
    print('Currently analyzing: ' + sample_id)
    temp_df = globals()[small_del_df_name]

    # Collect the per-variant overlap tables and concatenate them once:
    # DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated table on every call.
    temp_overlaps = []

    for index, row in temp_df.iterrows():

        temp_overlap = overlap_func(row['CHROM'], row['POS'], row['END'], index,
                                    'del', sample_id, gencode)
        temp_overlaps.append(temp_overlap)

    # pd.concat raises on an empty list, so a sample without variants keeps
    # the empty table it got before.
    temp_output_df = pd.concat(temp_overlaps) if temp_overlaps else pd.DataFrame()

    temp_gene_list_name = sample_id + '_small_del_gene_list'
    small_del_gene_list_names.append(temp_gene_list_name)

    # Expose the result as a global variable named after the sample.
    globals()[temp_gene_list_name] = temp_output_df

Currently analyzing: A_RR_GBM809
Currently analyzing: A_R_GBM607
Currently analyzing: B_P_GBM593
Currently analyzing: B_R_GBM898
Currently analyzing: C_P_GBM577
Currently analyzing: C_R_GBM625
Currently analyzing: E_RR_GBM937
Currently analyzing: E_R_GBM781
Currently analyzing: F_P_GBM620
Currently analyzing: F_R_GBM691
Currently analyzing: G_P_GBM454
Currently analyzing: G_R_GBM833
Currently analyzing: H_P_GBM460
Currently analyzing: H_R_GBM492
Currently analyzing: I_P_GBM440
Currently analyzing: I_R_GBM532
Currently analyzing: J_P_GBM401
Currently analyzing: J_RR_GBM551
Currently analyzing: J_R_GBM498
Currently analyzing: K_P_GBM529
Currently analyzing: K_R_GBM832
Currently analyzing: L_P_GBM618
Currently analyzing: L_R_SMTB152
Currently analyzing: M_P_GBM672
Currently analyzing: M_R_GBM828
Currently analyzing: N_P_BT2013110
Currently analyzing: N_R_GBM745
Currently analyzing: O_P_GBM703
Currently analyzing: O_R_SMTB781
Currently analyzing: P_P_SMTB123
Currently analyzing: P_R_SMTB26

In [16]:
small_del_gene_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/all/small_dels/'

# Write each sample's full small-deletion overlap table to its own CSV file.
for df_name in small_del_gene_list_names:

    output_file = small_del_gene_list_path + df_name + '.csv'
    temp_df = globals()[df_name]
    temp_df.to_csv(output_file, sep=',', index=False)

# Large dels

In [33]:
## DELs
somatic_large_DEL_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel_final/large_svs/DEL'

# The files are read by bare name below, so the working directory must be the data folder.
os.chdir(somatic_large_DEL_path)

somatic_large_DEL_wo_highoverlap_df_names = []

# NOTE(review): the 'DS' substring filter presumably exists to skip .DS_Store;
# it would also skip any data file whose name contains 'DS' — confirm.
temp_files = sorted(name for name in os.listdir(somatic_large_DEL_path) if 'DS' not in name)

for file_name in temp_files:

    # Drop the last four characters of the file name to get the table name,
    # and expose the table as a global variable under that name.
    table_name = file_name[:-4]
    globals()[table_name] = pd.read_csv(file_name)
    somatic_large_DEL_wo_highoverlap_df_names.append(table_name)

In [34]:
large_del_gene_list_names = []

for large_del_df_name in somatic_large_DEL_wo_highoverlap_df_names:

    # The sample ID is the table name minus its fixed 44-character suffix.
    sample_id = large_del_df_name[:-44]
    print('Currently analyzing: ' + sample_id)
    temp_df = globals()[large_del_df_name]

    # Collect the per-variant overlap tables and concatenate them once:
    # DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated table on every call.
    temp_overlaps = []

    for index, row in temp_df.iterrows():

        temp_overlap = overlap_func(row['CHROM'], row['POS'], row['END'], index,
                                    'DEL', sample_id, gencode)
        temp_overlaps.append(temp_overlap)

    # pd.concat raises on an empty list, so a sample without variants keeps
    # the empty table it got before.
    temp_output_df = pd.concat(temp_overlaps) if temp_overlaps else pd.DataFrame()

    temp_gene_list_name = sample_id + '_large_del_gene_list'
    large_del_gene_list_names.append(temp_gene_list_name)

    # Expose the result as a global variable named after the sample.
    globals()[temp_gene_list_name] = temp_output_df

Currently analyzing: A_RR_GBM809
Currently analyzing: A_R_GBM607
Currently analyzing: B_P_GBM593
Currently analyzing: B_R_GBM898
Currently analyzing: C_P_GBM577
Currently analyzing: C_R_GBM625
Currently analyzing: E_RR_GBM937
Currently analyzing: E_R_GBM781
Currently analyzing: F_P_GBM620
Currently analyzing: F_R_GBM691
Currently analyzing: G_P_GBM454
Currently analyzing: G_R_GBM833
Currently analyzing: H_P_GBM460
Currently analyzing: H_R_GBM492
Currently analyzing: I_P_GBM440
Currently analyzing: I_R_GBM532
Currently analyzing: J_P_GBM401
Currently analyzing: J_RR_GBM551
Currently analyzing: J_R_GBM498
Currently analyzing: K_P_GBM529
Currently analyzing: K_R_GBM832
Currently analyzing: L_P_GBM618
Currently analyzing: L_R_SMTB152
Currently analyzing: M_P_GBM672
Currently analyzing: M_R_GBM828
Currently analyzing: N_P_BT2013110
Currently analyzing: N_R_GBM745
Currently analyzing: O_P_GBM703
Currently analyzing: O_R_SMTB781
Currently analyzing: P_P_SMTB123
Currently analyzing: P_R_SMTB26

In [35]:
large_del_gene_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/all/large_svs/DEL/'

# Write each sample's full large-deletion overlap table to its own CSV file.
for df_name in large_del_gene_list_names:

    output_file = large_del_gene_list_path + df_name + '.csv'
    temp_df = globals()[df_name]
    temp_df.to_csv(output_file, sep=',', index=False)

# Large dups

In [36]:
## DUPs
somatic_large_DUP_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel_final/large_svs/DUP'

# The files are read by bare name below, so the working directory must be the data folder.
os.chdir(somatic_large_DUP_path)

somatic_large_DUP_wo_highoverlap_df_names = []

# NOTE(review): the 'DS' substring filter presumably exists to skip .DS_Store;
# it would also skip any data file whose name contains 'DS' — confirm.
temp_files = sorted(name for name in os.listdir(somatic_large_DUP_path) if 'DS' not in name)

for file_name in temp_files:

    # Drop the last four characters of the file name to get the table name,
    # and expose the table as a global variable under that name.
    table_name = file_name[:-4]
    globals()[table_name] = pd.read_csv(file_name)
    somatic_large_DUP_wo_highoverlap_df_names.append(table_name)

In [37]:
large_dup_gene_list_names = []

for large_dup_df_name in somatic_large_DUP_wo_highoverlap_df_names:

    # The sample ID is the table name minus its fixed 44-character suffix.
    sample_id = large_dup_df_name[:-44]
    print('Currently analyzing: ' + sample_id)
    temp_df = globals()[large_dup_df_name]

    # Collect the per-variant overlap tables and concatenate them once:
    # DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated table on every call.
    temp_overlaps = []

    for index, row in temp_df.iterrows():

        temp_overlap = overlap_func(row['CHROM'], row['POS'], row['END'], index,
                                    'DUP', sample_id, gencode)
        temp_overlaps.append(temp_overlap)

    # pd.concat raises on an empty list, so a sample without variants keeps
    # the empty table it got before.
    temp_output_df = pd.concat(temp_overlaps) if temp_overlaps else pd.DataFrame()

    temp_gene_list_name = sample_id + '_large_dup_gene_list'
    large_dup_gene_list_names.append(temp_gene_list_name)

    # Expose the result as a global variable named after the sample.
    globals()[temp_gene_list_name] = temp_output_df

Currently analyzing: A_RR_GBM809
Currently analyzing: A_R_GBM607
Currently analyzing: B_P_GBM593
Currently analyzing: B_R_GBM898
Currently analyzing: C_P_GBM577
Currently analyzing: C_R_GBM625
Currently analyzing: E_RR_GBM937
Currently analyzing: E_R_GBM781
Currently analyzing: F_P_GBM620
Currently analyzing: F_R_GBM691
Currently analyzing: G_P_GBM454
Currently analyzing: G_R_GBM833
Currently analyzing: H_P_GBM460
Currently analyzing: H_R_GBM492
Currently analyzing: I_P_GBM440
Currently analyzing: I_R_GBM532
Currently analyzing: J_P_GBM401
Currently analyzing: J_RR_GBM551
Currently analyzing: J_R_GBM498
Currently analyzing: K_P_GBM529
Currently analyzing: K_R_GBM832
Currently analyzing: L_P_GBM618
Currently analyzing: L_R_SMTB152
Currently analyzing: M_P_GBM672
Currently analyzing: M_R_GBM828
Currently analyzing: N_P_BT2013110
Currently analyzing: N_R_GBM745
Currently analyzing: O_P_GBM703
Currently analyzing: O_R_SMTB781
Currently analyzing: P_P_SMTB123
Currently analyzing: P_R_SMTB26

In [38]:
large_dup_gene_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/all/large_svs/DUP/'

# Write each sample's full large-duplication overlap table to its own CSV file.
for df_name in large_dup_gene_list_names:

    output_file = large_dup_gene_list_path + df_name + '.csv'
    temp_df = globals()[df_name]
    temp_df.to_csv(output_file, sep=',', index=False)

# Large invs

In [39]:
## INVs
somatic_large_INV_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_glioma_sv_processed/somatic_panel_final/large_svs/INV'

# The files are read by bare name below, so the working directory must be the data folder.
os.chdir(somatic_large_INV_path)

somatic_large_INV_wo_highoverlap_df_names = []

# NOTE(review): the 'DS' substring filter presumably exists to skip .DS_Store;
# it would also skip any data file whose name contains 'DS' — confirm.
temp_files = sorted(name for name in os.listdir(somatic_large_INV_path) if 'DS' not in name)

for file_name in temp_files:

    # Drop the last four characters of the file name to get the table name,
    # and expose the table as a global variable under that name.
    table_name = file_name[:-4]
    globals()[table_name] = pd.read_csv(file_name)
    somatic_large_INV_wo_highoverlap_df_names.append(table_name)

In [40]:
large_inv_gene_list_names = []

for large_inv_df_name in somatic_large_INV_wo_highoverlap_df_names:

    # The sample ID is the table name minus its fixed 44-character suffix.
    sample_id = large_inv_df_name[:-44]
    print('Currently analyzing: ' + sample_id)
    temp_df = globals()[large_inv_df_name]

    # Collect the per-variant overlap tables and concatenate them once:
    # DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated table on every call.
    temp_overlaps = []

    for index, row in temp_df.iterrows():

        temp_overlap = overlap_func(row['CHROM'], row['POS'], row['END'], index,
                                    'INV', sample_id, gencode)
        temp_overlaps.append(temp_overlap)

    # pd.concat raises on an empty list, so a sample without variants keeps
    # the empty table it got before.
    temp_output_df = pd.concat(temp_overlaps) if temp_overlaps else pd.DataFrame()

    temp_gene_list_name = sample_id + '_large_inv_gene_list'
    large_inv_gene_list_names.append(temp_gene_list_name)

    # Expose the result as a global variable named after the sample.
    globals()[temp_gene_list_name] = temp_output_df

Currently analyzing: A_RR_GBM809
Currently analyzing: A_R_GBM607
Currently analyzing: B_P_GBM593
Currently analyzing: B_R_GBM898
Currently analyzing: C_P_GBM577
Currently analyzing: C_R_GBM625
Currently analyzing: E_RR_GBM937
Currently analyzing: E_R_GBM781
Currently analyzing: F_P_GBM620
Currently analyzing: F_R_GBM691
Currently analyzing: G_P_GBM454
Currently analyzing: G_R_GBM833
Currently analyzing: H_P_GBM460
Currently analyzing: H_R_GBM492
Currently analyzing: I_P_GBM440
Currently analyzing: I_R_GBM532
Currently analyzing: J_P_GBM401
Currently analyzing: J_RR_GBM551
Currently analyzing: J_R_GBM498
Currently analyzing: K_P_GBM529
Currently analyzing: K_R_GBM832
Currently analyzing: L_P_GBM618
Currently analyzing: L_R_SMTB152
Currently analyzing: M_P_GBM672
Currently analyzing: M_R_GBM828
Currently analyzing: N_P_BT2013110
Currently analyzing: N_R_GBM745
Currently analyzing: O_P_GBM703
Currently analyzing: O_R_SMTB781
Currently analyzing: P_P_SMTB123
Currently analyzing: P_R_SMTB26

In [41]:
large_inv_gene_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/all/large_svs/INV/'

# Write each sample's full large-inversion overlap table to its own CSV file.
for df_name in large_inv_gene_list_names:

    output_file = large_inv_gene_list_path + df_name + '.csv'
    temp_df = globals()[df_name]
    temp_df.to_csv(output_file, sep=',', index=False)

# COMBINED

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not executed.
# As written it would, for each sample, stack the small-del, large-del, large-dup and
# large-inv gene lists into one table and write it to a single CSV in the combined folder.
# NOTE(review): it relies on DataFrame.append, which was removed in pandas 2.0 —
# switch to pd.concat before re-enabling.
'''

combined_gene_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/unique_gene_only_annotated/combined/'

for df_name in small_del_gene_list_names:
    
    temp_df = globals()[df_name].copy()
    temp_df = temp_df.append(globals()[df_name[:-20]+'_large_del_gene_list'], ignore_index=True)
    temp_df = temp_df.append(globals()[df_name[:-20]+'_large_dup_gene_list'], ignore_index=True)
    temp_df = temp_df.append(globals()[df_name[:-20]+'_large_inv_gene_list'], ignore_index=True)

    temp_df.to_csv((combined_gene_list_path + df_name[:-20] + '.csv'), index=False, sep=',')
    
'''

# GENE ONLY

In [17]:
small_del_gene_only_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/gene_only/small_dels/'

# Write, per sample, only the rows whose feature is 'gene'.
for df_name in small_del_gene_list_names:

    temp_df = globals()[df_name]

    # A sample without any overlaps is an empty table with no 'feature' column;
    # filtering it would raise KeyError, so it is written out unchanged
    # (same guard as the large-SV gene-only cells).
    if len(temp_df) != 0:

        temp_df_gene_only = temp_df[temp_df['feature'] == 'gene']

    else:

        temp_df_gene_only = temp_df

    # Strip the 20-character '_small_del_gene_list' suffix to recover the sample ID.
    temp_df_gene_only.to_csv((small_del_gene_only_list_path + df_name[:-20] + '_small_del_gene_list.csv'), index=False, sep=',')

In [42]:
large_del_gene_only_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/gene_only/large_svs/DEL/'

# Write, per sample, only the rows whose feature is 'gene'.
for df_name in large_del_gene_list_names:

    temp_df = globals()[df_name]

    # Empty tables are written out unfiltered (presumably because they carry
    # no 'feature' column to filter on).
    if len(temp_df) == 0:
        temp_df_gene_only = temp_df
    else:
        temp_df_gene_only = temp_df[temp_df['feature'] == 'gene']

    # Strip the 20-character list suffix from the table name to rebuild the file name.
    output_file = large_del_gene_only_list_path + df_name[:-20] + '_large_del_gene_list.csv'
    temp_df_gene_only.to_csv(output_file, sep=',', index=False)

In [43]:
large_dup_gene_only_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/gene_only/large_svs/DUP/'

# Write, per sample, only the rows whose feature is 'gene'.
for df_name in large_dup_gene_list_names:

    temp_df = globals()[df_name]

    # Empty tables are written out unfiltered (presumably because they carry
    # no 'feature' column to filter on).
    if len(temp_df) == 0:
        temp_df_gene_only = temp_df
    else:
        temp_df_gene_only = temp_df[temp_df['feature'] == 'gene']

    # Strip the 20-character list suffix from the table name to rebuild the file name.
    output_file = large_dup_gene_only_list_path + df_name[:-20] + '_large_dup_gene_list.csv'
    temp_df_gene_only.to_csv(output_file, sep=',', index=False)

In [44]:
large_inv_gene_only_list_path = '/Users/ryanyutian/Desktop/new_panel_TRI_Brain_gene_list/gene_only/large_svs/INV/'

# Write, per sample, only the rows whose feature is 'gene'.
for df_name in large_inv_gene_list_names:

    temp_df = globals()[df_name]

    # Empty tables are written out unfiltered (presumably because they carry
    # no 'feature' column to filter on).
    if len(temp_df) == 0:
        temp_df_gene_only = temp_df
    else:
        temp_df_gene_only = temp_df[temp_df['feature'] == 'gene']

    # Strip the 20-character list suffix from the table name to rebuild the file name.
    output_file = large_inv_gene_only_list_path + df_name[:-20] + '_large_inv_gene_list.csv'
    temp_df_gene_only.to_csv(output_file, sep=',', index=False)

# INSPECTION

In [33]:
# All annotation rows whose gene name contains 'ERBB4'.
gencode.loc[gencode['gene_name'].str.contains('ERBB4')]

Unnamed: 0,chr,source,feature,start,end,score,strand,frame,attribute,parent,gene_id,gene_name,gene_type,gene_level
508754,chr2,HAVANA,gene,211375717,212538841,.,-,.,ID=ENSG00000178568.15;gene_id=ENSG00000178568....,,ENSG00000178568.15,ERBB4,protein_coding,2
508755,chr2,ENSEMBL,transcript,211375717,212124907,.,-,.,ID=ENST00000402597.6;Parent=ENSG00000178568.15...,ENSG00000178568.15,ENSG00000178568.15,ERBB4,protein_coding,3
508756,chr2,ENSEMBL,exon,212124752,212124907,.,-,.,ID=exon:ENST00000402597.6:1;Parent=ENST0000040...,ENST00000402597.6,ENSG00000178568.15,ERBB4,protein_coding,3
508757,chr2,ENSEMBL,CDS,212124752,212124907,.,-,0,ID=CDS:ENST00000402597.6;Parent=ENST0000040259...,ENST00000402597.6,ENSG00000178568.15,ERBB4,protein_coding,3
508758,chr2,ENSEMBL,exon,211947430,211947616,.,-,.,ID=exon:ENST00000402597.6:2;Parent=ENST0000040...,ENST00000402597.6,ENSG00000178568.15,ERBB4,protein_coding,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509011,chr2,HAVANA,five_prime_UTR,212429159,212429247,.,-,.,ID=UTR5:ENST00000435846.1;Parent=ENST000004358...,ENST00000435846.1,ENSG00000178568.15,ERBB4,protein_coding,2
509012,chr2,HAVANA,five_prime_UTR,212124809,212124903,.,-,.,ID=UTR5:ENST00000435846.1;Parent=ENST000004358...,ENST00000435846.1,ENSG00000178568.15,ERBB4,protein_coding,2
509013,chr2,HAVANA,transcript,212124584,212125181,.,-,.,ID=ENST00000459774.1;Parent=ENSG00000178568.15...,ENSG00000178568.15,ENSG00000178568.15,ERBB4,protein_coding,2
509014,chr2,HAVANA,exon,212125058,212125181,.,-,.,ID=exon:ENST00000459774.1:1;Parent=ENST0000045...,ENST00000459774.1,ENSG00000178568.15,ERBB4,protein_coding,2


In [34]:
# Stack the four per-SV-type overlap tables of sample A_R_GBM607 into one table
# with a fresh 0..n-1 index. pd.concat replaces the chained DataFrame.append
# calls, which were removed in pandas 2.0; it returns a new table, so the
# source tables are left untouched.
A_R_GBM607_df = pd.concat(
    [
        A_R_GBM607_small_del_gene_list,
        A_R_GBM607_large_del_gene_list,
        A_R_GBM607_large_dup_gene_list,
        A_R_GBM607_large_inv_gene_list,
    ],
    ignore_index=True,
)

In [35]:
# Keep only the gene-level rows of the combined A_R_GBM607 table.
is_gene_row = A_R_GBM607_df['feature'] == 'gene'
A_R_GBM607_df_gene_only = A_R_GBM607_df[is_gene_row]

In [36]:
# Rows of the combined A_R_GBM607 table whose gene name contains 'ERBB4'.
A_R_GBM607_df.loc[A_R_GBM607_df['gene_name'].str.contains('ERBB4')]

Unnamed: 0,chr,source,feature,start,end,score,strand,frame,attribute,parent,gene_id,gene_name,gene_type,gene_level,percent_overlap,input_start,input_end,input_ind,input_sv_type,input_sample_id
