In [1]:
import pycircos
import collections
import matplotlib.pyplot as plt

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
pd.options.display.max_rows = 200

In [4]:
from intervaltree import Interval, IntervalTree

def collapse_intervals(df):
    # Create an interval tree from DataFrame intervals, incrementing end point by 1
    tree = IntervalTree(Interval(row.POS, row.END + 1, 1) for row in df.itertuples())

    # Merge overlapping intervals in the tree
    tree.merge_overlaps()

    # Re-create DataFrame from merged intervals, decrementing end point by 1
    collapsed_df = pd.DataFrame([(iv.begin, iv.end - 1) for iv in tree], columns=['POS', 'END'])

    return collapsed_df

def heatmap_data(dataframes, chrom):
    # Collapse intervals in each DataFrame and combine all into one
    combined_df = pd.concat([collapse_intervals(df) for df in dataframes])
    
    if len(combined_df) == 0:
        return pd.DataFrame(columns=['chr', 'start', 'end', 'value'])
    
    # Create an array for the whole range
    min_start = combined_df['POS'].min()
    max_end = combined_df['END'].max()
    all_range = np.zeros(max_end - min_start + 1)
    
    # Iterate through each range and increment count
    for _, row in combined_df.iterrows():
        all_range[row['POS'] - min_start : row['END'] - min_start + 1] += 1
    
    # Create the output data
    output_data = []
    current_value = all_range[0]
    start_range = min_start
    for i in range(1, len(all_range)):
        if all_range[i] != current_value:
            output_data.append((chrom, start_range, min_start + i - 1, current_value))
            start_range = min_start + i
            current_value = all_range[i]
    
    # Add the last range
    output_data.append((chrom, start_range, max_end, current_value))
    
    # Convert to DataFrame
    output_df = pd.DataFrame(output_data, columns=['chr', 'start', 'end', 'value'])
    
    return output_df


In [5]:
chromosomes = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', \
               'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', \
               'chr21', 'chr22', 'chrX', 'chrY']

## Large deletions

In [6]:
### Load filtered somatic large deletions ###

##
somatic_large_DEL_path = '/Users/ryanyutian/Desktop/Manuscript/filtered_sv/DEL'

somatic_large_DEL_filtered_df_names = []

os.chdir(somatic_large_DEL_path)
temp_files = sorted([i for i in os.listdir(somatic_large_DEL_path) if 'DS' not in i])

for file_name in temp_files:
    
    globals()[file_name[:-4]] = pd.read_csv(file_name)
    somatic_large_DEL_filtered_df_names.append(file_name[:-4])

In [7]:
somatic_large_DEL_filtered_df_names

['A_RR_GBM809_DEL',
 'A_R_GBM607_DEL',
 'B_P_GBM593_DEL',
 'B_R_GBM898_DEL',
 'C_RR_GBM937_DEL',
 'C_R_GBM781_DEL',
 'D_P_GBM620_DEL',
 'D_R_GBM691_DEL',
 'E_P_GBM454_DEL',
 'E_R_GBM833_DEL',
 'F_P_GBM460_DEL',
 'F_R_GBM492_DEL',
 'G_P_GBM401_DEL',
 'G_RR_GBM551_DEL',
 'G_R_GBM498_DEL',
 'H_P_GBM529_DEL',
 'H_R_GBM832_DEL',
 'I_P_BT2013110_DEL',
 'I_R_GBM745_DEL',
 'J_P_GBM703_DEL',
 'J_R_SMTB781_DEL',
 'X_P_GBM440_DEL',
 'X_P_GBM577_DEL',
 'X_P_GBM618_DEL',
 'X_P_GBM672_DEL',
 'X_P_SMTB123_DEL',
 'X_R_GBM945_DEL',
 'X_R_SMTB135_DEL',
 'X_R_SMTB241_DEL',
 'X_R_SMTB302_DEL',
 'X_R_SMTB814_DEL']

In [10]:
CIRCOS_indv_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/individual_data/DEL/'
non_zero_ext = 'nonzero/'

for df_name in somatic_large_DEL_filtered_df_names:
    
    print('Currently analyzing: ' + df_name[:-4])
    temp_df = globals()[df_name]
    
    temp_output_large_DEL_heatmap = pd.DataFrame(columns=['chr', 'start', 'end', 'value'])
    
    for chrom in chromosomes:
        
        temp_matching_chrom_df_list = [temp_df[temp_df['CHROM']==chrom]]

        if len(temp_df[temp_df['CHROM']==chrom]) == 0:
            
            temp_output_large_DEL_heatmap.loc[len(temp_output_large_DEL_heatmap)] = [chrom, 0, 1, 0.01]
            
        else:
            
            temp_output_df = heatmap_data(temp_matching_chrom_df_list, chrom)

            temp_output_large_DEL_heatmap = \
            pd.concat([temp_output_large_DEL_heatmap.reset_index(drop=True), temp_output_df.reset_index(drop=True)], ignore_index=True)
        
    print('max: ' + str(max(temp_output_large_DEL_heatmap['value'])))
    
    temp_output_large_DEL_heatmap.to_csv((CIRCOS_indv_sample_path + \
                                          df_name[:-4] + '_large_DEL_heatmap_raw.csv'), index=False, sep=',')
    
    temp_output_large_DEL_heatmap_nonzero = temp_output_large_DEL_heatmap[temp_output_large_DEL_heatmap['value']!=0]
    
    temp_output_large_DEL_heatmap_nonzero.to_csv((CIRCOS_indv_sample_path + non_zero_ext + \
                                                  df_name[:-4] + '_large_DEL_heatmap_nonzero.csv'), \
                                                  index=False, sep=',')

Currently analyzing: A_RR_GBM809
max: 1.0
Currently analyzing: A_R_GBM607
max: 1.0
Currently analyzing: B_P_GBM593
max: 1.0
Currently analyzing: B_R_GBM898
max: 1.0
Currently analyzing: C_RR_GBM937
max: 1.0
Currently analyzing: C_R_GBM781
max: 1.0
Currently analyzing: D_P_GBM620
max: 1.0
Currently analyzing: D_R_GBM691
max: 1.0
Currently analyzing: E_P_GBM454
max: 1.0
Currently analyzing: E_R_GBM833
max: 1.0
Currently analyzing: F_P_GBM460
max: 1.0
Currently analyzing: F_R_GBM492
max: 1.0
Currently analyzing: G_P_GBM401
max: 1.0
Currently analyzing: G_RR_GBM551
max: 1.0
Currently analyzing: G_R_GBM498
max: 1.0
Currently analyzing: H_P_GBM529
max: 1.0
Currently analyzing: H_R_GBM832
max: 1.0
Currently analyzing: I_P_BT2013110
max: 1.0
Currently analyzing: I_R_GBM745
max: 1.0
Currently analyzing: J_P_GBM703
max: 1.0
Currently analyzing: J_R_SMTB781
max: 1.0
Currently analyzing: X_P_GBM440
max: 1.0
Currently analyzing: X_P_GBM577
max: 1.0
Currently analyzing: X_P_GBM618
max: 1.0
Currently

## Large duplications

In [11]:
### Load filtered somatic large duplications ###

##
somatic_large_DUP_path = '/Users/ryanyutian/Desktop/Manuscript/filtered_sv/DUP'

somatic_large_DUP_filtered_df_names = []

os.chdir(somatic_large_DUP_path)
temp_files = sorted([i for i in os.listdir(somatic_large_DUP_path) if 'DS' not in i])

for file_name in temp_files:
    
    globals()[file_name[:-4]] = pd.read_csv(file_name)
    somatic_large_DUP_filtered_df_names.append(file_name[:-4])

In [12]:
CIRCOS_indv_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/individual_data/DUP/'
non_zero_ext = 'nonzero/'

for df_name in somatic_large_DUP_filtered_df_names:
    
    print('Currently analyzing: ' + df_name[:-44])
    temp_df = globals()[df_name]
    
    temp_output_large_DUP_heatmap = pd.DataFrame(columns=['chr', 'start', 'end', 'value'])
    
    for chrom in chromosomes:
        
        temp_matching_chrom_df_list = [temp_df[temp_df['CHROM']==chrom]]

        if len(temp_df[temp_df['CHROM']==chrom]) == 0:
            
            temp_output_large_DUP_heatmap.loc[len(temp_output_large_DUP_heatmap)] = [chrom, 0, 1, 0.01]
            
        else:
            
            temp_output_df = heatmap_data(temp_matching_chrom_df_list, chrom)

            temp_output_large_DUP_heatmap = \
            pd.concat([temp_output_large_DUP_heatmap.reset_index(drop=True), temp_output_df.reset_index(drop=True)], ignore_index=True)
        
    print('max: ' + str(max(temp_output_large_DUP_heatmap['value'])))
    
    condition = ~((temp_output_large_DUP_heatmap['chr'] == 'chr12') & 
                  (temp_output_large_DUP_heatmap['start'] >= 17648133) & 
                  (temp_output_large_DUP_heatmap['end'] <= 17987890))
    temp_output_large_DUP_heatmap = temp_output_large_DUP_heatmap[condition].copy()
    temp_output_large_DUP_heatmap = temp_output_large_DUP_heatmap.reset_index(drop=True)
    
    temp_output_large_DUP_heatmap.loc[len(temp_output_large_DUP_heatmap)] = ['chr12', 0, 1, 0.01]
    
    temp_output_large_DUP_heatmap.to_csv((CIRCOS_indv_sample_path + \
                                          df_name[:-44] + '_large_DUP_heatmap_raw.csv'), index=False, sep=',')
    
    temp_output_large_DUP_heatmap_nonzero = temp_output_large_DUP_heatmap[temp_output_large_DUP_heatmap['value']!=0]
    
    temp_output_large_DUP_heatmap_nonzero.to_csv((CIRCOS_indv_sample_path + non_zero_ext + \
                                                  df_name[:-44] + '_large_DUP_heatmap_nonzero.csv'), \
                                                  index=False, sep=',')

Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0
Currently analyzing: 
max: 1.0


In [13]:
temp_output_large_DUP_heatmap_nonzero

Unnamed: 0,chr,start,end,value
0,chr1,65046886,65106122,1.0
2,chr1,70881265,71060288,1.0
4,chr1,121830000,121940000,1.0
5,chr2,0,1,0.01
6,chr3,50134339,50196563,1.0
8,chr3,128661552,128696094,1.0
9,chr4,52000000,52590000,1.0
11,chr4,54220000,54690000,1.0
12,chr5,56490016,56810352,1.0
14,chr5,142413123,142455021,1.0
