In [1]:
import pycircos
import collections
import matplotlib.pyplot as plt

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
# Allow up to 200 rows to be printed before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 200)

In [4]:
from intervaltree import Interval, IntervalTree

def collapse_intervals(df):
    """Merge the overlapping ``[POS, END]`` intervals of one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``POS`` and ``END`` columns; both bounds are treated as
        inclusive.

    Returns
    -------
    pandas.DataFrame
        Columns ``POS`` and ``END`` (inclusive bounds), one row per merged
        interval, ordered by ``POS``.
    """
    # The tree is built with END + 1 as the upper bound and the shift is undone
    # on the way out, so callers keep seeing inclusive END values.
    tree = IntervalTree(
        Interval(pos, end + 1) for pos, end in zip(df['POS'], df['END'])
    )

    # Fuse overlapping intervals in place.
    tree.merge_overlaps()

    # Iterating the tree gives no particular order; sort so the returned frame
    # is reproducible from run to run.
    merged = sorted((iv.begin, iv.end - 1) for iv in tree)

    return pd.DataFrame(merged, columns=['POS', 'END'])

def heatmap_data(dataframes, chrom):
    """Count, per segment, how many of the input frames cover it.

    Every frame in ``dataframes`` is first reduced with ``collapse_intervals``
    so that it contributes at most 1 to any position. The span from the
    smallest ``POS`` to the largest ``END`` is then cut into maximal runs of
    constant count, including runs whose count is 0.

    Parameters
    ----------
    dataframes : iterable of pandas.DataFrame
        Frames with ``POS`` / ``END`` columns (inclusive bounds).
    chrom : str
        Label copied into the ``chr`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr``, ``start``, ``end`` (inclusive) and ``value`` (count as
        float), ordered by ``start``. Empty when there is nothing to count.
    """
    columns = ['chr', 'start', 'end', 'value']

    collapsed = [collapse_intervals(df) for df in dataframes]
    # pd.concat raises on an empty list; treat "no frames" like "no intervals".
    if not collapsed:
        return pd.DataFrame(columns=columns)

    combined_df = pd.concat(collapsed)
    if len(combined_df) == 0:
        return pd.DataFrame(columns=columns)

    # Sweep over interval boundaries instead of allocating one counter per base:
    # an interval adds 1 at POS and removes 1 at END + 1 (the first position it
    # no longer covers). Cost depends on the number of intervals, not on the
    # length of the covered span.
    starts = combined_df['POS'].to_numpy(dtype=np.int64)
    stops = combined_df['END'].to_numpy(dtype=np.int64) + 1

    deltas = pd.Series(
        np.concatenate([np.ones(len(starts), dtype=np.int64),
                        -np.ones(len(stops), dtype=np.int64)]),
        index=np.concatenate([starts, stops]),
    )

    # Net change per position, sorted by position. Positions where gains and
    # losses cancel are dropped so neighbouring segments with the same count
    # stay merged into one row.
    net = deltas.groupby(level=0).sum()
    net = net[net != 0]

    breakpoints = net.index.to_numpy()
    coverage = net.cumsum().to_numpy()

    # The last breakpoint is one past the largest END (count falls back to 0);
    # it only closes the final segment and does not start a new one.
    n_segments = len(breakpoints) - 1
    return pd.DataFrame(
        {
            'chr': [chrom] * n_segments,
            'start': breakpoints[:-1],
            'end': breakpoints[1:] - 1,
            'value': coverage[:-1].astype(float),
        },
        columns=columns,
    )


In [5]:
# chr1 through chr22 followed by chrX and chrY, in that order.
chromosomes = [f'chr{number}' for number in range(1, 23)] + ['chrX', 'chrY']

## Large deletions

In [6]:
### Load filtered somatic large deletions ###

# Folder whose entries are read below, one table per file.
somatic_large_DEL_path = '/Users/ryanyutian/Desktop/Manuscript/filtered_sv/DEL'

somatic_large_DEL_filtered_df_names = []

# Files are opened by bare name, so the folder is made the working directory.
os.chdir(somatic_large_DEL_path)

# Entries whose name contains 'DS' are skipped; the rest are taken in sorted order.
temp_files = sorted(filter(lambda entry: 'DS' not in entry, os.listdir(somatic_large_DEL_path)))

for file_name in temp_files:

    # Each table is stored as a global named after the file minus its last
    # four characters; the same name is recorded for later look-ups.
    globals().update({file_name[:-4]: pd.read_csv(file_name)})
    somatic_large_DEL_filtered_df_names += [file_name[:-4]]

In [7]:
# Names under which the deletion tables were stored, in sorted file order.
somatic_large_DEL_filtered_df_names

['A_RR_GBM809_DEL',
 'A_R_GBM607_DEL',
 'B_P_GBM593_DEL',
 'B_R_GBM898_DEL',
 'C_RR_GBM937_DEL',
 'C_R_GBM781_DEL',
 'D_P_GBM620_DEL',
 'D_R_GBM691_DEL',
 'E_P_GBM454_DEL',
 'E_R_GBM833_DEL',
 'F_P_GBM460_DEL',
 'F_R_GBM492_DEL',
 'G_P_GBM401_DEL',
 'G_RR_GBM551_DEL',
 'G_R_GBM498_DEL',
 'H_P_GBM529_DEL',
 'H_R_GBM832_DEL',
 'I_P_BT2013110_DEL',
 'I_R_GBM745_DEL',
 'J_P_GBM703_DEL',
 'J_R_SMTB781_DEL',
 'X_P_GBM440_DEL',
 'X_P_GBM577_DEL',
 'X_P_GBM618_DEL',
 'X_P_GBM672_DEL',
 'X_P_SMTB123_DEL',
 'X_R_GBM945_DEL',
 'X_R_SMTB135_DEL',
 'X_R_SMTB241_DEL',
 'X_R_SMTB302_DEL',
 'X_R_SMTB814_DEL']

In [8]:
## Primary

# Collect one segment table per chromosome and concatenate once at the end,
# rather than re-copying the accumulated frame on every iteration.
temp_heatmap_parts = []

for chrom in chromosomes:

    # Rows on this chromosome from every primary table (a table is primary when
    # the second '_'-separated field of its name is 'P').
    temp_matching_chrom_df_list = [
        sample_df[sample_df['CHROM'] == chrom]
        for sample_df in (globals()[name] for name in somatic_large_DEL_filtered_df_names
                          if name.split('_')[1] == 'P')
    ]
    temp_output_df = heatmap_data(temp_matching_chrom_df_list, chrom)

    # Chromosomes without any segment contribute nothing; leaving their empty
    # frames out keeps the column dtypes of the real data intact.
    if len(temp_output_df) > 0:
        temp_heatmap_parts.append(temp_output_df)

if temp_heatmap_parts:
    primary_large_DEL_heatmap = pd.concat(temp_heatmap_parts, ignore_index=True)
else:
    primary_large_DEL_heatmap = pd.DataFrame(columns=['chr', 'start', 'end', 'value'])

In [9]:
# Inspect the per-chromosome segment counts gathered for the primary samples.
primary_large_DEL_heatmap

Unnamed: 0,chr,start,end,value
0,chr1,690000,2650000,1.0
1,chr1,2650001,4013220,0.0
2,chr1,4013221,4188641,1.0
3,chr1,4188642,13369999,0.0
4,chr1,13370000,16510000,1.0
...,...,...,...,...
394,chrX,155825948,155853816,1.0
395,chrX,155853817,155858906,2.0
396,chrX,155858907,155985002,3.0
397,chrX,155985003,155985024,2.0


In [10]:
# Largest segment count in the primary deletion table.
max(primary_large_DEL_heatmap['value'])

8.0

In [11]:
# Number of primary deletion tables: names whose second '_'-separated field is 'P'.
sum(1 for name in somatic_large_DEL_filtered_df_names if name.split('_')[1] == 'P')

13

In [12]:
CIRCOS_primary_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/primary'

# The directory string has no trailing separator, so '+' concatenation wrote
# '.../primarylarge_DEL_...csv' beside the folder instead of a file inside it.
# os.path.join inserts the separator; the folder is created if it is missing.
os.makedirs(CIRCOS_primary_sample_path, exist_ok=True)
primary_large_DEL_heatmap.to_csv(
    os.path.join(CIRCOS_primary_sample_path, 'large_DEL_heatmap_primary_sample_raw.csv'),
    index=False, sep=',')

In [13]:
# Drop the segments whose count is zero.
primary_large_DEL_heatmap_nonzero = primary_large_DEL_heatmap.loc[primary_large_DEL_heatmap['value'].ne(0)]

In [14]:
CIRCOS_primary_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/primary'

# The directory string has no trailing separator, so '+' concatenation wrote
# '.../primarylarge_DEL_...csv' beside the folder instead of a file inside it.
# os.path.join inserts the separator; the folder is created if it is missing.
os.makedirs(CIRCOS_primary_sample_path, exist_ok=True)
primary_large_DEL_heatmap_nonzero.to_csv(
    os.path.join(CIRCOS_primary_sample_path, 'large_DEL_heatmap_primary_sample_nonzero.csv'),
    index=False, sep=',')

In [15]:
# Convert counts to the fraction of primary samples. The denominator is derived
# from the loaded sample names (second '_'-separated field 'P') instead of a
# hard-coded 13, so it cannot drift when the input folder changes.
primary_large_DEL_sample_count = sum(
    1 for name in somatic_large_DEL_filtered_df_names if name.split('_')[1] == 'P')

primary_large_DEL_heatmap_percentage = primary_large_DEL_heatmap_nonzero.copy()
primary_large_DEL_heatmap_percentage['value'] = (
    primary_large_DEL_heatmap_percentage['value'] / primary_large_DEL_sample_count)

In [16]:
CIRCOS_primary_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/primary'

# The directory string has no trailing separator, so '+' concatenation wrote
# '.../primarylarge_DEL_...csv' beside the folder instead of a file inside it.
# os.path.join inserts the separator; the folder is created if it is missing.
os.makedirs(CIRCOS_primary_sample_path, exist_ok=True)
primary_large_DEL_heatmap_percentage.to_csv(
    os.path.join(CIRCOS_primary_sample_path, 'large_DEL_heatmap_primary_sample_percent.csv'),
    index=False, sep=',')

In [17]:
## Recurrent

# Collect one segment table per chromosome and concatenate once at the end,
# rather than re-copying the accumulated frame on every iteration.
temp_heatmap_parts = []

for chrom in chromosomes:

    # Rows on this chromosome from every non-primary table (any table whose
    # second '_'-separated name field is not 'P').
    temp_matching_chrom_df_list = [
        sample_df[sample_df['CHROM'] == chrom]
        for sample_df in (globals()[name] for name in somatic_large_DEL_filtered_df_names
                          if name.split('_')[1] != 'P')
    ]
    temp_output_df = heatmap_data(temp_matching_chrom_df_list, chrom)

    # Chromosomes without any segment contribute nothing; leaving their empty
    # frames out keeps the column dtypes of the real data intact.
    if len(temp_output_df) > 0:
        temp_heatmap_parts.append(temp_output_df)

if temp_heatmap_parts:
    recurrent_large_DEL_heatmap = pd.concat(temp_heatmap_parts, ignore_index=True)
else:
    recurrent_large_DEL_heatmap = pd.DataFrame(columns=['chr', 'start', 'end', 'value'])

In [18]:
# Inspect the per-chromosome segment counts gathered for the non-primary samples.
recurrent_large_DEL_heatmap

Unnamed: 0,chr,start,end,value
0,chr1,690000,1650000,1.0
1,chr1,1650001,1739999,0.0
2,chr1,1740000,2650000,1.0
3,chr1,2650001,5959999,0.0
4,chr1,5960000,6270000,1.0
...,...,...,...,...
1256,chrX,155984108,155984222,5.0
1257,chrX,155984223,155984254,4.0
1258,chrX,155984255,155984259,3.0
1259,chrX,155984260,155984263,2.0


In [19]:
# Largest segment count in the recurrent deletion table.
max(recurrent_large_DEL_heatmap['value'])

8.0

In [20]:
# Number of non-primary deletion tables: names whose second '_'-separated field is not 'P'.
sum(1 for name in somatic_large_DEL_filtered_df_names if name.split('_')[1] != 'P')

18

In [21]:
CIRCOS_recurrent_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/recurrent/'

# Save the full recurrent deletion table (zero-count segments included) as CSV.
recurrent_large_DEL_heatmap.to_csv(
    os.path.join(CIRCOS_recurrent_sample_path, 'large_DEL_heatmap_recurrent_sample_raw.csv'),
    sep=',',
    index=False,
)

In [22]:
# Drop the segments whose count is zero.
recurrent_large_DEL_heatmap_nonzero = recurrent_large_DEL_heatmap.loc[recurrent_large_DEL_heatmap['value'].ne(0)]

In [23]:
CIRCOS_recurrent_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/recurrent/'

# Save the recurrent deletion segments with a non-zero count as CSV.
recurrent_large_DEL_heatmap_nonzero.to_csv(
    os.path.join(CIRCOS_recurrent_sample_path, 'large_DEL_heatmap_recurrent_sample_nonzero.csv'),
    sep=',',
    index=False,
)

In [24]:
# Convert counts to the fraction of non-primary samples. The previous hard-coded
# denominator was 23, which does not match the number of non-primary tables
# actually loaded; derive it from the sample names (second '_'-separated field
# not 'P'), the same rule used to build the heatmap.
# NOTE(review): if 23 was an intentional cohort size, restore it as a named constant.
recurrent_large_DEL_sample_count = sum(
    1 for name in somatic_large_DEL_filtered_df_names if name.split('_')[1] != 'P')

recurrent_large_DEL_heatmap_percentage = recurrent_large_DEL_heatmap_nonzero.copy()
recurrent_large_DEL_heatmap_percentage['value'] = (
    recurrent_large_DEL_heatmap_percentage['value'] / recurrent_large_DEL_sample_count)

In [25]:
CIRCOS_recurrent_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/recurrent/'

# Save the normalised recurrent deletion segments as CSV.
recurrent_large_DEL_heatmap_percentage.to_csv(
    os.path.join(CIRCOS_recurrent_sample_path, 'large_DEL_heatmap_recurrent_sample_percent.csv'),
    sep=',',
    index=False,
)

## Large duplications

In [26]:
### Load filtered somatic large duplications ###

# Folder whose entries are read below, one table per file.
somatic_large_DUP_path = '/Users/ryanyutian/Desktop/Manuscript/filtered_sv/DUP'

somatic_large_DUP_filtered_df_names = []

# Files are opened by bare name, so the folder is made the working directory.
os.chdir(somatic_large_DUP_path)

# Entries whose name contains 'DS' are skipped; the rest are taken in sorted order.
temp_files = sorted(filter(lambda entry: 'DS' not in entry, os.listdir(somatic_large_DUP_path)))

for file_name in temp_files:

    # Each table is stored as a global named after the file minus its last
    # four characters; the same name is recorded for later look-ups.
    globals().update({file_name[:-4]: pd.read_csv(file_name)})
    somatic_large_DUP_filtered_df_names += [file_name[:-4]]

In [27]:
## Primary

# Collect one segment table per chromosome and concatenate once at the end,
# rather than re-copying the accumulated frame on every iteration.
temp_heatmap_parts = []

for chrom in chromosomes:

    # Rows on this chromosome from every primary table (a table is primary when
    # the second '_'-separated field of its name is 'P').
    temp_matching_chrom_df_list = [
        sample_df[sample_df['CHROM'] == chrom]
        for sample_df in (globals()[name] for name in somatic_large_DUP_filtered_df_names
                          if name.split('_')[1] == 'P')
    ]
    temp_output_df = heatmap_data(temp_matching_chrom_df_list, chrom)

    # Chromosomes without any segment contribute nothing; leaving their empty
    # frames out keeps the column dtypes of the real data intact.
    if len(temp_output_df) > 0:
        temp_heatmap_parts.append(temp_output_df)

if temp_heatmap_parts:
    primary_large_DUP_heatmap = pd.concat(temp_heatmap_parts, ignore_index=True)
else:
    primary_large_DUP_heatmap = pd.DataFrame(columns=['chr', 'start', 'end', 'value'])

In [28]:
# Inspect the per-chromosome segment counts gathered for the primary samples.
primary_large_DUP_heatmap

Unnamed: 0,chr,start,end,value
0,chr1,1650000,1730000,3.0
1,chr1,1730001,1740000,2.0
2,chr1,1740001,3976314,0.0
3,chr1,3976315,4006403,1.0
4,chr1,4006404,13099999,0.0
...,...,...,...,...
725,chrX,155311266,155336689,2.0
726,chrX,155336690,155470374,1.0
727,chrX,155470375,155641405,0.0
728,chrX,155641406,155958275,1.0


In [29]:
# Largest segment count in the primary duplication table.
max(primary_large_DUP_heatmap['value'])

7.0

In [30]:
# Keep every segment except those on chr12 that lie entirely inside
# 17648133-17987890 (start at or after the lower bound, end at or before the upper).
condition = ~(
    primary_large_DUP_heatmap['chr'].eq('chr12')
    & primary_large_DUP_heatmap['start'].ge(17648133)
    & primary_large_DUP_heatmap['end'].le(17987890)
)

primary_large_DUP_heatmap = primary_large_DUP_heatmap.loc[condition]


In [31]:
# Largest segment count in the primary duplication table as it stands at this point.
max(primary_large_DUP_heatmap['value'])

7.0

In [32]:
# Number of primary duplication tables: names whose second '_'-separated field is 'P'.
sum(1 for name in somatic_large_DUP_filtered_df_names if name.split('_')[1] == 'P')

13

In [33]:
CIRCOS_primary_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/primary'

# The directory string has no trailing separator, so '+' concatenation wrote
# '.../primarylarge_DUP_...csv' beside the folder instead of a file inside it.
# os.path.join inserts the separator; the folder is created if it is missing.
os.makedirs(CIRCOS_primary_sample_path, exist_ok=True)
primary_large_DUP_heatmap.to_csv(
    os.path.join(CIRCOS_primary_sample_path, 'large_DUP_heatmap_primary_sample_raw.csv'),
    index=False, sep=',')

In [34]:
# Drop the segments whose count is zero.
primary_large_DUP_heatmap_nonzero = primary_large_DUP_heatmap.loc[primary_large_DUP_heatmap['value'].ne(0)]

In [35]:
CIRCOS_primary_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/primary'

# The directory string has no trailing separator, so '+' concatenation wrote
# '.../primarylarge_DUP_...csv' beside the folder instead of a file inside it.
# os.path.join inserts the separator; the folder is created if it is missing.
os.makedirs(CIRCOS_primary_sample_path, exist_ok=True)
primary_large_DUP_heatmap_nonzero.to_csv(
    os.path.join(CIRCOS_primary_sample_path, 'large_DUP_heatmap_primary_sample_nonzero.csv'),
    index=False, sep=',')

In [36]:
# Convert counts to the fraction of primary samples. The denominator is derived
# from the loaded sample names (second '_'-separated field 'P') instead of a
# hard-coded 13, so it cannot drift when the input folder changes.
primary_large_DUP_sample_count = sum(
    1 for name in somatic_large_DUP_filtered_df_names if name.split('_')[1] == 'P')

primary_large_DUP_heatmap_percentage = primary_large_DUP_heatmap_nonzero.copy()
primary_large_DUP_heatmap_percentage['value'] = (
    primary_large_DUP_heatmap_percentage['value'] / primary_large_DUP_sample_count)

In [37]:
CIRCOS_primary_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/primary'

# The directory string has no trailing separator, so '+' concatenation wrote
# '.../primarylarge_DUP_...csv' beside the folder instead of a file inside it.
# os.path.join inserts the separator; the folder is created if it is missing.
os.makedirs(CIRCOS_primary_sample_path, exist_ok=True)
primary_large_DUP_heatmap_percentage.to_csv(
    os.path.join(CIRCOS_primary_sample_path, 'large_DUP_heatmap_primary_sample_percent.csv'),
    index=False, sep=',')

In [38]:
## Recurrent

# Collect one segment table per chromosome and concatenate once at the end,
# rather than re-copying the accumulated frame on every iteration.
temp_heatmap_parts = []

for chrom in chromosomes:

    # Rows on this chromosome from every non-primary table (any table whose
    # second '_'-separated name field is not 'P').
    temp_matching_chrom_df_list = [
        sample_df[sample_df['CHROM'] == chrom]
        for sample_df in (globals()[name] for name in somatic_large_DUP_filtered_df_names
                          if name.split('_')[1] != 'P')
    ]
    temp_output_df = heatmap_data(temp_matching_chrom_df_list, chrom)

    # Chromosomes without any segment contribute nothing; leaving their empty
    # frames out keeps the column dtypes of the real data intact.
    if len(temp_output_df) > 0:
        temp_heatmap_parts.append(temp_output_df)

if temp_heatmap_parts:
    recurrent_large_DUP_heatmap = pd.concat(temp_heatmap_parts, ignore_index=True)
else:
    recurrent_large_DUP_heatmap = pd.DataFrame(columns=['chr', 'start', 'end', 'value'])

In [39]:
# Inspect the per-chromosome segment counts gathered for the non-primary samples.
recurrent_large_DUP_heatmap

Unnamed: 0,chr,start,end,value
0,chr1,590000,950000,1.0
1,chr1,950001,1639999,0.0
2,chr1,1640000,1649999,3.0
3,chr1,1650000,1730000,4.0
4,chr1,1730001,1740000,3.0
...,...,...,...,...
1149,chrY,20943060,21064800,1.0
1150,chrY,21064801,22208280,0.0
1151,chrY,22208281,22262957,1.0
1152,chrY,22262958,22299244,0.0


In [40]:
# Largest segment count in the recurrent duplication table.
max(recurrent_large_DUP_heatmap['value'])

11.0

In [41]:
# Keep every segment except those on chr12 that lie entirely inside
# 17648133-17987890 (start at or after the lower bound, end at or before the upper).
condition = ~(
    recurrent_large_DUP_heatmap['chr'].eq('chr12')
    & recurrent_large_DUP_heatmap['start'].ge(17648133)
    & recurrent_large_DUP_heatmap['end'].le(17987890)
)

recurrent_large_DUP_heatmap = recurrent_large_DUP_heatmap.loc[condition]


In [42]:
# Largest segment count in the recurrent duplication table as it stands at this point.
max(recurrent_large_DUP_heatmap['value'])

11.0

In [43]:
# Number of non-primary duplication tables: names whose second '_'-separated field is not 'P'.
sum(1 for name in somatic_large_DUP_filtered_df_names if name.split('_')[1] != 'P')

18

In [44]:
CIRCOS_recurrent_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/recurrent/'

# Save the current recurrent duplication table (zero-count segments included) as CSV.
recurrent_large_DUP_heatmap.to_csv(
    os.path.join(CIRCOS_recurrent_sample_path, 'large_DUP_heatmap_recurrent_sample_raw.csv'),
    sep=',',
    index=False,
)

In [45]:
# Drop the segments whose count is zero.
recurrent_large_DUP_heatmap_nonzero = recurrent_large_DUP_heatmap.loc[recurrent_large_DUP_heatmap['value'].ne(0)]

In [46]:
CIRCOS_recurrent_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/recurrent/'

# Save the recurrent duplication segments with a non-zero count as CSV.
recurrent_large_DUP_heatmap_nonzero.to_csv(
    os.path.join(CIRCOS_recurrent_sample_path, 'large_DUP_heatmap_recurrent_sample_nonzero.csv'),
    sep=',',
    index=False,
)

In [47]:
# Convert counts to the fraction of non-primary samples. The previous hard-coded
# denominator was 23, which does not match the number of non-primary tables
# actually loaded; derive it from the sample names (second '_'-separated field
# not 'P'), the same rule used to build the heatmap.
# NOTE(review): if 23 was an intentional cohort size, restore it as a named constant.
recurrent_large_DUP_sample_count = sum(
    1 for name in somatic_large_DUP_filtered_df_names if name.split('_')[1] != 'P')

recurrent_large_DUP_heatmap_percentage = recurrent_large_DUP_heatmap_nonzero.copy()
recurrent_large_DUP_heatmap_percentage['value'] = (
    recurrent_large_DUP_heatmap_percentage['value'] / recurrent_large_DUP_sample_count)

In [48]:
CIRCOS_recurrent_sample_path = '/Users/ryanyutian/Desktop/Manuscript/CIRCOS/primary_relapse_data/recurrent/'

# Save the normalised recurrent duplication segments as CSV.
recurrent_large_DUP_heatmap_percentage.to_csv(
    os.path.join(CIRCOS_recurrent_sample_path, 'large_DUP_heatmap_recurrent_sample_percent.csv'),
    sep=',',
    index=False,
)