In [3]:
import pandas as pd
import numpy as np
import os
from joblib import Parallel, delayed
from tqdm import tqdm

## convert the IS boundary to TAD

In [4]:
## auto chr
IS_5kb = pd.read_csv('data/inter30_5kb.robust_cutoff_boundaries_100kb.bed', sep='\t', header = None)
IS_5kbnew1 = IS_5kb.loc[:, [0, 1, 2]].copy()
IS_5kbnew1.columns = ['chr','IS_start','IS_end']
IS_5kbnew1['IS_start'] = IS_5kbnew1['IS_start']-1

In [5]:
chr_size = pd.read_csv('data/hg38.chrom.sizes.bed', sep='\t', header = None)
invalid = ['chrX', 'chrY', 'chrMT']
chr_size_new = chr_size[~chr_size[0].str.contains('|'.join(invalid))].copy()
chr_size_new[0] = chr_size_new[0].str.replace('chr','').astype(int)

In [6]:
# Save the current working directory
current_directory = os.getcwd()

directory = 'preprocess_data/'
# Check if the directory exists
if not os.path.exists(directory):
    # Create the directory
    os.makedirs(directory)

# change the path to your own working directory
os.chdir('preprocess_data/')
for k, v in IS_5kbnew1.groupby('chr'):
    v.to_csv(f'IS_boundary_start_end_chr{k}.csv', index = None)

In [7]:
# for k in range (1,23):
#     filename = 'IS_boundary_start_end_chr{}.csv'.format(k)
#     boundary = pd.read_csv(filename, sep=',', header = 0)
#     boundary = boundary.drop_duplicates()
#     start = boundary[['chr', 'IS_end']]
#     start_1 = start.iloc[:-1]
#     start_1.reset_index(inplace = True, drop = True)
#     end =  boundary[['IS_start']]
#     end_1 = end.iloc[1:]
#     end_1.reset_index(inplace = True, drop = True)
#     result = pd.concat([start_1, end_1], axis=1)
#     TAD1_end = boundary.iloc[0]['IS_start'].astype(int)
#     df2 = pd.DataFrame({'chr': [k], 'IS_end' : ['0'], 'IS_start' : TAD1_end})
#     TADlast_start = boundary.iloc[-1]['IS_end'].astype(int)
#     TAD1ast_end = chr_size_new.loc[chr_size_new[0] == k, 1].item()
#     df3 = pd.DataFrame({'chr': [k], 'IS_end' : TADlast_start, 'IS_start' : TAD1ast_end})
#     dfnew = pd.concat([df2, result, df3], ignore_index = True, axis = 0)
#     dfnew.columns = ['chr','TAD_start','TAD_end']
#     out_file = 'IS_TAD_start_end_chr{}.bed'.format(k)
#     dfnew.to_csv(out_file, index = False, sep='\t', header=True)

In [8]:
# Iterate over chromosome numbers from 1 to 22
for k in range(1, 23):
    filename = f'IS_boundary_start_end_chr{k}.csv'
    boundary = pd.read_csv(filename, sep=',', header=0).drop_duplicates()
    start = boundary[['chr', 'IS_end']].iloc[:-1].reset_index(drop=True)
    end = boundary[['IS_start']].iloc[1:].reset_index(drop=True)
    result = pd.concat([start, end], axis=1)
    TAD1_end = boundary.iloc[0]['IS_start'].astype(int)
    df2 = pd.DataFrame({'chr': [k], 'IS_end': ['0'], 'IS_start': TAD1_end})
    TADlast_start = boundary.iloc[-1]['IS_end'].astype(int)
    TAD1ast_end = chr_size_new.loc[chr_size_new[0] == k, 1].item()
    df3 = pd.DataFrame({'chr': [k], 'IS_end': TADlast_start, 'IS_start': TAD1ast_end})
    dfnew = pd.concat([df2, result, df3], ignore_index=True)
    dfnew.columns = ['chr', 'TAD_start', 'TAD_end']
    out_file = f'IS_TAD_start_end_chr{k}.bed'
    dfnew.to_csv(out_file, index=False, sep='\t', header=True)

In [9]:
root, dirs, files = next(os.walk(os.getcwd()))
with open('all_IS_TAD_start_end.bed', 'a') as outfile:
    for infile in files:
        if infile.startswith('IS_TAD_start'):
            df = pd.read_csv(os.path.join(root, infile), sep='\t', skiprows=[0], header = None).drop_duplicates()
            df.to_csv(outfile, index=False, sep='\t', header = None)

In [10]:
f1 = pd.read_csv('all_IS_TAD_start_end.bed', sep='\t', header = None)
f1new = f1[[0, 1, 2]].copy()
f1new.columns = ['chr','TAD_start','TAD_end']
f1new.sort_values(by=['chr', 'TAD_start'], inplace=True)
f1new.to_csv('final_IS_TAD_start_end_5kb.bed', index=False, sep='\t', header = True)
f1new['length'] = f1new['TAD_end'] - f1new['TAD_start']
f1new.reset_index(inplace = True, drop = True)

In [11]:
# Calculate minimums in Pandas without `zero`-values
f1new[f1new > 0].loc[:, 'length'].min()

5000.0

In [12]:
f1new_nozero = f1new[(f1new['length'] > 0)]
f1new_nozero['length'].min()

5000

In [13]:
## for 5kb:
# since min_TAD is 5000bp, which includes 1 bin
# we set that exclude if there is any TAD includes >= 1 NA

### check if the IS_TAD includes NA insulation score regions

In [21]:
os.chdir(current_directory)
IS = pd.read_csv('data/inter30_5kb.insulation_100kb.bed', sep='\t', header = None)
ISnew = IS[[0, 1, 2, 4]].copy()
ISnew.columns = ['chr','IS_start','IS_end','IS']
ISnew['chr'] = ISnew['chr'].str.replace('chr','').astype(str)
invalid = ['X', 'Y', 'MT']
ISnew1 = ISnew.loc[~ISnew['chr'].str.contains('|'.join(invalid)), :].copy()
ISnew1['chr'] = ISnew1['chr'].astype(int)
ISnew1['IS'] = ISnew1['IS'].astype(str)
IS_NA = ISnew1[ISnew1['IS'] == 'nan']

In [22]:
IS_NA

Unnamed: 0,chr,IS_start,IS_end,IS
0,1,1,5000,
1,1,5001,10000,
2,1,10001,15000,
3,1,15001,20000,
4,1,20001,25000,
...,...,...,...,...
575005,22,50795001,50800000,
575006,22,50800001,50805000,
575007,22,50805001,50810000,
575008,22,50810001,50815000,


### find if the IS_TAD includes at least 10% of the length of the TADs are NA IS region

In [23]:
def all_chr_na(chr_i):   
    filename = 'IS_TAD_start_end_chr{}.bed'.format(chr_i)
    TAD = pd.read_csv(filename, sep='\t', header = 0)
    IS_nan = IS_NA[IS_NA.chr == chr_i] 
    overlap = pd.DataFrame(columns=['chr', 'TAD_start', 'TAD_end', 'IS'])
    for start, end in zip(TAD.loc[:, 'TAD_start'], TAD.loc[:, 'TAD_end']):
        temp_overlap = IS_nan[IS_nan.loc[:, 'IS_start'].between(start, end) | IS_nan.loc[:, 'IS_end'].between(start, end)]
        if temp_overlap.shape[0] >= 1:
            temp_overlap2 = temp_overlap.copy(deep = True)
            temp_overlap2['chr'] = IS_nan.loc[:, 'chr']
            temp_overlap2['TAD_start'] = start
            temp_overlap2['TAD_end'] = end
            temp_overlap2['IS'] = IS_nan.loc[:, 'IS']
            temp_overlap2['num_NA'] = int(temp_overlap.shape[0])
            temp_overlap2['num_NA'] = temp_overlap2['num_NA']
            overlap = pd.concat([overlap, temp_overlap2], axis=0).astype(str)

    ### overlap file
    final = overlap[['chr', 'TAD_start', 'TAD_end', 'IS', 'num_NA']].drop_duplicates()
    final['length'] = final['TAD_end'].astype(int) - final['TAD_start'].astype(int)
    #final['include_NA'] = final['lenth'].astype(int)//5000
    out_file = 'NA_TAD_IS_start_end_chr{}.bed'.format(chr_i)
    final.to_csv(out_file, index = False, sep='\t', header=True)
    
    final_larger = final[final['num_NA'].astype(float).astype(int)*5000 > (final['length']//10).astype(int)]
    out_file_1 = 'NA_larger_incluna_TAD_IS_start_end_chr{}.bed'.format(chr_i)
    final_larger.to_csv(out_file_1, index = False, sep='\t', header=True)

In [24]:
import time
os.chdir('preprocess_data/')
start = time.time()
all_data = Parallel(n_jobs=40)(delayed(all_chr_na)(chr_i) for chr_i in range(1,23))
end = time.time()
print('{:.4f} s'.format(end-start))

3.0770 s


### remove the NA_TAD from the all_TAD result

In [25]:
def all_chr_nona(chr_i):   
    filename = 'IS_TAD_start_end_chr{}.bed'.format(chr_i)
    TAD = pd.read_csv(filename, sep='\t', header = 0)
    filename_1 = 'NA_larger_incluna_TAD_IS_start_end_chr{}.bed'.format(chr_i)
    IS_nan = pd.read_csv(filename_1, sep='\t', header = 0, usecols=range(3))
    final = pd.concat([TAD, IS_nan]).drop_duplicates(keep=False)
    out_file = 'noNA_TAD_IS_start_end_chr{}.bed'.format(chr_i)
    final.to_csv(out_file, index = False, sep='\t', header=True)

In [26]:
start = time.time()
all_data = Parallel(n_jobs=40)(delayed(all_chr_nona)(chr_i) for chr_i in range(1,23))
end = time.time()
print('{:.4f} s'.format(end-start))

0.4093 s


In [27]:
root, dirs, files = next(os.walk(os.getcwd()))
with open('all_noNA_IS_TAD_start_end.bed', 'a') as outfile:
    for infile in files:
        if infile.startswith('noNA_TAD_IS'):
            df = pd.read_csv(os.path.join(root, infile), sep='\t', skiprows=[0], header = None).drop_duplicates()
            df.to_csv(outfile, index=False, sep='\t', header = None)

In [38]:
f2 = pd.read_csv('all_noNA_IS_TAD_start_end.bed', sep='\t', header = None)
f2new = f2[[0,1,2]].copy()
f2new.columns = ['chr','TAD_start','TAD_end']
f2new.sort_values(by=['chr', 'TAD_start'], inplace=True)
f2new['length'] = f2new['TAD_end'] - f2new['TAD_start']

In [39]:
### exclude the TAD with zero length
f2new_nozero = f2new[(f2new['length'] > 0)]
f2new_nozero.reset_index(inplace = True, drop = True)

In [40]:
f2new_nozero['length'].max()

7265000

In [41]:
f2new_nozero.to_csv('final_noNA_IS_TAD_start_end_5kb.bed', index=False, sep='\t', header = True)

### find the TADs that converted from IS but not having maxima 

In [42]:
# Add 'chr' prefix to each entry in the 'chr' column to get the IN_TAD with no NA
f2new_nozero = f2new_nozero.copy()
f2new_nozero['chr'] = 'chr' + f2new_nozero['chr'].astype(str)

In [43]:
f2new_nozero

Unnamed: 0,chr,TAD_start,TAD_end,length
0,chr1,870000,975000,105000
1,chr1,980000,1110000,130000
2,chr1,1115000,1300000,185000
3,chr1,1305000,1370000,65000
4,chr1,1375000,1510000,135000
...,...,...,...,...
14437,chr22,50045000,50135000,90000
14438,chr22,50140000,50240000,100000
14439,chr22,50245000,50345000,100000
14440,chr22,50350000,50505000,155000


In [50]:
os.chdir(current_directory)
IS_TAD_containMaxima = pd.read_csv('data/noNA_IS_TAD_containsMaxima.bed', sep='\t', header=None)
IS_TAD_containMaxima.columns = ['#chr', 'TAD_start', 'TAD_end', 'length']

In [51]:
no_maxima = pd.concat([f2new_nozero, IS_TAD_containMaxima]).drop_duplicates(keep=False)

In [52]:
## delete the duplicate overlap_IS_12878

In [62]:
overlap = pd.read_csv('data/overlap_IS_arrow_new_maxima.bed', sep='\t', header = None)
overlap.sort_values(by=[0, 1], inplace=True)
#overlap
# df = overlap.drop_duplicates(subset=[0,1,2], keep='last')
overlap.columns = ["IS_chr", "IS_Start", "IS_End", "IS_TAD_length", "#chr", "TAD_start", "TAD_end"]

In [63]:
overlap

Unnamed: 0,IS_chr,IS_Start,IS_End,IS_TAD_length,#chr,TAD_start,TAD_end
3804,chr1,870000,975000,105000,chr1,885000,970000
4134,chr1,980000,1110000,130000,chr1,980000,1070000
545,chr1,1115000,1300000,185000,chr1,980000,1305000
780,chr1,1375000,1510000,135000,chr1,1320000,1510000
1074,chr1,1750000,1915000,165000,chr1,1760000,1905000
...,...,...,...,...,...,...,...
8686,chr9,136950000,137040000,90000,chr9,136945000,137035000
8687,chr9,137115000,137245000,130000,chr9,137085000,137245000
8688,chr9,137305000,137410000,105000,chr9,137310000,137405000
8689,chr9,137460000,137540000,80000,chr9,137450000,137540000


### find the TADs that arrowhead detected but IS did not

In [64]:
## retrieve the regions that both arrow and IS detected but arrow has subTADs within it (detected more than one TAD)
overlap_subTADs = overlap[overlap.duplicated(['IS_chr', 'IS_Start', 'IS_End'], keep=False)]

In [65]:
## remove overlap_subTADs from overlap to get the overlap_no_subTADs
overlap_no_subTADs = pd.concat([overlap, overlap_subTADs]).drop_duplicates(keep=False)

In [97]:
## overlap_IS is using "overlap" but not "overlap_no_subTADs" to filter because we would like to exculde all 
#overlepped IS and for those having subTADs we are using the arrowhead TADs results from overlap_subTADs
overlap_IS = overlap[['IS_chr', "IS_Start", 'IS_End']]
df_1 = overlap_IS.drop_duplicates(subset=['IS_chr', "IS_Start", 'IS_End'], keep='last').reset_index(drop = True)

In [98]:
ori_IS = pd.read_csv('data/noNA_IS_TAD_containsMaxima.bed', sep='\t', header = 0)
ori_IS.columns = ["IS_chr", "IS_Start", "IS_End", "IS_TAD_length"]
ori_IS = ori_IS[['IS_chr', "IS_Start", 'IS_End']]
no_overlap_1 = pd.concat([df_1, ori_IS]).drop_duplicates(keep=False)  

In [100]:
overlap_arrow = overlap[['#chr', "TAD_start", 'TAD_end']]
df = overlap_arrow.drop_duplicates(subset=['#chr', "TAD_start", 'TAD_end'], keep='last').reset_index(drop=True)

In [101]:
ori_arrow = pd.read_csv('data/arrow_auto_5000_blocks.bedpe', sep='\t', header = 0)
no_overlap = pd.concat([df, ori_arrow]).drop_duplicates(keep=False)  

### generate overlap_arrow and overlap_IS to merge in bedtools merge

In [102]:
overlap_arrow = overlap_no_subTADs[['#chr', "TAD_start", 'TAD_end']]
overlap_arrow.columns = ['chr', 'TAD_start', 'TAD_end']
overlap_IS = overlap_no_subTADs[['IS_chr', "IS_Start", 'IS_End']]

In [103]:
overlap_arrow_IS = pd.concat([overlap_IS, overlap_arrow], axis = 1)

In [104]:
merge_overlap_arrow_IS_startend = pd.DataFrame({'TAD_Start': np.minimum(overlap_arrow_IS['IS_Start'], overlap_arrow_IS['TAD_start']),
                           'TAD_End': np.maximum(overlap_arrow_IS['IS_End'], overlap_arrow_IS['TAD_end'])})

In [105]:
merge_overlap_arrow_IS = pd.concat([overlap_arrow_IS['chr'], merge_overlap_arrow_IS_startend], axis = 1)

In [106]:
merge_overlap_arrow_IS.sort_values(by=['chr', 'TAD_Start'], inplace=True)

### add no overlap from IS, no overlap from arrow and overlap_subTADs to merged_concat_overlapped_IS_arrow_recip.bed

In [107]:
# arrowhead
no_overlap.columns = ['chr', 'TAD_Start', 'TAD_End']
# IS
no_overlap_1.columns = ['chr', 'TAD_Start', 'TAD_End']

In [108]:
overlap_subTADs_df = overlap_subTADs[['#chr', "TAD_start", 'TAD_end']]
overlap_subTADs_df.columns = ['chr', 'TAD_Start', 'TAD_End']

In [109]:
# to merge all
all_TAD = pd.concat([no_overlap_1, no_overlap, overlap_subTADs_df, merge_overlap_arrow_IS])
all_TAD.sort_values(by=['chr', 'TAD_Start'], inplace=True)
all_TAD_nodup = all_TAD.drop_duplicates()
all_TAD_nodup.sort_values(by=['chr', 'TAD_Start', 'TAD_End'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_TAD_nodup.sort_values(by=['chr', 'TAD_Start', 'TAD_End'], inplace=True)


In [110]:
all_TAD_nodup.shape

(18865, 3)