In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Maximum interval length; intervals whose length exceeds this value are split
# into shorter pieces in the "Split intervals" section.
interval_length_max = 250

# Load capture bed file into DataFrame

## Prep df

In [4]:
# Read the tab-separated capture BED; the three columns are named explicitly
# because the file is read without a header row.
capture_file = "CEN_capture_v1.0.0.bed"
capture_df = pd.read_csv(
    capture_file,
    sep='\t',
    names=['chrom', 'start', 'end'],
    low_memory=False,
)
capture_df.head()

Unnamed: 0,chrom,start,end
0,1,7917075,7917076
1,1,7917093,7917213
2,1,10566214,10566215
3,1,17345175,17345483
4,1,17345430,17345431


In [5]:
# Interval length is end minus start; the cell then displays the number of intervals.
capture_df['length'] = capture_df['end'].sub(capture_df['start'])
len(capture_df)

52052

In [10]:
# Distance from every interval to its neighbours on the same chromosome.
#   dist_before = start - end of the previous interval (0 for the first interval of a chromosome)
#   dist_after  = dist_before of the next interval     (0 for the last interval of a chromosome)
# A negative distance means the interval starts before the previous one ends.
#
# groupby(...).shift() keeps each value aligned with its own row, so the result does not
# depend on a chromosome's rows being contiguous, and the columns are always created
# (the previous list-based version silently skipped them when the lengths disagreed).
prev_end = capture_df.groupby('chrom', sort=False)['end'].shift(1)
capture_df['dist_before'] = (capture_df['start'] - prev_end).fillna(0).astype('int64')

next_dist = capture_df['dist_before'].groupby(capture_df['chrom'], sort=False).shift(-1)
capture_df['dist_after'] = next_dist.fillna(0).astype('int64')

capture_df.head()

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
1,1,7917093,7917213,120,0,9427962
3,1,17345175,17345483,308,9427962,3465
12,1,17348948,17349068,120,3465,-29
13,1,17349039,17349159,120,-29,-92
14,1,17349067,17349108,41,-92,-36


## Remove intervals with length <= 3
Save the capture BED, without the regions of length 1, 2 or 3, to a new BED file.

In [11]:
# Row counts per dist_before value, restricted to values below 10.
capture_df.loc[capture_df['dist_before'].lt(10)].groupby('dist_before').count()

Unnamed: 0_level_0,chrom,start,end,length,dist_after
dist_before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-29481,1,1,1,1,1
-6760,1,1,1,1,1
-4795,1,1,1,1,1
-4367,1,1,1,1,1
-3467,1,1,1,1,1
...,...,...,...,...,...
5,2,2,2,2,2
6,2,2,2,2,2
7,1,1,1,1,1
8,2,2,2,2,2


In [12]:
# Row counts per interval length, restricted to lengths below 50.
capture_df.loc[capture_df['length'].lt(50)].groupby('length').count()

Unnamed: 0_level_0,chrom,start,end,dist_before,dist_after
length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,1,1,1,1,1
5,2,2,2,2,2
6,1,1,1,1,1
7,1,1,1,1,1
9,1,1,1,1,1
12,1,1,1,1,1
22,2,2,2,2,2
23,43,43,43,43,43
24,41,41,41,41,41
25,47,47,47,47,47


In [None]:
# Number of intervals shorter than 4, followed by their count per length.
short_intervals = capture_df.loc[capture_df['length'].lt(4)]
print(len(short_intervals))
short_intervals.groupby('length').count()

In [7]:
# Remove intervals shorter than 4 from capture_df in place, then show how many remain.
capture_df.drop(index=capture_df.index[capture_df['length'] < 4], inplace=True)
len(capture_df)

4172

In [8]:
# Write the filtered intervals as a headerless, tab-separated three-column BED file.
capture_fname = "CEN_capture_remove123.bed"
capture_df.to_csv(
    capture_fname,
    sep='\t',
    columns=['chrom', 'start', 'end'],
    header=False,
    index=False,
)

# Bedtools merge regions
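Run in the terminal: ```bedtools merge -d 10 -i CEN_capture_remove123.bed > CEN_capture_remove123_merged10.bed```

Note: the `CEN_capture_remove123_merged1.bed` file loaded in the next section is not created by this command; presumably it was produced the same way with `-d 1` (TODO confirm).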

# Load the remove123, merged (merged1) bed file into a DataFrame

## prep df

In [15]:
# Load the "merged1" BED and add an interval-length column.
# (The "merged10" BED is loaded separately in the "Split intervals" section.)
merged1_file = "CEN_capture_remove123_merged1.bed"
merged1_df = pd.read_csv(
    merged1_file,
    sep='\t',
    names=['chrom', 'start', 'end'],
)
merged1_df['length'] = merged1_df['end'].sub(merged1_df['start'])
print(len(merged1_df))
merged1_df.head()

2239


Unnamed: 0,chrom,start,end,length
0,1,7917093,7917213,120
1,1,17345175,17345483,308
2,1,17348948,17349280,332
3,1,17350437,17350599,162
4,1,17354213,17354390,177


In [16]:
# Distance from every merged interval to its neighbours on the same chromosome.
#   dist_before = start - end of the previous interval (0 for the first interval of a chromosome)
#   dist_after  = dist_before of the next interval     (0 for the last interval of a chromosome)
#
# groupby(...).shift() keeps each value aligned with its own row, so the result does not
# depend on a chromosome's rows being contiguous, and the columns are always created
# (the previous list-based version silently skipped them when the lengths disagreed).
prev_end = merged1_df.groupby('chrom', sort=False)['end'].shift(1)
merged1_df['dist_before'] = (merged1_df['start'] - prev_end).fillna(0).astype('int64')

next_dist = merged1_df['dist_before'].groupby(merged1_df['chrom'], sort=False).shift(-1)
merged1_df['dist_after'] = next_dist.fillna(0).astype('int64')

merged1_df.head()

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
0,1,7917093,7917213,120,0,9427962
1,1,17345175,17345483,308,9427962,3465
2,1,17348948,17349280,332,3465,1157
3,1,17350437,17350599,162,1157,3614
4,1,17354213,17354390,177,3614,674


In [22]:
# Row counts per interval length, restricted to lengths below 100.
merged1_df.loc[merged1_df['length'].lt(100)].groupby('length').count()

Unnamed: 0_level_0,chrom,start,end,dist_before,dist_after
length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
72,1,1,1,1,1
81,1,1,1,1,1
84,3,3,3,3,3
87,2,2,2,2,2
90,2,2,2,2,2
92,3,3,3,3,3
93,6,6,6,6,6
94,1,1,1,1,1
95,1,1,1,1,1
96,13,13,13,13,13


In [24]:
# Row counts per dist_before value, restricted to values below 20.
merged1_df.loc[merged1_df['dist_before'].lt(20)].groupby('dist_before').count()

Unnamed: 0_level_0,chrom,start,end,length,dist_after
dist_before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,23,23,23,23,23
1,1,1,1,1,1
2,1,1,1,1,1
6,1,1,1,1,1
9,1,1,1,1,1
12,2,2,2,2,2
13,1,1,1,1,1
14,3,3,3,3,3
15,1,1,1,1,1
16,2,2,2,2,2


In [28]:
# Intervals whose dist_before is below 11 but not 0.
merged1_df.loc[merged1_df['dist_before'].lt(11) & merged1_df['dist_before'].ne(0)]

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
1097,9,140127630,140127886,256,6,168
1570,16,2140623,2141205,582,2,127
1638,17,15036412,15036532,120,1,97501
2158,X,49113351,49113499,148,9,354


In [36]:
# Show every close interval (dist_before below 11 and not 0) together with the row before it.
# The index labels collected in `close` are reused as positions for .iloc; that works
# because merged1_df still has the default 0..n-1 index from read_csv.
close_mask = merged1_df['dist_before'].lt(11) & merged1_df['dist_before'].ne(0)
close = merged1_df.loc[close_mask].index.to_list()
pair = [label - 1 for label in close]
close_ones = close + pair
merged1_df.iloc[close_ones].sort_index()

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
1096,9,140127205,140127624,419,20,6
1097,9,140127630,140127886,256,6,168
1569,16,2140255,2140621,366,25,2
1570,16,2140623,2141205,582,2,127
1637,17,15036291,15036411,120,7445537,1
1638,17,15036412,15036532,120,1,97501
2157,X,49113177,49113342,165,884,9
2158,X,49113351,49113499,148,9,354


## Split intervals

In [38]:
# Load the BED produced by `bedtools merge -d 10` and add an interval-length column.
merged10_file = "CEN_capture_remove123_merged10.bed"
merged10_df = pd.read_csv(
    merged10_file,
    sep='\t',
    names=['chrom', 'start', 'end'],
)
merged10_df['length'] = merged10_df['end'].sub(merged10_df['start'])
print(len(merged10_df))

2235


In [39]:
# Distance from every merged interval to its neighbours on the same chromosome.
#   dist_before = start - end of the previous interval (0 for the first interval of a chromosome)
#   dist_after  = dist_before of the next interval     (0 for the last interval of a chromosome)
#
# groupby(...).shift() keeps each value aligned with its own row, so the result does not
# depend on a chromosome's rows being contiguous, and the columns are always created
# (the previous list-based version silently skipped them when the lengths disagreed).
prev_end = merged10_df.groupby('chrom', sort=False)['end'].shift(1)
merged10_df['dist_before'] = (merged10_df['start'] - prev_end).fillna(0).astype('int64')

next_dist = merged10_df['dist_before'].groupby(merged10_df['chrom'], sort=False).shift(-1)
merged10_df['dist_after'] = next_dist.fillna(0).astype('int64')

merged10_df.head()

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
0,1,7917093,7917213,120,0,9427962
1,1,17345175,17345483,308,9427962,3465
2,1,17348948,17349280,332,3465,1157
3,1,17350437,17350599,162,1157,3614
4,1,17354213,17354390,177,3614,674


### Identify regions that need splitting

In [41]:
# Work on a copy and flag the intervals longer than interval_length_max. The flag
# column is keyed by the integer value of interval_length_max itself, not by a string.
interval_list_df = merged10_df.copy()
interval_list_df[interval_length_max] = interval_list_df['length'].gt(interval_length_max)
# Build a new table in which no interval is longer than interval_length_max.

# Step 1: copy the intervals that are already short enough, unchanged
within_limit = ~interval_list_df[interval_length_max]
intervals_max = interval_list_df.loc[within_limit, ['chrom', 'start', 'end', 'length']].reset_index(drop=True)

In [42]:
# Step 2: split longer intervals into smaller ones within range
def split_intervals(length, interval_length_max):
    """Return the piece lengths obtained by splitting ``length`` into short intervals.

    For positive integers, an exact multiple of ``interval_length_max`` is cut into
    pieces of exactly that size. Otherwise one extra piece is used and the length is
    spread as evenly as possible: piece sizes differ by at most one, with the larger
    pieces first. The returned lengths always sum to ``length``.

    Parameters
    ----------
    length : int
        Total length to split.
    interval_length_max : int
        Largest allowed piece length.

    Returns
    -------
    list of int
        Piece lengths in order.
    """
    full_pieces = int(length / interval_length_max)
    if length % interval_length_max == 0:
        return [interval_length_max] * full_pieces

    # A partial piece is left over: use one more piece and balance the sizes.
    piece_count = full_pieces + 1
    base_size = int(length / piece_count)
    extra = length % piece_count
    # The first `extra` pieces absorb one additional base each.
    return [base_size + 1 if idx < extra else base_size for idx in range(piece_count)]

def new_intervals(row, int_length):
    """Split one interval into consecutive pieces and append them to the global table.

    Parameters
    ----------
    row : object with ``chrom``, ``start`` and ``length`` attributes
        The interval to split, e.g. a DataFrame row passed by ``apply(axis=1)``.
    int_length : int
        Maximum piece length, forwarded to ``split_intervals``.

    Returns
    -------
    None
        The pieces are concatenated onto the module-level ``split_intervals_max``
        DataFrame, which must already exist when this function is called.
    """
    global split_intervals_max

    piece_lengths = split_intervals(row.length, int_length)  # list of interval lengths
    if not piece_lengths:
        # Nothing to append (e.g. a zero-length interval). The previous version
        # raised UnboundLocalError here because starts/ends were only created
        # inside the loop body.
        return None

    # Lay the pieces end to end: each piece starts where the previous one ended.
    starts = []
    ends = []
    cursor = row.start
    for piece in piece_lengths:
        starts.append(cursor)
        cursor = cursor + piece
        ends.append(cursor)

    pieces_df = pd.DataFrame({
        'chrom': [row.chrom] * len(piece_lengths),
        'start': starts,
        'end': ends,
        'length': piece_lengths,
    })
    split_intervals_max = pd.concat([split_intervals_max, pieces_df], axis=0)
    return None

In [43]:
# Start from an empty table; new_intervals() appends the pieces of every interval
# flagged as longer than interval_length_max to it as a side effect.
split_intervals_max = pd.DataFrame(columns=['chrom', 'start', 'end', 'length'])
too_long = interval_list_df.loc[interval_list_df[interval_length_max], ['chrom', 'start', 'end', 'length']]
too_long.apply(new_intervals, axis=1, args=(interval_length_max,))
print(len(split_intervals_max))

2310


In [44]:
# Step 3: Merge df of shorter and split intervals
# reset_index() returns a new frame, so its result has to be assigned; the previous
# bare call discarded it and left the index labels from the concat in place.
# NOTE: re-running this cell without re-running Step 1 appends the split intervals again.
intervals_max = (
    pd.concat([intervals_max, split_intervals_max], axis=0)
    .sort_values(by=['chrom', 'start'])
    .reset_index(drop=True)
)
print(len(intervals_max))

3724


In [45]:
# Write the final intervals as a headerless, tab-separated three-column BED file;
# the file name records the maximum interval length used for splitting.
fname = f"CEN_capture_remove123_merged10_split{interval_length_max}.bed"
intervals_max.to_csv(
    fname,
    sep='\t',
    columns=['chrom', 'start', 'end'],
    header=False,
    index=False,
)