In [15]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Upper bound on the length of an output interval: rows whose 'length' exceeds
# this value are split into shorter pieces further down. The value is also
# embedded in the name of the output file.
interval_length_max = 250

## Load bed file regions into DataFrame

In [16]:
# Capture BED file (per the original note: produced with `merge -d 10`)
inFile = "merged10_capture.bed"

# The file has no header row; the four tab-separated fields are named here.
intervals_df = pd.read_csv(
    inFile,
    sep='\t',
    names=['chrom', 'start', 'end', 'length'],
    low_memory=False,
)
print(len(intervals_df))

# Chromosome labels, in order of first appearance in the file
chroms = intervals_df['chrom'].unique().tolist()
# print(chroms)

3014
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X']


In [17]:
# Gap between each interval and its neighbours on the same chromosome:
#   dist_before = start - (end of the previous interval on that chromosome)
#   dist_after  = (start of the next interval on that chromosome) - end
# The first / last interval of a chromosome has no such neighbour and gets 0.
#
# groupby(...).shift() keeps the original row index, so every distance lands on
# the row it belongs to even if a chromosome's rows are not contiguous in the
# file. Rows are still assumed to be ordered by position within a chromosome.
same_chrom = intervals_df.groupby('chrom', sort=False)

prev_end = same_chrom['end'].shift(1)      # NaN where there is no previous interval
next_start = same_chrom['start'].shift(-1)  # NaN where there is no next interval

intervals_df['dist_before'] = (intervals_df['start'] - prev_end).fillna(0).astype('int64')
intervals_df['dist_after'] = (next_start - intervals_df['end']).fillna(0).astype('int64')

# Discard rows whose length is exactly 1 (presumably placeholder entries --
# TODO confirm). This happens after the distances are computed, so the
# distances of the surviving rows may still refer to a discarded neighbour.
intervals_df = intervals_df.loc[intervals_df['length'] != 1].copy()
intervals_df

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
1,1,7917093,7917213,120,17,2649001
3,1,17345175,17345483,308,6778960,3465
4,1,17348948,17349280,332,3465,1157
5,1,17350437,17350599,162,1157,3614
6,1,17354213,17354390,177,3614,674
...,...,...,...,...,...,...
3008,X,130419122,130419470,348,1854,240
3009,X,130419710,130420052,342,240,328
3010,X,130420380,130420467,87,328,81
3011,X,130420548,130420848,300,81,5010509


## Split intervals

In [18]:
# Work on an independent copy so the flag column added below does not end up in intervals_df
interval_list_df = intervals_df.copy(deep=True)

### Identify regions that need splitting

In [19]:
# Flag the intervals that are longer than the configured maximum. The flag is
# stored in a column whose label is the value of interval_length_max itself.
interval_list_df[interval_length_max] = interval_list_df['length'].gt(interval_length_max)

# Build a new table in which no interval is longer than interval_length_max.
# Step 1: intervals that are already short enough are carried over unchanged
intervals_max = (
    interval_list_df
    .loc[~interval_list_df[interval_length_max], ['chrom', 'start', 'end', 'length']]
    .reset_index(drop=True)
)

In [20]:
# Step 2: split longer intervals into smaller ones within range
def split_intervals(length, interval_length_max):
    """Divide ``length`` into the fewest near-equal parts not exceeding a maximum.

    The number of parts is ``ceil(length / interval_length_max)``. Parts differ
    in size by at most one, with the larger parts listed first, and they always
    sum to ``length``.

    Only integer arithmetic is used, so the result is exact for arbitrarily
    large values (going through float division would lose precision).

    Parameters
    ----------
    length : int
        Total length to divide. Converted with ``int()``; must be >= 0.
    interval_length_max : int
        Largest allowed part. Converted with ``int()``; must be > 0.

    Returns
    -------
    list of int
        The part lengths; an empty list when ``length`` is 0.

    Raises
    ------
    ValueError
        If ``length`` is negative or ``interval_length_max`` is not positive.
    """
    # Normalise to plain Python ints (inputs may be numpy integer scalars).
    length = int(length)
    interval_length_max = int(interval_length_max)
    if interval_length_max <= 0:
        raise ValueError("interval_length_max must be a positive integer")
    if length < 0:
        raise ValueError("length must not be negative")

    # Ceiling division without floats.
    num_intervals = -(-length // interval_length_max)
    if num_intervals == 0:
        return []

    # Spread the remainder one unit at a time over the leading parts.
    base_interval_length, spillover = divmod(length, num_intervals)
    return ([base_interval_length + 1] * spillover
            + [base_interval_length] * (num_intervals - spillover))

def new_intervals(row, int_length):
    """Split one interval into consecutive, back-to-back sub-intervals.

    The sub-interval lengths come from ``split_intervals``. The first piece
    starts at ``row.start`` and every following piece starts where the previous
    one ended.

    Unlike the earlier version, this function no longer appends to the global
    ``split_intervals_max``; it returns the pieces and the caller combines them.

    Parameters
    ----------
    row : object with ``chrom``, ``start`` and ``length`` attributes
        For example a namedtuple from ``DataFrame.itertuples`` or a row Series.
    int_length : int
        Maximum sub-interval length, forwarded to ``split_intervals``.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end``, ``length``; one row per piece
        (no rows if ``split_intervals`` returns an empty list).
    """
    piece_lengths = split_intervals(row.length, int_length)

    piece_starts, piece_ends = [], []
    cursor = row.start
    for piece_length in piece_lengths:
        piece_starts.append(cursor)
        cursor += piece_length
        piece_ends.append(cursor)

    return pd.DataFrame({
        'chrom': [row.chrom] * len(piece_lengths),
        'start': piece_starts,
        'end': piece_ends,
        'length': piece_lengths,
    })


# Rows flagged as too long (the flag column is labelled with interval_length_max).
split_columns = ['chrom', 'start', 'end', 'length']
long_intervals = interval_list_df.loc[interval_list_df[interval_length_max], split_columns]

# Build every per-row frame first and concatenate once: growing a DataFrame
# with pd.concat inside a loop copies all accumulated rows on each iteration.
split_frames = [
    new_intervals(row, interval_length_max)
    for row in long_intervals.itertuples(index=False)
]
if split_frames:
    split_intervals_max = pd.concat(split_frames, axis=0, ignore_index=True)
else:
    # Nothing to split: keep the expected columns so later cells still work.
    split_intervals_max = pd.DataFrame(columns=split_columns)
print(len(split_intervals_max))

2310


In [21]:
# Step 3: Merge shorter and split intervals
# NOTE: intervals_max is overwritten here, so running this cell a second time
# appends the split intervals again; re-run the Step 1 cell first.
# 'chrom' is sorted with whatever ordering its dtype gives (string labels sort
# lexicographically, e.g. '10' before '2').
intervals_max = (
    pd.concat([intervals_max, split_intervals_max], axis=0)
    .sort_values(by=['chrom', 'start'])
    .reset_index(drop=True)  # the result must be kept: reset_index is not in-place
)
print(len(intervals_max))

# Save to file: headerless, tab-separated, first three columns only
fname = f"capture_merged10_split{interval_length_max}.bed"
intervals_max.to_csv(fname, sep='\t', index=False, header=False, columns=['chrom', 'start', 'end'])

3769
