In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

interval_length_min = 50
interval_length_max = 100
interval_length_max2 = 160
interval_length_max3 = 500

## Load bed file regions into DataFrame

In [2]:
inFile = "merged10_capture.bed" # merge -d 10 capture bed
intervals_df = pd.read_csv(inFile, sep='\t', names=['chrom', 'start', 'end', 'length'], low_memory=False)
print(len(intervals_df))

3014


In [3]:
chroms = intervals_df['chrom'].unique().tolist()
print(chroms)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X']


In [4]:
# chroms = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']
# intervals_df['length'] = intervals_df['end'] - intervals_df['start']

In [5]:
dist_list = []
for chrom in chroms:
    dist_list.append(0)
    chrom_ints = intervals_df[intervals_df['chrom'] == chrom]
    chrom_starts = chrom_ints['start'].to_list()[1:]
    chrom_ends = chrom_ints['end'].to_list()[:-1]
    for i in range(len(chrom_starts)):
        dist_list.append(chrom_starts[i] - chrom_ends[i])

print(len(dist_list))
print(len(intervals_df))

3014
3014


In [6]:
# interval_list_df = intervals_df[intervals_df['chrom'].isin(chroms)]
intervals_df['dist_before'] = dist_list
dist_after = dist_list[1:].copy()
dist_after.append(0) # returns None, happens inplace
intervals_df['dist_after'] = dist_after
# intervals_df

In [7]:
intervals_df.drop(intervals_df[intervals_df['length'] == 1].index, inplace=True)
intervals_df

Unnamed: 0,chrom,start,end,length,dist_before,dist_after
1,1,7917093,7917213,120,17,2649001
3,1,17345175,17345483,308,6778960,3465
4,1,17348948,17349280,332,3465,1157
5,1,17350437,17350599,162,1157,3614
6,1,17354213,17354390,177,3614,674
...,...,...,...,...,...,...
3008,X,130419122,130419470,348,1854,240
3009,X,130419710,130420052,342,240,328
3010,X,130420380,130420467,87,328,81
3011,X,130420548,130420848,300,81,5010509


## Visualise intervals

In [9]:
print(interval_list_df['length'].min())
print(interval_list_df['length'].mean())
print(interval_list_df['length'].max())

NameError: name 'interval_list_df' is not defined

In [None]:
f, ax = plt.subplots(figsize=(7, 5))

sns.distplot(interval_list_df['length'], ax=ax, kde=False)

ax.set_title('Interval lengths', fontsize = 18)
ax.set_ylabel('counts', fontsize=18)
ax.set_xlabel('size (b)', fontsize = 18)


In [None]:
f, ax = plt.subplots(figsize=(7, 5))

sns.distplot(interval_list_df.loc[interval_list_df['length'] < 500, 'length'], ax=ax, kde=False)

ax.set_title('Interval lengths', fontsize = 18)
ax.set_ylabel('counts', fontsize=18)
ax.set_xlabel('size (b)', fontsize = 18)


In [None]:
f, ax = plt.subplots(figsize=(7, 5))

sns.distplot(interval_list_df.loc[interval_list_df['length'] < 100, 'length'], ax=ax, kde=False)

ax.set_title('Interval lengths', fontsize = 18)
ax.set_xlim(0, 100)
ax.set_ylabel('counts', fontsize=18)
ax.set_xlabel('size (b)', fontsize = 18)


In [None]:
f, ax = plt.subplots(figsize=(10,7))

sns.distplot(interval_list_df['dist_before'], ax=ax, kde=False)

ax.set_title('Distance before', fontsize = 18)
ax.set_ylabel('counts', fontsize=18)
ax.set_xlabel('size (b)', fontsize = 18)

plt.show()

In [None]:
interval_list_df.groupby('dist_before').count()[:20]

In [None]:
interval_list_df.groupby('length').count()[:20]

In [None]:
interval_list_df.groupby('length').count()[-10:]

In [None]:
interval_list_df[interval_list_df['length'] == 31915] # COL2A1 small exons close to each other

## Split intervals

In [10]:
interval_list_df = intervals_df.copy()

### Identify regions need splitting

In [11]:
interval_list_df[interval_length_min] = interval_list_df['length'] > interval_length_min
interval_list_df[interval_length_max] = interval_list_df['length'] > interval_length_max
interval_list_df[interval_length_max2] = interval_list_df['length'] > interval_length_max2
interval_list_df[interval_length_max3] = interval_list_df['length'] > interval_length_max3
interval_list_df

Unnamed: 0,chrom,start,end,length,dist_before,dist_after,50,100,160,500
1,1,7917093,7917213,120,17,2649001,True,True,False,False
3,1,17345175,17345483,308,6778960,3465,True,True,True,False
4,1,17348948,17349280,332,3465,1157,True,True,True,False
5,1,17350437,17350599,162,1157,3614,True,True,True,False
6,1,17354213,17354390,177,3614,674,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...
3008,X,130419122,130419470,348,1854,240,True,True,True,False
3009,X,130419710,130420052,342,240,328,True,True,True,False
3010,X,130420380,130420467,87,328,81,True,False,False,False
3011,X,130420548,130420848,300,81,5010509,True,True,True,False


### Copy smaller intervals into new dataframe

In [12]:
# Create new table for the intervals that are optimised to be between less than 100 bp

# Step 1: copy over smaller intervals intact
intervals_max_100 = interval_list_df.loc[interval_list_df[interval_length_max] == False, ['chrom', 'start', 'end', 'length']].reset_index(drop=True)
intervals_max_160 = interval_list_df.loc[interval_list_df[interval_length_max2] == False, ['chrom', 'start', 'end', 'length']].reset_index(drop=True)
intervals_max_500 = interval_list_df.loc[interval_list_df[interval_length_max3] == False, ['chrom', 'start', 'end', 'length']].reset_index(drop=True)

### Split larger intervals

In [13]:
# step 2: split longer intervals into smaller ones within range
def split_intervals(length, interval_length_max):
    num_intervals = int(length / interval_length_max)
    leftover = length % interval_length_max
#     print(length, num_intervals, interval_length_max, leftover)
    if leftover == 0:
        interval_lengths = [interval_length_max] * num_intervals
    else:
        num_intervals += 1
        base_interval_length = int(length / num_intervals)
        spillover = length % num_intervals
#         print(length, num_intervals, base_interval_length, spillover)
        interval_lengths = [base_interval_length] * num_intervals
        for i in range(spillover):
            interval_lengths[i] += 1
    return interval_lengths

In [None]:
print(split_intervals(381, 100))

In [18]:
def new_intervals(row, int_length):
    new_interval_lengths = split_intervals(row.length, int_length) # list of interval lengths
    chroms = [row.chrom] * len(new_interval_lengths)

    for i in range(len(new_interval_lengths)):
        if i == 0:
            starts = [row.start]
            ends = [starts[0] + new_interval_lengths[i]]
        else:
            starts.append(ends[i-1]) # sum(new_interval_lengths[:i])
            ends.append(starts[i] + new_interval_lengths[i])
#     print(row.start, row.end)
#     print("starts", starts)
#     print("lengths", new_interval_lengths)
#     print("ends", ends)
#     print(ends[-1] == row.end)
    new_intervals = pd.DataFrame({'chrom': chroms, 'start': starts, 'end': ends, 'length': new_interval_lengths})
    global split_intervals_max_500
    split_intervals_max_500 = pd.concat([split_intervals_max_500, new_intervals], axis=0)
    return None

In [None]:
test_df = interval_list_df.loc[interval_list_df['above100'], ['chrom', 'start', 'end', 'length']].iloc[5:10]
test_df

split_intervals_max_100 = pd.DataFrame(columns=['chrom', 'start', 'end', 'length'])
test_df.apply(lambda row: new_intervals(row, interval_length_max), axis=1)
split_intervals_max_100

In [15]:
split_intervals_max_100 = pd.DataFrame(columns=['chrom', 'start', 'end', 'length'])
interval_list_df.loc[interval_list_df[interval_length_max], ['chrom', 'start', 'end', 'length']].apply(lambda row: new_intervals(row, interval_length_max), axis=1)
print(len(split_intervals_max_100))

7981


In [17]:
split_intervals_max_160 = pd.DataFrame(columns=['chrom', 'start', 'end', 'length'])
interval_list_df.loc[interval_list_df[interval_length_max2], ['chrom', 'start', 'end', 'length']].apply(lambda row: new_intervals(row, interval_length_max2), axis=1)
print(len(split_intervals_max_160))

4800


In [19]:
split_intervals_max_500 = pd.DataFrame(columns=['chrom', 'start', 'end', 'length'])
interval_list_df.loc[interval_list_df[interval_length_max3], ['chrom', 'start', 'end', 'length']].apply(lambda row: new_intervals(row, interval_length_max3), axis=1)
print(len(split_intervals_max_500))

604


## Merge together the originally smaller and the now split intervals

In [20]:
intervals_max_100 = pd.concat([intervals_max_100, split_intervals_max_100], axis=0)
intervals_max_100.sort_values(by=['chrom', 'start'], inplace=True)
intervals_max_100.reset_index(drop=True)
# Save to file
intervals_max_100.to_csv('capture_merged10_split100.bed', sep='\t', index=False, header=False, columns=['chrom', 'start', 'end'])
print(len(intervals_max_100))

8064


In [21]:
intervals_max_160 = pd.concat([intervals_max_160, split_intervals_max_160], axis=0)
intervals_max_160.sort_values(by=['chrom', 'start'], inplace=True)
intervals_max_160.reset_index(drop=True)
intervals_max_160.to_csv('capture_merged10_split160.bed', sep='\t', index=False, header=False, columns=['chrom', 'start', 'end'])
print(len(intervals_max_160))

5426


In [22]:
intervals_max_500 = pd.concat([intervals_max_500, split_intervals_max_500], axis=0)
intervals_max_500.sort_values(by=['chrom', 'start'], inplace=True)
intervals_max_500.reset_index(drop=True)
intervals_max_500.to_csv('capture_merged10_split500.bed', sep='\t', index=False, header=False, columns=['chrom', 'start', 'end'])
print(len(intervals_max_500))

2685


In [None]:
def split_intervals_min(length, interval_length_min):
    num_intervals = int(length / interval_length_min)
    spillover = length % interval_length_min
    base_interval_length = interval_length_min + int(spillover / num_intervals)
    leftover = spillover % num_intervals # which is always less than the number of intervals
    interval_lengths = [base_interval_length] * num_intervals # list of length num_intervals
    print(length, "num int", num_intervals, "spill", spillover, "extra", int(spillover / num_intervals), "left", leftover)
    for i in range(leftover):
        interval_lengths[i] += 1
    return interval_lengths