In [1]:
import time
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
# Column names 'rl_15' through 'rl_31' (17 names, one per read length).
RL_COLS = ['rl_{}'.format(length) for length in range(15, 32)]

# The same names ordered from the longest read length down to the shortest.
RL_COLS_REVERSED = RL_COLS[::-1]

In [3]:
# Load both tab-separated inputs in one pass; the generator yields the frames
# in the same order as the names on the left-hand side.
df_start_sites, df_reads = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../data/positive_strand_start_sites.tsv",
        "../data/positive_strand_reads.tsv",
    )
)

# Quick sanity check of (rows, columns) for each frame.
df_start_sites.shape, df_reads.shape

((2109, 2), (4641652, 19))

In [48]:
def non_zero_read_lengths(row):
    """Return True when at least one RL_COLS entry of `row` is truthy (non-zero)."""
    per_length_values = [row[name] for name in RL_COLS]
    return any(per_length_values)

def calc_num_reads_at_pos(row):
    """Return the sum of `row`'s values over every column listed in RL_COLS."""
    total = 0
    for name in RL_COLS:
        total = total + row[name]
    return total

def largest_read_length(row):
    """Return the largest read length with a positive value in `row`, or 0.

    Columns are scanned in RL_COLS_REVERSED order; the result is the integer
    suffix of the first column name whose value is > 0. If no column
    qualifies, 0 is returned.
    """
    positive_lengths = (
        int(name.split('_')[1])
        for name in RL_COLS_REVERSED
        if row[name] > 0
    )
    # The generator is lazy, so the scan stops at the first positive column.
    return next(positive_lengths, 0)

# Derive the three per-row summary columns that later cells read:
# 'is_non_zero_length', 'num_reads' and 'largest_rl'. All of them must be
# created here, otherwise a fresh "Restart & Run All" stops with a KeyError at
# the first cell that looks one of them up.
#
# The first two are plain row-wise reductions over RL_COLS, so they are
# computed with vectorised pandas operations rather than a Python call per row.
# NOTE(review): assumes the RL_COLS columns are numeric with no NaNs (they
# render as integers in the frame display) -- confirm.
df_reads['is_non_zero_length'] = (df_reads[RL_COLS] != 0).any(axis=1)
df_reads['num_reads'] = df_reads[RL_COLS].sum(axis=1)

# Largest read length with a positive value in each row (row-wise helper call).
df_reads['largest_rl'] = df_reads.progress_apply(largest_read_length, axis=1)

100%|██████████| 4641652/4641652 [02:43<00:00, 28329.57it/s]


In [59]:
# Tally how many rows of df_reads have is_non_zero_length True vs. False.
df_reads['is_non_zero_length'].value_counts()

False    4544928
True       96724
Name: is_non_zero_length, dtype: int64

In [60]:
# Display the annotated reads frame (the notebook rendering is truncated to
# the first and last rows/columns).
df_reads

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,num_reads,largest_rl
0,1,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
1,2,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
2,3,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
3,4,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
4,5,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641647,4641648,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
4641648,4641649,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
4641649,4641650,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0
4641650,4641651,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,0,0


In [61]:
class DataSelector:
    """Flag rows of a private copy of a frame through an `is_selected` column.

    `select` is written to be called once per row (for example via
    `DataFrame.apply(selector.select, axis=1)`); every qualifying row marks a
    run of rows ending at itself in `self.df`.
    """

    def __init__(self, df, threshold=0):
        """Copy `df`, store `threshold`, and start with no row selected."""
        self.threshold = threshold

        working_copy = df.copy()
        working_copy["is_selected"] = False
        self.df = working_copy

    def select(self, row):
        """Mark the window that ends at `row` as selected; returns None.

        A row is ignored when its `is_non_zero_length` is falsy or its
        `num_reads` is below `self.threshold`. Otherwise the index labels
        `position - largest_rl - 1` through `position - 1` of `self.df` get
        `is_selected = True`.
        """
        if row['is_non_zero_length'] and not (row['num_reads'] < self.threshold):
            # NOTE(review): assumes the index label of a row equals
            # position - 1 -- confirm for frames that were re-indexed.
            window_end = row['position'] - 1
            # .loc label slices include both endpoints, so the window covers
            # largest_rl + 1 rows.
            # NOTE(review): confirm the extra leading row is intended.
            window_start = window_end - row['largest_rl']
            self.df.loc[window_start:window_end, 'is_selected'] = True

In [62]:
# Smaller working subset: .loc slices by label and includes both endpoints,
# so labels 0 through 100000 yield 100001 rows.
test_df = df_reads.loc[0:100000]

In [63]:
# Tally how many rows of the subset have is_non_zero_length True vs. False.
test_df['is_non_zero_length'].value_counts()

False    97296
True      2705
Name: is_non_zero_length, dtype: int64

In [64]:
# Build a selector over a copy of test_df (default threshold of 0) and feed it
# every row. The apply result is discarded: select() returns nothing and only
# updates data_selector.df.
data_selector = DataSelector(test_df)
_ = test_df.progress_apply(data_selector.select, axis=1)

100%|██████████| 100001/100001 [00:00<00:00, 113099.38it/s]


In [65]:
# Count how many rows ended up flagged as selected vs. not selected.
data_selector.df['is_selected'].value_counts()

False    78786
True     21215
Name: is_selected, dtype: int64

Test Selections

In [66]:
# Shorthand for the selector's annotated frame (same object, not a copy).
select_df = data_selector.df

In [67]:
# Spot-check: draw one random row that has reads and recompute its largest
# read length directly from the rl_* columns.
# NOTE(review): sample() is unseeded, so a different row is drawn on each run.
rs_row = select_df[select_df['is_non_zero_length']].sample(n=1)
rs_position = rs_row.position

# Call the helper instead of repeating its loop, and keep the result under its
# own name: assigning to `largest_read_length` at cell level would rebind the
# helper function to an int and break every later call to it.
rs_largest_rl = largest_read_length(rs_row.iloc[0])

rs_largest_rl

16

In [68]:
# Show the sampled row's position (a one-element Series keyed by the row's index label).
rs_position

90466    90467
Name: position, dtype: int64

In [70]:
# Rows labelled 3770 through 3799 (inclusive), limited to the columns that
# drive and record the selection.
select_df.loc[
    3770:3799,
    ["position", "largest_rl", "is_non_zero_length", "num_reads", "is_selected"],
]

Unnamed: 0,position,largest_rl,is_non_zero_length,num_reads,is_selected
3770,3771,0,False,0,False
3771,3772,0,False,0,False
3772,3773,0,False,0,False
3773,3774,0,False,0,False
3774,3775,0,False,0,True
3775,3776,0,False,0,True
3776,3777,0,False,0,True
3777,3778,0,False,0,True
3778,3779,0,False,0,True
3779,3780,0,False,0,True


In [54]:
# Read-length columns of the row whose index label is 3792.
select_df.loc[select_df.index == 3792, RL_COLS]

Unnamed: 0,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,rl_23,rl_24,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31
3792,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [80]:
# One-hot encode the 'letter' column and append the indicator columns.
# The frame is rebuilt from data_selector.df rather than from select_df
# itself so the cell can be re-run: joining onto a frame that already carries
# the indicator columns raises a ValueError for the overlapping column names.
select_df = data_selector.df.join(pd.get_dummies(data_selector.df['letter']))

In [83]:
# Columns for a sample window: every read-length column followed by the four
# one-hot letter columns.
select_cols = [*RL_COLS, 'A', 'C', 'G', 'T']

def select_df_sample(df, idx, window_len, select_cols):
    """Return the `select_cols` columns for the rows around label `idx`.

    Rows are taken by index label from `idx - window_len` through
    `idx + window_len`; both ends are included because `.loc` slices are
    label-inclusive.
    """
    first_label = idx - window_len
    last_label = idx + window_len
    return df.loc[first_label:last_label, select_cols]

In [87]:
# Inspect the window of 15 rows on either side of index label 3790
# (labels 3775 through 3805, both included).
select_df_sample(select_df, 3790, 15, select_cols)

Unnamed: 0,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,rl_23,rl_24,...,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,A,C,G,T
3775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3779,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
