In [1]:
import time
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
RL_COLS = ['rl_15', 'rl_16', 'rl_17', 'rl_18', 'rl_19',
           'rl_20', 'rl_21', 'rl_22', 'rl_23', 'rl_24', 
           'rl_25', 'rl_26', 'rl_27', 'rl_28', 'rl_29', 
           'rl_30', 'rl_31']

RL_COLS_REVERSED = [col for col in reversed(RL_COLS)]

In [3]:
df_start_sites = pd.read_csv("../data/positive_strand_start_sites.tsv", sep="\t")
df_reads = pd.read_csv("../data/positive_strand_reads.tsv", sep="\t")

df_start_sites.shape, df_reads.shape

((2109, 2), (4641652, 19))

In [7]:
def non_zero_read_lengths(row):
    return any([row[col] for col in RL_COLS])

def calc_num_reads_at_pos(row):
    return sum([row[col] for col in RL_COLS])

df_reads['is_non_zero_length'] = df_reads.progress_apply(non_zero_read_lengths, axis=1)
df_reads['num_reads'] = df_reads.progress_apply(calc_num_reads_at_pos, axis=1)

100%|██████████| 4641652/4641652 [02:50<00:00, 27272.75it/s]


In [5]:
df_reads['is_non_zero_length'].value_counts()

False    4544928
True       96724
Name: is_non_zero_length, dtype: int64

In [8]:
df_reads

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_24,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,num_reads
0,1,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
1,2,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
2,3,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
3,4,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
4,5,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641647,4641648,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
4641648,4641649,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
4641649,4641650,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0
4641650,4641651,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,0


In [19]:
class DataSelector:
    
    def __init__(self, df, threshold=0):
        self.df = df.copy()
        self.threshold = threshold
        
        self.df["is_selected"] = False
    
    def select(self, row):
        if not row['is_non_zero_length'] or row['num_reads'] < self.threshold:
            return
        
        largest_read_length = 0
        for col in RL_COLS_REVERSED:
            if row[col] > 0:
                largest_read_length = int(col.split('_')[1])
                break
                
                
        first_index = row['position'] - largest_read_length - 1
        last_index = row['position'] + 1
        
        self.df.loc[first_index:last_index, 'is_selected'] = True

In [13]:
test_df = df_reads.loc[0:100000]

In [14]:
test_df['is_non_zero_length'].value_counts()

False    97296
True      2705
Name: is_non_zero_length, dtype: int64

In [21]:
data_selector = DataSelector(test_df)
_ = test_df.progress_apply(data_selector.select, axis=1)

100%|██████████| 100001/100001 [00:00<00:00, 103914.43it/s]


In [24]:
data_selector.df['is_selected'].value_counts()

False    77384
True     22617
Name: is_selected, dtype: int64