In [30]:
import time
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

In [31]:
RL_COLS = ['rl_15', 'rl_16', 'rl_17', 'rl_18', 'rl_19',
           'rl_20', 'rl_21', 'rl_22', 'rl_23', 'rl_24', 
           'rl_25', 'rl_26', 'rl_27', 'rl_28', 'rl_29', 
           'rl_30', 'rl_31']

In [32]:
df_start_sites = pd.read_csv("../data/positive_strand_start_sites.tsv", sep="\t")
df_reads = pd.read_csv("../data/positive_strand_reads.tsv", sep="\t")

df_start_sites.shape, df_reads.shape

((2109, 2), (4641652, 19))

In [33]:
def non_zero_read_lengths(row):
    return any([row[col] for col in RL_COLS])

df_reads['is_non_zero_length'] = df_reads.progress_apply(non_zero_read_lengths, axis=1)

100%|██████████| 4641652/4641652 [02:44<00:00, 28146.46it/s]


In [34]:
df_reads['is_non_zero_length'].value_counts()

False    4544928
True       96724
Name: is_non_zero_length, dtype: int64

In [9]:
df_reads = df_reads[df_reads['is_non_zero_length']]

df_reads.shape

(96724, 20)

In [39]:
class DataReducer:
    
    def __init__(self, df):
        self.df = df
        self.keep_set = set()

    def reduce(self, row):
        if not row['is_non_zero_length']:
            return
        
        loc_range = set(range(row['position'] - 15, row['position'] + 15))
        self.keep_set = self.keep_set.union(set(loc_range))
        
    def apply(self):
        self.df['keep'] = False
        
        print(str(len(self.keep_set)) + " elements in keep set.")
        
        self.keep_set = list(self.keep_set)
        
        for i in tqdm(range(len(self.keep_set))):
            self.df.loc[self.keep_set[i]-1, 'keep'] = True
        
        print("Done.")
        
        return self.df

In [40]:
data_reducer = DataReducer(df_reads)
_ = df_reads.progress_apply(data_reducer.reduce, axis=1)

100%|██████████| 4641652/4641652 [12:05<00:00, 6400.10it/s] 


In [45]:
df = data_reducer.df
keep_set = data_reducer.keep_set

print(str(len(keep_set)) + " elements in keep set.")

966012 elements in keep set.


In [46]:
df['keep'] = False

keep_set = list(keep_set)

for i in tqdm(range(len(keep_set))):
    df.loc[keep_set[i]-1, 'keep'] = True

100%|██████████| 966012/966012 [12:20<00:00, 1304.13it/s]


In [47]:
df = df[df['keep']]
df.shape

(966012, 21)

In [48]:
df

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_24,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,keep
20,21,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
21,22,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
22,23,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
23,24,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
24,25,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641588,4641589,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
4641589,4641590,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
4641590,4641591,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
4641591,4641592,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True


In [49]:
def calc_num_reads_at_pos(row):
    return sum([row[col] for col in RL_COLS])

df['num_reads'] = df.progress_apply(calc_num_reads_at_pos, axis=1)

100%|██████████| 966012/966012 [00:34<00:00, 27695.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['num_reads'] = df.progress_apply(calc_num_reads_at_pos, axis=1)


In [57]:
sum([1 if x in df['position'] else 0 for x in df_start_sites['start_position'].values])/df_start_sites.shape[0]

0.883357041251778

In [None]:
df.to_csv("../data/keep")