In [30]:
import time
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import matplotlib.pyplot as plt

In [31]:
RL_COLS = ['rl_15', 'rl_16', 'rl_17', 'rl_18', 'rl_19',
           'rl_20', 'rl_21', 'rl_22', 'rl_23', 'rl_24', 
           'rl_25', 'rl_26', 'rl_27', 'rl_28', 'rl_29', 
           'rl_30', 'rl_31']

In [32]:
df_start_sites = pd.read_csv("../data/positive_strand_start_sites.tsv", sep="\t")
df_reads = pd.read_csv("../data/positive_strand_reads.tsv", sep="\t")

df_start_sites.shape, df_reads.shape

((2109, 2), (4641652, 19))

In [33]:
def non_zero_read_lengths(row):
    return any([row[col] for col in RL_COLS])

df_reads['is_non_zero_length'] = df_reads.progress_apply(non_zero_read_lengths, axis=1)

100%|██████████| 4641652/4641652 [02:44<00:00, 28146.46it/s]


In [34]:
df_reads['is_non_zero_length'].value_counts()

False    4544928
True       96724
Name: is_non_zero_length, dtype: int64

In [9]:
df_reads = df_reads[df_reads['is_non_zero_length']]

df_reads.shape

(96724, 20)

In [39]:
class DataReducer:
    
    def __init__(self, df, window_len=15, verbose=True):
        self.df = df
        self.keep_set = set()
        self.verbose = verbose
        self.window_len = window_len

    def reduce(self, row):
        if not row['is_non_zero_length']:
            return
        
        loc_range = set(range(row['position'] - self.window_len, 
                              row['position'] + self.window_len + 1))
        
        self.keep_set = self.keep_set.union(set(loc_range))
        
    def apply(self):
        self.df['keep'] = False
        
        if self.verbose:
            print(str(len(self.keep_set)) + " elements in keep set.")
        
        self.keep_set = list(self.keep_set)
        
        for i in tqdm(range(len(self.keep_set))):
            self.df.loc[self.keep_set[i]-1, 'keep'] = True
        
        if self.verbose:
            print("Done.")
        
        return self.df

In [40]:
data_reducer = DataReducer(df_reads)
_ = df_reads.progress_apply(data_reducer.reduce, axis=1)

100%|██████████| 4641652/4641652 [12:05<00:00, 6400.10it/s] 


In [45]:
df = data_reducer.df
keep_set = data_reducer.keep_set

print(str(len(keep_set)) + " elements in keep set.")

966012 elements in keep set.


In [46]:
df['keep'] = False

keep_set = list(keep_set)

for i in tqdm(range(len(keep_set))):
    df.loc[keep_set[i]-1, 'keep'] = True

100%|██████████| 966012/966012 [12:20<00:00, 1304.13it/s]


In [47]:
df = df[df['keep']]
df.shape

(966012, 21)

In [48]:
df

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_24,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,keep
20,21,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
21,22,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
22,23,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
23,24,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
24,25,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641588,4641589,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
4641589,4641590,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
4641590,4641591,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
4641591,4641592,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True


In [72]:
df[df['is_non_zero_length']].head(30)

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,keep,num_reads
35,36,C,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,1
62,63,A,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,4
69,70,C,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,1
70,71,T,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,True,True,1
78,79,T,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,1
90,91,G,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,1
91,92,T,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,2
92,93,G,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,True,True,2
95,96,T,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,1
96,97,A,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,True,True,3


In [71]:
df_start_sites

Unnamed: 0,gene,start_position
0,b0001,190
1,b0002,337
2,b0003,2801
3,b0004,3734
4,b0005,5234
...,...,...
2104,b4398,4636007
2105,b4399,4636696
2106,b4400,4638178
2107,b4402,4640402


In [49]:
def calc_num_reads_at_pos(row):
    return sum([row[col] for col in RL_COLS])

df['num_reads'] = df.progress_apply(calc_num_reads_at_pos, axis=1)

100%|██████████| 966012/966012 [00:34<00:00, 27695.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['num_reads'] = df.progress_apply(calc_num_reads_at_pos, axis=1)


In [57]:
sum([1 if x in df['position'] else 0 for x in df_start_sites['start_position'].values])/df_start_sites.shape[0]

0.883357041251778

In [58]:
df.to_csv("../data/keep_data.csv")

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,keep,num_reads
20,21,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
21,22,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
22,23,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23,24,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
24,25,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641588,4641589,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
4641589,4641590,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
4641590,4641591,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
4641591,4641592,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0


In [69]:
pos = 36

loc_left = list(range(pos - 15, pos))
loc_right = list(range(pos + 1, pos + 15))

x_left = df[df['position'].isin(loc_left)]
x_right = df[df['position'].isin(loc_right)]

str_left = ''.join(x_right['letter'].values)
str_right = ''.join(x_right['letter'].values)

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,keep,num_reads
20,21,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
21,22,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
22,23,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23,24,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
24,25,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
25,26,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
26,27,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
27,28,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
28,29,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
29,30,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0


In [79]:
l = list(np.concatenate([x_left['num_reads'].values, x_right['num_reads'].values]))
l = l + [str_left, str_right]

In [82]:
loc_left

[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]

In [87]:


col_names

['pos-15',
 'pos-14',
 'pos-13',
 'pos-12',
 'pos-11',
 'pos-10',
 'pos-9',
 'pos-8',
 'pos-7',
 'pos-6',
 'pos-5',
 'pos-4',
 'pos-3',
 'pos-2',
 'pos-1',
 'pos+0',
 'pos+1',
 'pos+2',
 'pos+3',
 'pos+4',
 'pos+5',
 'pos+6',
 'pos+7',
 'pos+8',
 'pos+9',
 'pos+10',
 'pos+11',
 'pos+12',
 'pos+13',
 'pos+14',
 'substr_left',
 'substr_right']

In [103]:
class SignalSampler:
    
    def __init__(self, df, window_len=15, verbose=True):
        self.df = df
        self.verbose = verbose
        self.window_len = window_len
        
        indices = list(range(-self.window_len, self.window_len))
        col_names = ['pos+' + str(x) if x >= 0 else 'pos' + str(x) for x in indices]
        self.col_names = col_names + ['substr_left', 'substr_right']
        
    def _create_feat_vec(self, row, opt="null"):
        if opt == "null":
            return [np.nan for _ in range(2*self.window_len + 2)]
        elif opt == "sig":            
            loc_left = list(range(row['position'] - self.window_len, row['position']))
            loc_right = list(range(row['position'] + 1, row['position'] + self.window_len))
            
            df_left = self.df[self.df['position'].isin(loc_left)]
            df_right = self.df[self.df['position'].isin(loc_right)]
            
            str_left = ''.join(df_left['letter'].values)
            str_right = ''.join(df_right['letter'].values)
            
            left_reads = list(df_left['num_reads'].values)
            right_reads = list(df_right['num_reads'].values)
            
            all_reads = left_reads + [row['num_reads']] + right_reads
            
            feat_vec = all_reads + [str_left, str_right]
            return feat_vec
        
        raise ValueError("opt can only be 'null' or 'sig', not", str(opt), ".")    
        
    def reduce(self, row):
        if not row['is_non_zero_length']:
            feat_vec = self._create_feat_vec(row, opt="null")
        else:
            feat_vec = self._create_feat_vec(row, opt="sig")
        
        err_str = str(row['position']) + " length is " + str(len(feat_vec))
        assert len(feat_vec) == len(self.col_names), feat_vec
        
        return feat_vec

In [114]:
test_df = df[:10000].sort_values(['position'])

In [115]:
sig_sampler = SignalSampler(test_df)

print(len(sig_sampler.col_names))

32


In [117]:
test_df[sig_sampler.col_names] = test_df.progress_apply(sig_sampler.reduce, axis=1, result_type="expand")

100%|██████████| 10000/10000 [00:01<00:00, 8085.55it/s]


In [124]:
test_df[test_df['substr_left'].notnull()]

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,pos+7,pos+8,pos+9,pos+10,pos+11,pos+12,pos+13,pos+14,substr_left,substr_right
35,36,C,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ACGGGCAATATGTCT,TGTGTGGATTAAAA
62,63,A,4,0,0,0,0,0,0,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,AAAAAAGAGTGTCTG,TAGCAGCTTCTGAA
69,70,C,0,0,1,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,AGTGTCTGATAGCAG,TTCTGAACTGGTTA
70,71,T,0,0,0,0,1,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,GTGTCTGATAGCAGC,TCTGAACTGGTTAC
78,79,T,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,TAGCAGCTTCTGAAC,GGTTACCTGCCGTG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22900,22901,A,1,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ATCCGCGCGCTGGGC,AAATCATCGGCAAC
22926,22927,C,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,CAACGGTCACCTGCA,AAAGGCGCGAAGCC
22940,22941,C,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ACAAAGGCGCGAAGC,AGTTCACTGGTGCG
22955,22956,T,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CAGTTCACTGGTGCG,TGACTGCCGTTCTG


In [126]:
sig_sampler = SignalSampler(df)
print(len(sig_sampler.col_names))
df[sig_sampler.col_names] = df.progress_apply(sig_sampler.reduce, axis=1, result_type="expand")

32


100%|██████████| 966012/966012 [35:41<00:00, 451.18it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [130]:
df.to_csv("../data/positive_strand_with_features.csv")

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,pos+7,pos+8,pos+9,pos+10,pos+11,pos+12,pos+13,pos+14,substr_left,substr_right
35,36,C,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ACGGGCAATATGTCT,TGTGTGGATTAAAA
62,63,A,4,0,0,0,0,0,0,0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,AAAAAAGAGTGTCTG,TAGCAGCTTCTGAA
69,70,C,0,0,1,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,AGTGTCTGATAGCAG,TTCTGAACTGGTTA
70,71,T,0,0,0,0,1,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,GTGTCTGATAGCAGC,TCTGAACTGGTTAC
78,79,T,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,TAGCAGCTTCTGAAC,GGTTACCTGCCGTG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641397,4641398,T,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GTGATGGTCTATTGC,ATCAATTAGCAACA
4641439,4641440,G,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CCGGCGAAAAGTGAT,CAACGGCAGACCAA
4641489,4641490,A,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CGAACGAGCCATGAC,TTGCTGACGACTCT
4641532,4641533,G,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GACATAAAACTGGTC,ACTGGTTACAACAA


In [110]:
df[df['position'].isin(range(23081-15, 23081+15))]

Unnamed: 0,position,letter,rl_15,rl_16,rl_17,rl_18,rl_19,rl_20,rl_21,rl_22,...,rl_25,rl_26,rl_27,rl_28,rl_29,rl_30,rl_31,is_non_zero_length,keep,num_reads
23065,23066,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23066,23067,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23067,23068,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23068,23069,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23069,23070,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23070,23071,C,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23071,23072,G,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23072,23073,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23073,23074,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
23074,23075,A,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,False,True,0
