# Cleavage products
1. List pri-miRNAs with redundant (identical) construct sequences
2. Build table
3. Add redundant pri-miRNAs

In [1]:
import time

# Stamp the notebook with the author name and the date of this run.
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
# Parenthesised form: prints the same text on Python 2 and is valid on Python 3.
print('Last revised by %s at %s.' % (name, today))

Last revised by Seungchan Baek at 2021-01-05.


In [2]:
# Project root directory; the relative paths used in later cells
# ('supplementary/...', 'resources/...', 'cleaved/...') are presumably resolved against it.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
# IPython magic: change the kernel's working directory to `home`.
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
from __future__ import division
import pandas as pd
import numpy as np
import re
import gzip
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
%matplotlib inline

### 1. List pri-miRNAs with redundant (identical) construct sequences

In [4]:
## Pri-miRNAs whose 125-mer construct sequence is identical to another pri-miRNA's.
# Such rows are recognised by a Note column containing the text 'Same construct';
# the last whitespace-separated token of the note is taken as the representative.
s2 = 'supplementary/201012_s2_pri-construct.csv'
s2tbl = pd.read_csv(s2, index_col=0, header=1)
# str(x) lets missing notes (NaN) fall through as non-matching.
has_same_construct = s2tbl['Note'].apply(lambda x: 'Same construct' in str(x))
s2redund = s2tbl[has_same_construct]
# redundant pri-miRNA -> representative pri-miRNA
redundRepre = {pri: note.split()[-1]
               for pri, note in dict(s2redund['Note']).items()}
# Parenthesised form: same output on Python 2, valid syntax on Python 3.
print(len(redundRepre))

30


In [5]:
# Invert redundRepre: representative pri-miRNA -> list of the pri-miRNAs
# that point to it.
repreRedund = defaultdict(list)
for mir, repre in redundRepre.items():
    repreRedund[repre].append(mir)

In [7]:
# Load the previously written cleavage-product table. header=1 takes the column
# names from the second line pandas parses (the line before it is presumably the
# table title - confirm against the file).
prev = pd.read_csv('supplementary/201012_s4_cleavage-product.csv', header=1)
prev.head(1)

Unnamed: 0,Pri-miRNA,rstart,rend,pilot-1,pilot-2,set1-1,set1-2,set2-1,set2-2,set3-1,set3-2,set4-1,set4-2,set5-1,set5-2
0,hsa-let-7a-1,1,8,0,0,2,2,4,0,0,0,0,0,0,0


### 2. Build table

In [8]:
def cigar_to_len(cigar):
    """Return the number of reference positions spanned by a CIGAR string.

    Only operations that consume the reference are counted: M, D, N, = and X.
    I, S, H and P do not advance along the reference and are ignored.

    Parameters
    ----------
    cigar : str
        CIGAR string such as '20M1D5M'. A string without any
        '<count><op>' pair (for example '*' or '') yields 0.

    Returns
    -------
    int
        Summed length of the reference-consuming operations.
    """
    ref_consuming = frozenset('MDN=X')
    # Parse '<count><op>' pairs directly. Splitting on '[A-Z]' mis-parsed the
    # '=' operation, and a substring test such as `op in 'DM'` would also
    # accept an empty operation string.
    pairs = re.findall(r'(\d+)([MIDNSHP=X])', cigar)
    return sum(int(count) for count, op in pairs if op in ref_consuming)

In [9]:
# Names of the cleavage libraries, in the column order used for the tables.
allclvs = ['pilot-1', 'pilot-2',
           'set1-1', 'set1-2', 'set2-1', 'set2-2', 'set3-1', 'set3-2',
           'set4-1', 'set4-2', 'set5-1', 'set5-2']
# Per-library offset subtracted from alignment coordinates (presumably an
# adapter length): 0 for the two pilot libraries, 20 for any other library.
adaptlen = defaultdict(lambda: 20, {'pilot-1': 0, 'pilot-2': 0})

#### Multiply mapped reads

In [10]:
# Count cleavage products supported by multiply mapped reads, per library.
#   d     : library -> Series of alignment counts indexed by (Pri-miRNA, rstart, rend)
#   dupls : (Pri-miRNA, rstart, rend) -> number of retained alignments of the
#           first read in that group; later libraries overwrite earlier entries
d,dupls = {},{}
for lib in allclvs[2:]:  # the first two (pilot) libraries are not processed here
    summaryfile = 'cleaved/alignments/%s.multi.txt.gz' % lib
    # Close the gzip handle as soon as the table is loaded instead of leaking
    # one open file per library.
    with gzip.open(summaryfile, 'rb') as handle:
        tbl = pd.read_table(handle, sep=' ',
                            names=['Read','Pri-miRNA', 'start_R2', 'cigar', 'start_R1'])
    offset = adaptlen[lib]
    # Shift coordinates by the library offset and clamp them to 1..125.
    tbl['rstart'] = tbl['start_R1'].apply(lambda x: max(1, x-offset))
    tbl['rlen'] = tbl['cigar'].apply(cigar_to_len)
    tbl['rend_tmp'] = tbl['start_R2']+tbl['rlen']-1
    tbl['rend'] = tbl['rend_tmp'].apply(lambda x: min(125, x-offset))
    # Keep only products with rend - rstart >= 10.
    tbl = tbl[(tbl['rend']-tbl['rstart'])>=10]
    # Number of retained alignment rows per read name.
    multicnts = Counter(tbl['Read'])
    # Group once and reuse the grouping for both the multiplicity and the counts.
    grouped = tbl.groupby(['Pri-miRNA','rstart','rend'])
    for (pri,st,en),grp in grouped:
        mcnt = multicnts[grp['Read'].tolist()[0]]
        dupls[(pri,int(st),int(en))] = mcnt
    cnttbl = grouped.size()
    d[lib] = cnttbl
clvtbl = pd.DataFrame(d).fillna(0).astype(int).reset_index()
# Rows spanning the whole 1-125 range (presumably uncleaved constructs) are removed.
unclv = clvtbl[(clvtbl['rstart']==1)&(clvtbl['rend']==125)]
clvtbl = clvtbl.drop(unclv.index).set_index(['Pri-miRNA','rstart','rend'])

In [11]:
# Divide every row of counts by the multiplicity stored in `dupls` for its
# (Pri-miRNA, rstart, rend) key. This is done as one index-aligned division:
# writing float rows one at a time into an integer-typed copy is slow and
# depends on implicit dtype upcasting during assignment.
weights = pd.Series([dupls[key] for key in clvtbl.index], index=clvtbl.index)
clvtbl2 = clvtbl.div(weights, axis=0)

In [18]:
# Give each pri-miRNA listed under a representative in `repreRedund` a copy of
# that representative's rows. The copies are collected and concatenated once:
# DataFrame.append inside a loop re-copies the growing table on every call and
# no longer exists in pandas >= 2.0.
# NOTE: running this cell twice adds the copies twice, because clvtbl2 is
# overwritten with its own output.
resetind = clvtbl2.reset_index()
frames = [resetind]
for pri in set(resetind['Pri-miRNA']):
    if pri in repreRedund:
        for pri2 in repreRedund[pri]:
            new = resetind[resetind['Pri-miRNA']==pri].copy()
            new['Pri-miRNA'] = pri2
            frames.append(new)
resetind = pd.concat(frames)
clvtbl2 = resetind.set_index(['Pri-miRNA', 'rstart', 'rend']).sort_index()
clvtbl2.head()

Unnamed: 0,Pri-miRNA,rstart,rend,set1-1,set1-2,set2-1,set2-2,set3-1,set3-2,set4-1,set4-2,set5-1,set5-2
0,hsa-mir-105-1,19,51,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hsa-mir-105-1,20,47,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hsa-mir-105-1,20,51,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hsa-mir-105-1,20,52,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hsa-mir-105-1,20,53,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# `prev` holds Pri-miRNA/rstart/rend as ordinary columns. If clvtbl2 still
# carries them in its index, move them back into columns first; otherwise
# ignore_index=True would discard the keys of every clvtbl2 row.
multi = clvtbl2 if 'Pri-miRNA' in clvtbl2.columns else clvtbl2.reset_index()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
clvtbl = pd.concat([prev, multi], ignore_index=True)
# drop_duplicates keeps the first occurrence, so rows from `prev` win over
# multiply mapped rows with the same (Pri-miRNA, rstart, rend).
clvtbl = clvtbl.drop_duplicates(['Pri-miRNA','rstart','rend']).fillna(0)

In [21]:
# Write the current `clvtbl` to CSV (the index is written too, pandas default).
clvtbl.to_csv('resources/210105_clvs.csv')

In [17]:
# Count cleavage products per library from the alignment summaries.
#   d : library -> Series of alignment counts indexed by (Pri-miRNA, rstart, rend)
d = {}
for lib in allclvs:
    summaryfile = 'cleaved/alignments/%s.txt.gz' % lib
    # Close the gzip handle as soon as the table is loaded instead of leaking
    # one open file per library.
    with gzip.open(summaryfile, 'rb') as handle:
        tbl = pd.read_table(handle, sep='\t',
                            names=['Pri-miRNA', 'start_R2', 'cigar', 'start_R1'])
    offset = adaptlen[lib]
    # Shift coordinates by the library offset and clamp them to 1..125.
    tbl['rstart'] = tbl['start_R1'].apply(lambda x: max(1, x-offset))
    tbl['rlen'] = tbl['cigar'].apply(cigar_to_len)
    tbl['rend_tmp'] = tbl['start_R2']+tbl['rlen']-1
    tbl['rend'] = tbl['rend_tmp'].apply(lambda x: min(125, x-offset))
    cnttbl = tbl.groupby(['Pri-miRNA', 'rstart', 'rend']).size()
    d[lib] = cnttbl
# Products missing from a library get a count of 0.
clvtbl = pd.DataFrame(d).fillna(0).astype(int).reset_index()
# Rows spanning the whole 1-125 range (presumably uncleaved constructs) are removed.
unclv = clvtbl[(clvtbl['rstart']==1)&(clvtbl['rend']==125)]
clvtbl = clvtbl.drop(unclv.index).set_index(['Pri-miRNA','rstart','rend'])

In [None]:
# Number of distinct pri-miRNAs in the table (first index level).
# Parenthesised form: same output on Python 2, valid syntax on Python 3.
print(len(set(clvtbl.index.get_level_values(0))))

### 3. Add redundant pri-miRNAs

In [10]:
# Give each pri-miRNA listed under a representative in `repreRedund` a copy of
# that representative's rows. The copies are collected and concatenated once:
# DataFrame.append inside a loop re-copies the growing table on every call and
# no longer exists in pandas >= 2.0.
# NOTE: running this cell twice adds the copies twice, because clvtbl is
# overwritten with its own output.
resetind = clvtbl.reset_index()
frames = [resetind]
for pri in set(resetind['Pri-miRNA']):
    if pri in repreRedund:
        for pri2 in repreRedund[pri]:
            new = resetind[resetind['Pri-miRNA']==pri].copy()
            new['Pri-miRNA'] = pri2
            frames.append(new)
resetind = pd.concat(frames)
clvtbl = resetind.set_index(['Pri-miRNA', 'rstart', 'rend']).sort_index()
# Number of distinct pri-miRNAs after adding the copies.
print(len(set(clvtbl.index.get_level_values(0))))
clvtbl.head()

1881


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pilot-1,pilot-2,set1-1,set1-2,set2-1,set2-2,set3-1,set3-2,set4-1,set4-2,set5-1,set5-2
Pri-miRNA,rstart,rend,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
hsa-let-7a-1,1,8,0,0,2,2,4,0,0,0,0,0,0,0
hsa-let-7a-1,1,9,0,0,6,2,0,0,0,0,0,0,0,0
hsa-let-7a-1,1,10,55,26,4,4,0,1,0,0,0,0,0,0
hsa-let-7a-1,1,11,19,18,1,1,0,0,0,0,0,0,0,0
hsa-let-7a-1,1,12,606,314,5,3,0,0,0,0,0,0,0,0


In [11]:
# Save the table under resources/; the following cell copies this file into
# supplementary/ with a title line prepended.
clvtbl.to_csv('resources/201012_s4_cleavage-product.csv')

In [12]:
# Write the supplementary table: a title line followed by four empty lines,
# then the CSV saved under resources/, copied line by line.
description = 'Supplementary Table 4. Cleavage products\n\n\n\n\n'
# Both files are managed by `with`, so they are closed even if copying fails.
# The input is opened first so that a missing input does not leave a truncated
# output file behind.
with open('resources/201012_s4_cleavage-product.csv', 'rt') as src, \
     open('supplementary/201012_s4_cleavage-product.csv', 'wt') as out:
    out.write(description)
    for l in src:
        out.write(l)