# Supplementary Table 5. Nick processing
---

In [1]:
import time
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-11-23.


In [2]:
# Project root. The relative paths used by later cells ('supplementary/...',
# 'resources/...', 'results/') are resolved against this directory because the
# %cd magic below makes it the kernel's working directory.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
# Python 2: make `/` true division (`/` is used for the ratio computation
# later in the notebook). Must stay the first statement of the cell.
from __future__ import division
# Make the project helper modules under /casa/bsc/notebooks/ importable
# before importing from them.
import sys; sys.path.append('/casa/bsc/notebooks/')
from basic import gen_result_dir
# gen_result_dir is a project helper not shown here; judging by the recorded
# output it returns a dated sub-directory of 'results/' -- presumably creating
# it as well; TODO confirm.
resultpath = gen_result_dir('results/')
print 'resultpath:', resultpath

resultpath: results/201123/


In [4]:
import pandas as pd
import numpy as np
import re
import gzip
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
from scipy.stats import spearmanr
from util import *
%matplotlib inline

In [5]:
# Library names. Each name is the prefix of two columns in the cleavage-product
# table ('<name>-1' and '<name>-2') and becomes a column of the summed count
# tables built from it.
LIBRARIES = [
    'set1',
    'set2',
    'set3',
    'set4',
    'set5',
]

In [6]:
# Load the supplementary tables. header=1 takes the column names from the
# second line that pandas parses (blank lines are skipped by default), so each
# file presumably starts with a one-line title -- TODO confirm against the files.
# Only s2 and s4 are referenced by the cells visible in this notebook.
s1 = pd.read_csv('supplementary/201123_s1__DROSHA_dependency.csv', header=1, index_col=0)
# Missing values in the construct table become '' so that absent flanking
# segments are falsy strings (the flank check below relies on truthiness).
s2 = pd.read_csv('supplementary/201123_s2__Construct.csv', header=1, index_col=0).fillna('')
s3 = pd.read_csv('supplementary/201012_s3_input.csv', header=1, index_col=0)
# s4 keeps the default integer index; 'Pri-miRNA', 'rstart' and 'rend' are
# turned into an index later.
s4 = pd.read_csv('supplementary/201012_s4_cleavage-product.csv', header=1)
s5 = pd.read_csv('supplementary/201012_s5_pri-structure.csv', header=1, index_col=0)

In [7]:
preposition = {}
for mir in s2.index:
    flank5, flank3 = s2.loc[mir,["5' flanking segment","3' flanking segment"]]
    if flank5 and flank3:
        preposition[mir] = (len(flank5)+1,125-len(flank3))
    else:
        preposition[mir] = (0,125)
print len(preposition), preposition['hsa-mir-142']

1881 (32, 90)


In [8]:
# Collapse the cleavage-product table to one count column per library:
# for every (Pri-miRNA, rstart, rend) fragment, add up the two columns
# '<lib>-1' and '<lib>-2' (presumably replicates -- confirm).
clvraw = s4.set_index(['Pri-miRNA','rstart','rend'])
clvtbl = pd.DataFrame()
for lib in LIBRARIES:
    replicate_columns = ['%s-1'%lib, '%s-2'%lib]
    clvtbl[lib] = clvraw[replicate_columns].sum(axis=1)
# Turn the fragment coordinates back into ordinary columns.
clvtbl = clvtbl.reset_index()
clvtbl.head(1)

Unnamed: 0,Pri-miRNA,rstart,rend,set1,set2,set3,set4,set5
0,hsa-let-7a-1,1,8,4,4,0,0,0


In [9]:
# Classify fragments by where they begin and end on the construct.
#   clv5f : begin at `start` and end before `mid`
#   clv3f : begin after `mid` and end at `end`
#   clvhn : span `mid` (begin before it, end after it), subdivided into
#           clvn5 : begin at `start`
#           clvn3 : end at `end`
#           clvh  : neither begin at `start` nor end at `end`
# Fragments that begin or end exactly at `mid` fall into none of the classes.
# Every resulting table is indexed by 'Pri-miRNA'.
start, mid, end = 1, 63, 125
clv5f = clvtbl[(clvtbl['rstart']==start)&(clvtbl['rend']<mid)].set_index('Pri-miRNA')
clv3f = clvtbl[(clvtbl['rstart']>mid)&(clvtbl['rend']==end)].set_index('Pri-miRNA')
clvhn = clvtbl[(clvtbl['rstart']<mid)&(clvtbl['rend']>mid)]
from_start = clvhn['rstart']==start
to_end = clvhn['rend']==end
clvn5 = clvhn[from_start].set_index('Pri-miRNA')
clvn3 = clvhn[to_end].set_index('Pri-miRNA')
# Boolean masks instead of two chained drop() calls: a fragment running from
# `start` to `end` belongs to both clvn5 and clvn3, and dropping its label a
# second time would raise.
clvh = clvhn[~from_start & ~to_end].set_index('Pri-miRNA')

In [10]:
def fill_unfound(tbl, mirs=None):
    """Add an all-zero row for every pri-miRNA that has no row in `tbl`.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Table indexed by pri-miRNA name; a name may occur on several rows.
    mirs : iterable, optional
        Names that must be present in the result. Defaults to `s2.index`
        (the global construct table), which is what the one-argument call
        has always used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of `tbl` followed by one zero-filled
        row per missing name, in the order the names appear in `mirs`.
        Columns are those of `tbl`.
    """
    if mirs is None:
        mirs = s2.index
    found = set(tbl.index)
    unfound = [ mir for mir in mirs if mir not in found ]
    if not unfound:
        # Nothing to add; still hand back a new object like the general case.
        return tbl.copy()
    # Build the zero block directly as integers rather than filling an
    # all-NaN object frame and relying on fillna to downcast it.
    fill = pd.DataFrame(0, index=unfound, columns=tbl.columns)
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    return pd.concat([tbl, fill])

In [11]:
# Give every construct listed in s2 at least one (all-zero) row in each
# fragment table so that later .loc[[mir]] lookups cannot miss.
# NOTE: clvtbl is re-indexed by 'Pri-miRNA' here, so re-running this cell
# without re-running the cell that builds clvtbl fails (the column is gone).
clvtbl = fill_unfound(clvtbl.set_index('Pri-miRNA'))
clv5f, clv3f = fill_unfound(clv5f), fill_unfound(clv3f)
clvh = fill_unfound(clvh)
clvn5, clvn3 = fill_unfound(clvn5), fill_unfound(clvn3)

---

In [32]:
# Empty result table, indexed by pri-miRNA name; rows are added one construct
# at a time by the ratio loop further down.
tbl = pd.DataFrame(
    columns=['Pri-miRNA','5p nick processing','3p nick processing','Set']
).set_index('Pri-miRNA')

In [12]:
# Parameters read by the ratio loop below.
WINDOW = 3      # fragment ends within +/- WINDOW positions of ps / pe are counted
RCUTOFFH = 10   # minimum windowed clvh count (selected library) to report a construct

# Not referenced by any cell visible in this notebook.
RCUTOFF = 30
FCUTOFF = 0.01
INVERTED = 11

In [15]:
def select_lib(mir):
    """Return the library whose clvtbl column has the largest sum for `mir`.

    The sum is taken over all clvtbl rows indexed by `mir`. On a tie the
    library listed first in LIBRARIES is returned.
    """
    def library_total(lib):
        return clvtbl.loc[mir,lib].sum()
    return max(LIBRARIES, key=library_total)

In [33]:
# Fill `tbl` with one row per construct. For each pri-miRNA, counts are taken
# within +/- WINDOW positions of the boundaries (ps, pe) stored in `preposition`:
#   unc5 : clvn5 fragments whose 'rend' lies in the window around pe
#   unc3 : clvn3 fragments whose 'rstart' lies in the window around ps
#   hpn  : clvh fragments whose 'rstart' AND 'rend' both lie in their windows
# Both ratios use a pseudocount of 1 and the counts of the single library
# chosen by select_lib().
for mir in s2.index:
    ps, pe = preposition[mir]
    # Skips the (0, 125) placeholder stored for constructs without both flanks.
    if ps<1 or pe>125:
        continue
    # The two position windows are reused by all three tallies below.
    start_window = list(range(ps-WINDOW, ps+WINDOW+1))
    end_window = list(range(pe-WINDOW, pe+WINDOW+1))
    # reindex() keeps only the rows at window positions; positions with no
    # fragment become NaN and are ignored by sum().
    unc5 = clvn5.loc[[mir]].reset_index().set_index(['rend'])
    unc5 = unc5.reindex(end_window).sum()
    unc3 = clvn3.loc[[mir]].reset_index().set_index(['rstart'])
    unc3 = unc3.reindex(start_window).sum()
    hpn = clvh.loc[[mir]]
    # Vectorised membership test; plain boolean arrays are used because every
    # row of `hpn` carries the same index label.
    in_window = hpn['rstart'].isin(start_window).values & hpn['rend'].isin(end_window).values
    hpn = hpn[in_window].sum()
    lib = select_lib(mir)
    # Constructs with too few windowed clvh counts get no row in `tbl`.
    if hpn[lib]>=RCUTOFFH:
        tbl.loc[mir,'5p nick processing'] = (unc3[lib]+1)/(hpn[lib]+1)
        tbl.loc[mir,'3p nick processing'] = (unc5[lib]+1)/(hpn[lib]+1)
        tbl.loc[mir,'Set'] = lib

In [34]:
# Save the ratio table as a plain CSV; this intermediate file is read back
# when the titled supplementary table is assembled.
tbl.to_csv('resources/201123_s5__nick_processing.csv')

In [35]:
# Assemble the supplementary table: a title line followed by four blank lines,
# then the intermediate CSV copied line by line.
description = 'Supplementary Table 5. Nick processing\n\n\n\n\n'
# Both handles are closed by the with-statement, even on error. The input is
# opened first so that a missing input file does not truncate the output.
with open('resources/201123_s5__nick_processing.csv', 'rt') as src, \
     open('supplementary/201123_s5__Nick_processing.csv', 'wt') as out:
    out.write(description)
    for l in src:
        out.write(l)