# Supplementary Table 3. DROSHA processing result
---

In [1]:
import time
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-11-23.


In [2]:
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
from __future__ import division
import sys; sys.path.append('/casa/bsc/notebooks/')
from basic import gen_result_dir
resultpath = gen_result_dir('results/')
print 'resultpath:', resultpath

resultpath: results/201123/


In [4]:
import pandas as pd
import numpy as np
import re
import gzip
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
from scipy.stats import spearmanr
from util import *
%matplotlib inline

In [5]:
LIBRARIES = [ 'set1', 'set2', 'set3', 'set4', 'set5' ]

In [6]:
s1 = pd.read_csv('supplementary/201123_s1__DROSHA_dependency.csv', header=1, index_col=0)
s2 = pd.read_csv('supplementary/201123_s2__Construct.csv', header=1, index_col=0).fillna('')
s3 = pd.read_csv('supplementary/201012_s3_input.csv', header=1, index_col=0)
s4 = pd.read_csv('supplementary/201012_s4_cleavage-product.csv', header=1)
s5 = pd.read_csv('supplementary/201012_s5_pri-structure.csv', header=1, index_col=0)

In [44]:
preposition = {}
for mir in s2.index:
    flank5, flank3 = s2.loc[mir,["5' flanking segment","3' flanking segment"]]
    if flank5 or flank3:
        preposition[mir] = (len(flank5)+1,125-len(flank3))
    else:
        preposition[mir] = (0,126)
print len(preposition), preposition['hsa-mir-142']

1881 (32, 90)


In [8]:
inpsum = pd.DataFrame()
for inp in ['set1','set3','set5']:
    inpsum[inp] = s3[['%s-1'%inp,'%s-2'%inp]].sum(axis=1)
inpsum['set2'] = s3['set2']
inpsum['set4'] = s3['set4'] 
inpsum = inpsum[LIBRARIES]
inpsum.head(1)

Unnamed: 0_level_0,set1,set2,set3,set4,set5
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hsa-let-7a-1,1373,474,0,2,0


In [9]:
clvraw = s4.set_index(['Pri-miRNA','rstart','rend'])
clvtbl = pd.DataFrame()
for lib in LIBRARIES:
    clvtbl[lib] = clvraw[['%s-1'%lib,'%s-2'%lib]].sum(axis=1)
clvtbl = clvtbl.reset_index()
clvtbl.head(1)

Unnamed: 0,Pri-miRNA,rstart,rend,set1,set2,set3,set4,set5
0,hsa-let-7a-1,1,8,4,4,0,0,0


In [10]:
start, mid, end = 1, 63, 125
clv5f = clvtbl[(clvtbl['rstart']==start)&(clvtbl['rend']<mid)].set_index('Pri-miRNA')
clv3f = clvtbl[(clvtbl['rstart']>mid)&(clvtbl['rend']==end)].set_index('Pri-miRNA')
clvhn = clvtbl[(clvtbl['rstart']<mid)&(clvtbl['rend']>mid)]
clvn5 = clvhn[clvhn['rstart']==start]
clvn3 = clvhn[clvhn['rend']==end]
clvh = clvhn.drop(clvn5.index).drop(clvn3.index).set_index('Pri-miRNA')
clvn5 = clvn5.set_index('Pri-miRNA'); clvn3 = clvn3.set_index('Pri-miRNA')

In [13]:
def fill_unfound(tbl):
    found = set(tbl.index)
    unfound = [ mir for mir in s2.index if mir not in found ]
    fill = pd.DataFrame(index=unfound, columns=tbl.columns).fillna(0)
    return tbl.append(fill)

In [14]:
clvtbl = fill_unfound(clvtbl.set_index('Pri-miRNA'))
clv5f = fill_unfound(clv5f)
clv3f = fill_unfound(clv3f)
clvh = fill_unfound(clvh)
clvn5 = fill_unfound(clvn5)
clvn3 = fill_unfound(clvn3)

---

### Efficiency

In [61]:
RCUTOFF = 30
RCUTOFFH = 10
FCUTOFF = .01
WINDOW = 3
INVERTED = 11

In [19]:
def select_lib(mir):
    return max([lib for lib in LIBRARIES if inpsum.loc[mir,lib]>=RCUTOFF], 
               key=lambda x: clvtbl.loc[mir,x].sum())

In [35]:
def get_clv_cnts(mir, lib):
    ps, pe = preposition[mir]
    if ps<1 or pe>125:
        return -1,-1
    invs = ps+INVERTED
    inve = pe-INVERTED
    clv5 = clv5f.loc[[mir]].reset_index().set_index('rend')[lib]
    clv3 = clv3f.loc[[mir]].reset_index().set_index('rstart')[lib]
    prod5 = clv5.reindex(range(ps-1-WINDOW,ps+WINDOW)).sum()
    prod3 = clv3.reindex(range(pe+1-WINDOW,pe+2+WINDOW)).sum()
    inv5 = clv5.reindex(range(invs-1-WINDOW,invs+WINDOW)).sum()
    inv3 = clv3.reindex(range(inve+1-WINDOW,inve+2+WINDOW)).sum()
    return int((prod5+prod3)/2), int((inv5+inv3)/2)

In [36]:
def calculate_efficiency(mir, lib):
    prod, inv = clvcnts[mir]
    if prod<0: 
        return -999
    inp = inpsum[lib][mir]
    return max(prod-inv,0)/inp

In [16]:
suffinps = list(inpsum[(inpsum>=RCUTOFF).any(axis=1)].index)
print len(suffinps)

1819


In [20]:
sufflibs = { mir:select_lib(mir) for mir in suffinps }
print sufflibs['hsa-let-7a-1']

set1


In [21]:
## Initial substrates
inpcnts = { mir:inpsum.loc[mir,sufflibs[mir]] for mir in suffinps }
print inpcnts['hsa-let-7a-1']

1373


In [46]:
## Cleavage product
clvcnts = { mir:get_clv_cnts(mir,sufflibs[mir]) for mir in suffinps }
print clvcnts['hsa-let-7a-1']

(16552, 11)


In [47]:
## norm
norm = 'hsa-mir-6788'
norms = { lib:calculate_efficiency(norm, lib) for lib in LIBRARIES }
norms

{'set1': 0.26321974148061106,
 'set2': 0.2248995983935743,
 'set3': 0.08839779005524862,
 'set4': 0.109642682329907,
 'set5': 0.03404255319148936}

In [48]:
testset = [ 'hsa-mir-144', 'hsa-let-7a-1', 'hsa-mir-17', 'hsa-mir-200a', 'hsa-mir-15a' ]
for mir in testset:
    eff = calculate_efficiency(mir, sufflibs[mir])
    print '%s\t%.3f'%(mir,np.log2(eff/norms[sufflibs[mir]]+1))

hsa-mir-144	5.475
hsa-let-7a-1	5.547
hsa-mir-17	3.313
hsa-mir-200a	0.000
hsa-mir-15a	0.937


In [49]:
effs = {}
for mir in suffinps:
    eff = calculate_efficiency(mir,sufflibs[mir])
    if eff>-999:
        effs[mir] = np.log2(eff/norms[sufflibs[mir]]+1)
print len(effs), effs['hsa-mir-144']

1816 5.475420236784626


---

### Homogeneity

In [62]:
def get_frac_5frag(mir):
    sub = clv5f.loc[[mir]].set_index('rend')
    return (sub/sub.sum())

In [63]:
def get_frac_3frag(mir):
    sub = clv3f.loc[[mir]].set_index('rstart')
    return (sub/sub.sum())

In [64]:
def get_frac_hairpin(mir):
    sub = clvh.loc[[mir]].set_index(['rstart','rend'])
    return (sub/sub.sum())

In [65]:
def filter_frac(frac):
    return frac[frac>=FCUTOFF]

In [67]:
def drop_duplicates(sortedtbl):
    if len(sortedtbl)<=1:
        return sortedtbl
    hs = sortedtbl['hstart'].tolist()[0]
    he = sortedtbl['hend'].tolist()[0]
    sub = sortedtbl.iloc[1:]
    sub = sub[(sub['hstart']!=hs)&(sub['hend']!=he)]
    return sortedtbl.iloc[:1].append(drop_duplicates(sub))

In [69]:
frac5f = { mir:get_frac_5frag(mir) for mir in s2.index } 
frac3f = { mir:get_frac_3frag(mir) for mir in s2.index } 
frachpn = { mir:get_frac_hairpin(mir) for mir in s2.index }

In [71]:
cols = 'miRNA pstart pend hstart hend 5frag 3frag hairpin score diff5 diff3'.split()

In [94]:
def get_dcs(mir, lib):
    ps, pe = preposition[mir]
    null = pd.DataFrame({0:dict(zip(COLS,[mir,ps,pe,1,125,0,0,0,0,99,99]))}).T[COLS]
    cs = null
    sum5f = clv5f.loc[mir,lib].sum()
    sum3f = clv3f.loc[mir,lib].sum()
    sumh = clvh.loc[mir,lib].sum()
    if not (sum5f>=RCUTOFF and sum3f>=RCUTOFF and sumh>=RCUTOFFH):
        return null
    
    clv5, clv3, clvhpn = [ f[mir][lib] for f in [frac5f,frac3f,frachpn] ]
    clv52 = defaultdict(float); clv52.update(clv5)
    clv32 = defaultdict(float); clv32.update(clv3)
    clvhpn2 = defaultdict(float); clvhpn2.update(clvhpn)
    for c5, frac5 in filter_frac(clv5).items():
        for c3, frac3 in filter_frac(clv3).items():
            row = dict(zip(COLS,[mir,ps,pe,c5+1,c3-1,frac5,frac3,clvhpn2[(c5+1,c3-1)]]))
            cs = cs.append(row, ignore_index=True)
    for (c5,c3), frach in filter_frac(clvhpn).items():
        row = dict(zip(COLS,[mir,ps,pe,c5,c3,clv52[c5-1],clv32[c3+1],frach]))
        cs = cs.append(row, ignore_index=True)
    cs['score'] = (cs['5frag']+cs['hairpin']+cs['3frag'])/3
    cs['diff5'] = cs['hstart']-cs['pstart']
    cs['diff3'] = cs['pend']-cs['hend']
    cs = cs[(abs(cs['diff5'])<=WINDOW)&(abs(cs['diff3'])<=WINDOW)]
    if len(cs)>=1:
        return drop_duplicates(cs.sort_values('score').iloc[::-1])
    return null

In [95]:
mir = 'hsa-mir-10a'
get_dcs(mir, sufflibs[mir])

Unnamed: 0,miRNA,pstart,pend,hstart,hend,5frag,3frag,hairpin,score,diff5,diff3
43,hsa-mir-10a,30,92,30,92,0.826975,0.716722,0.191431,0.578376,0,0
46,hsa-mir-10a,30,92,31,91,0.0371823,0.120155,0.0492252,0.0688542,1,1
37,hsa-mir-10a,30,92,29,93,0.0541945,0.045072,0.0519599,0.0504088,-1,-1


In [96]:
print time.ctime()
clvall = pd.DataFrame(columns=cols)
for mir in suffinps:
    clvall = clvall.append(get_dcs(mir,sufflibs[mir]), ignore_index=True)
clvall = clvall[cols]
print time.ctime()
print '# of all sites:\t%s' % len(clvall)
print '# of miRNAs:\t%s' % len(set(clvall['miRNA']))

Mon Nov 23 15:31:45 2020
Mon Nov 23 15:53:37 2020
# of all sites:	4082
# of miRNAs:	1819


In [97]:
clvall.to_csv('resources/201123_cleavage_sites.csv')
#clvall = pd.read_csv('resources/201123_cleavage_sites.csv', index_col=0)

In [116]:
tmp = clvall.sort_values('score').drop_duplicates('miRNA',keep='last')
homs = tmp.set_index('miRNA')['score'].to_dict()

---

### Build table

In [111]:
homcut = .25
clvsig = clvall[clvall['score']>=homcut]
clvcnt = Counter(clvsig['miRNA'])

In [118]:
effcut = 2.5
overlap = [ m for m in suffinps if m in homs and m in effs ]
single = [ m for m in overlap if effs[m]>=effcut and clvcnt[m]==1 ]
alternative = [ m for m in overlap if effs[m]>=effcut and clvcnt[m]>=2 ]
print len(single), len(alternative)

493 41


In [115]:
cols = [ 'Pri-miRNA', 'Cleavage Efficiency', 'Cleavage Homogeneity',
         "5' miRBase site", "3' miRBase site", "5' cleavage site", "3' cleavage site", 
         "5' alternative site", "3' alternative site", "Cleavage ratio of alternative site" ]
tbl = pd.DataFrame(columns=cols).set_index('Pri-miRNA')
tbl.head(1)

Unnamed: 0_level_0,Cleavage Efficiency,Cleavage Homogeneity,5' miRBase site,3' miRBase site,5' cleavage site,3' cleavage site,5' alternative site,3' alternative site,Alternative Processing Ratio
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [131]:
clvall[clvall['miRNA']=='hsa-let-7a-2']

Unnamed: 0,miRNA,pstart,pend,hstart,hend,5frag,3frag,hairpin,score,diff5,diff3
2,hsa-let-7a-2,28,94,28,94,0.760777,0.94489,0.658254,0.787974,0,0


In [132]:
for pri in overlap:
    ps, pe = preposition[pri]
    tbl.loc[pri,'Cleavage Efficiency'] = effs[pri]
    tbl.loc[pri,'Cleavage Homogeneity'] = homs[pri]
    tbl.loc[pri,"5' miRBase site"] = ps
    tbl.loc[pri,"3' miRBase site"] = pe
    sub = clvall[clvall['miRNA']==pri].sort_values('score',ascending=False)
    sub = sub[sub['score']>0]
    if len(sub)>=1:
        tbl.loc[pri,"5' cleavage site"] = sub['hstart'].values[0]
        tbl.loc[pri,"3' cleavage site"] = sub['hend'].values[0]
        if len(sub)>=2:
            tbl.loc[pri,"5' alternative site"] = sub['hstart'].values[1]
            tbl.loc[pri,"3' alternative site"] = sub['hend'].values[1]
            tbl.loc[pri,"Cleavage ratio of alternative site"] = sub['score'].values[1]

In [135]:
tbl.to_csv('resources/201123_s3__DROSHA_processing_result.csv')

In [136]:
out = open('supplementary/201123_s3__DROSHA_processing_result.csv', 'wt')
description = 'Supplementary Table 3. DROSHA processing result\n\n\n\n\n'
out.write(description)
for l in open('resources/201123_s3__DROSHA_processing_result.csv', 'rt'):
    out.write(l)
out.close()