# Supplementary Table 7. DROSHA processing with SRSF3
---

In [1]:
import time
# Stamp the notebook output with the reviser's name and the date of this run.
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-11-25.


In [2]:
# Project root; the relative paths used below (supplementary/, resources/, results/,
# ../2009_srsf3_hiseq/) are resolved against this working directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
# Put the shared helper directory on sys.path so `util` and `basic` can be imported.
# NOTE(review): the wildcard import makes it impossible to tell from this notebook
# which names come from `util`; prefer explicit imports.
utilpath = '/casa/bsc/notebooks/'
import sys; sys.path.append(utilpath)
from util import *

In [4]:
# `gen_result_dir` is defined in the helper module `basic` (not shown here). Judging by
# the printed value it returns a dated sub-directory of 'results/' - presumably creating
# it as well; confirm in basic.py.
from basic import gen_result_dir
resultpath = gen_result_dir('results/')
print 'resultpath:', resultpath

resultpath: results/201125/


In [5]:
from __future__ import division
import pandas as pd
import numpy as np
import re
import gzip
from scipy.stats import pearsonr, spearmanr
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
%matplotlib inline

In [6]:
# Load Supplementary Tables 1 and 2, indexed by their first column.
# header=1: column names are taken from the second parsed line - presumably because the
# first line is a table title, as in the file written by the last cell; confirm.
# Missing values in s2 become '' so that absent flanking segments have length 0 below.
# NOTE(review): s1 is not referenced anywhere else in the visible notebook.
s1 = pd.read_csv('supplementary/201123_s1__DROSHA_dependency.csv', header=1, index_col=0)
s2 = pd.read_csv('supplementary/201123_s2__Construct.csv', header=1, index_col=0).fillna('')

In [7]:
preposition = {}
for mir in s2.index:
    flank5, flank3 = s2.loc[mir,["5' flanking segment","3' flanking segment"]]
    if flank5 or flank3:
        preposition[mir] = (len(flank5)+1,125-len(flank3))
    else:
        preposition[mir] = (0,126)
print len(preposition), preposition['hsa-mir-142']

1881 (32, 90)


In [8]:
tmp = pd.read_csv('supplementary/201012_s2_pri-construct.csv', index_col=0, header=1)
redund = tmp[tmp['Note'].apply(lambda x: str(x).find('Same construct')>=0)]
repreRedund = defaultdict(list)
for pri, note in dict(redund['Note']).items():
    repre = note.split()[-1]
    repreRedund[repre].append(pri)
print len(repreRedund)

25


In [96]:
# Construct name -> construct sequence, taken from the s2 column shown below.
constseqs = dict(s2['Construct sequence (125 nt)'])
print len(constseqs)

1881


---
### Cleavage Efficiency

In [14]:
# Directory holding the count tables read below (input_counts.csv, cleaved_count.csv).
datapath = '../2009_srsf3_hiseq/alignments'
# Minimum read count: applied to the summed input reads per construct and, in get_dcs,
# to the 5' and 3' fragment totals.
RCUTOFF = 30

In [10]:
# Input-library read counts per construct. The commented-out line shows how the cached
# CSV was written; the code that built the table is not part of this notebook.
#inptbl.to_csv('%s/input_counts.csv'%datapath)
inptbl = pd.read_csv('%s/input_counts.csv'%datapath,index_col=0)

In [11]:
# Copy the input counts of each representative construct onto the constructs recorded
# as redundant with it (repreRedund), then restore alphabetical row order.
# Assigning to a label that is not yet in inptbl adds a new row.
for pri in inptbl.index:
    if pri in repreRedund:
        for pri2 in repreRedund[pri]:
            inptbl.loc[pri2] = inptbl.loc[pri]
inptbl = inptbl.sort_index()

In [12]:
# Quick look at the input counts (two replicate columns per construct).
inptbl.head(3)

Unnamed: 0_level_0,input_1,input_2
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1
hsa-let-7a-1,6675,6846
hsa-let-7a-2,15458,17160
hsa-let-7a-3,29955,27010


In [16]:
suffinps = sorted([ mir for mir in s2.index if inptbl.loc[mir].sum()>=RCUTOFF ])
print len(suffinps)

1815


In [17]:
# Read counts of cleavage-reaction products: one row per construct and fragment
# (rstart, rend), with one count column per replicate (see the preview below).
# The commented-out line shows how the cached CSV was written.
#clvtbl.to_csv('%s/cleaved_count.csv'%datapath)
clvtbl = pd.read_csv('%s/cleaved_count.csv'%datapath,index_col=0)
clvtbl.head()

Unnamed: 0,Pri-miRNA,rstart,rend,cleaved_1,cleaved_2
0,hsa-let-7a-1,1,7,0,5
1,hsa-let-7a-1,1,8,0,3
2,hsa-let-7a-1,1,9,21,19
3,hsa-let-7a-1,1,10,12,27
4,hsa-let-7a-1,1,11,44,39


In [18]:
# Split the fragments by the construct ends they retain (see the comments on the repeated
# cell below for the individual classes).
# NOTE(review): this cell runs before the 'cleaved' column is added to clvtbl, and every
# table assigned here is reassigned by the identical slicing a few cells further down,
# so this cell appears to be redundant.
start, mid, end = 1, 63, 125
clv5f = clvtbl[(clvtbl['rstart']==start)&(clvtbl['rend']<mid)].set_index('Pri-miRNA')
clv3f = clvtbl[(clvtbl['rstart']>mid)&(clvtbl['rend']==end)].set_index('Pri-miRNA')
clvhn = clvtbl[(clvtbl['rstart']<mid)&(clvtbl['rend']>mid)]
clvn5 = clvhn[clvhn['rstart']==start]
clvn3 = clvhn[clvhn['rend']==end]
clvh = clvhn.drop(clvn5.index).drop(clvn3.index).set_index('Pri-miRNA')

In [26]:
# Tolerance (nt) around an expected cleavage position when summing fragment counts.
WINDOW = 3
# Offset (nt) from the annotated sites, towards the middle of the hairpin, at which
# 'inverted' processing products are counted.
INVERTED = 11

In [20]:
# Derived count columns used by the rest of the notebook.
inptbl['input'] = inptbl['input_1']+inptbl['input_2']
# NOTE(review): 'cleaved' holds replicate 2 only, whereas 'input' sums both replicates.
# The commented-out remainder adds cleaved_2 a second time, so it is unclear whether
# cleaved_1+cleaved_2 was intended or replicate 1 is excluded on purpose - confirm.
clvtbl['cleaved'] = clvtbl['cleaved_2']#+clvtbl['cleaved_2']

In [21]:
# Split the cleavage products by which ends of the construct they retain:
#   clv5f - starts at the construct start and ends before the midpoint (5' fragment)
#   clv3f - starts after the midpoint and ends at the construct end   (3' fragment)
#   clvhn - spans the midpoint, further divided into
#       clvn5 - still attached to the construct start
#       clvn3 - still attached to the construct end
#       clvh  - attached to neither end
start, mid, end = 1, 63, 125
clv5f = clvtbl[(clvtbl['rstart']==start)&(clvtbl['rend']<mid)].set_index('Pri-miRNA')
clv3f = clvtbl[(clvtbl['rstart']>mid)&(clvtbl['rend']==end)].set_index('Pri-miRNA')
clvhn = clvtbl[(clvtbl['rstart']<mid)&(clvtbl['rend']>mid)]
clvn5 = clvhn[clvhn['rstart']==start]
clvn3 = clvhn[clvhn['rend']==end]
# Select the remainder with a mask rather than chaining .drop(clvn5.index).drop(clvn3.index):
# a fragment covering the whole construct belongs to both clvn5 and clvn3, and the second
# drop would then raise KeyError for the label the first one already removed.
clvh = clvhn[(clvhn['rstart']!=start)&(clvhn['rend']!=end)].set_index('Pri-miRNA')
clvn5 = clvn5.set_index('Pri-miRNA'); clvn3 = clvn3.set_index('Pri-miRNA')

In [24]:
def fill_unfound(tbl, names=None):
    """Return `tbl` extended with one all-zero row per expected name missing from its index.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Table indexed by pri-miRNA name; a name may occur on several rows.
    names : iterable, optional
        Names that must be present in the result. Defaults to ``s2.index``,
        i.e. every construct of Supplementary Table 2.

    Returns
    -------
    pandas.DataFrame
        The rows of `tbl` followed by one row per missing name, in the order of
        `names`, with every column (including position columns) set to 0.
    """
    if names is None:
        names = s2.index
    found = set(tbl.index)
    unfound = [ mir for mir in names if mir not in found ]
    fill = pd.DataFrame(index=unfound, columns=tbl.columns).fillna(0)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat builds the same table and is available in every pandas version.
    return pd.concat([tbl, fill])

In [25]:
# Give every construct of s2 at least one (all-zero) row in each fragment table, so
# that the `.loc[[mir]]` lookups below never fail for constructs without reads.
# NOTE(review): the first line re-indexes clvtbl by 'Pri-miRNA', so this cell cannot be
# run twice and the slicing cell above cannot be re-run afterwards.
clvtbl = fill_unfound(clvtbl.set_index('Pri-miRNA'))
clv5f = fill_unfound(clv5f)
clv3f = fill_unfound(clv3f)
clvh = fill_unfound(clvh)
clvn5 = fill_unfound(clvn5)
clvn3 = fill_unfound(clvn3)

In [104]:
# flank5: construct -> sequence upstream of the annotated 5' site.
# dupl5: constructs whose 5' flank is identical to that of another construct;
# get_clv_cnts counts their 3' fragments instead of their 5' fragments
# (presumably because identical 5' fragments cannot be assigned to one construct - confirm).
flank5 = { m:constseqs[m][:preposition[m][0]-1] for m in suffinps }
f5cnts = Counter(flank5.values())
dupl5 = [ m for m in flank5 if f5cnts[flank5[m]]>1 ]

In [105]:
def get_clv_cnts(mir, lib):
    """Count productive and inverted cleavage products of one construct.

    Parameters
    ----------
    mir : construct name (key of `preposition`, index label of `clv5f` / `clv3f`).
    lib : name of the count column to sum, e.g. 'cleaved'.

    Returns
    -------
    (productive, inverted) : read counts summed within +/-WINDOW nt of the annotated
        site and of the site shifted INVERTED nt into the hairpin, respectively.
        5' fragments are counted, except for constructs listed in `dupl5`, for which
        3' fragments are counted. (-1, -1) is returned when the construct has no
        annotated position inside 1..125.
    """
    pre_start, pre_end = preposition[mir]
    if pre_start<1 or pre_end>125:
        return -1,-1

    def window_sum(counts, centre):
        # reads whose cleavage position lies within WINDOW nt of `centre`
        return counts.reindex(range(centre-WINDOW,centre+WINDOW+1)).sum()

    # counts of 5' fragments keyed by their last nt, of 3' fragments by their first nt
    ends5 = clv5f.loc[[mir]].reset_index().set_index('rend')[lib]
    starts3 = clv3f.loc[[mir]].reset_index().set_index('rstart')[lib]

    prod5 = window_sum(ends5, pre_start-1)
    prod3 = window_sum(starts3, pre_end+1)
    inv5 = window_sum(ends5, pre_start+INVERTED-1)
    inv3 = window_sum(starts3, pre_end-INVERTED+1)
    return (prod3, inv3) if mir in dupl5 else (prod5, inv5)

In [106]:
# (productive, inverted) read counts from the 'cleaved' column for every construct
# with sufficient input reads.
clvcnts = { mir:get_clv_cnts(mir,'cleaved') for mir in suffinps }
print clvcnts['hsa-let-7a-1']

(44460, 66.0)


In [107]:
def calculate_efficiency(mir):
    """Raw cleavage efficiency of one construct.

    Productive reads minus inverted reads (floored at 0), divided by the construct's
    summed input reads. Returns the sentinel -999 when `clvcnts` holds the (-1, -1)
    placeholder, i.e. the construct has no annotated cleavage position.
    """
    productive, inverted = clvcnts[mir]
    if productive<0:
        return -999
    input_reads = inptbl.loc[mir,'input']
    net_reads = max(productive-inverted,0)
    return net_reads/input_reads

In [108]:
## norm
# Raw efficiency of the reference construct; every efficiency below is expressed
# relative to this value before the log2 transform.
norm = 'hsa-mir-6788'
norms = calculate_efficiency(norm)
print norms

0.0010190711894016596


In [109]:
effs = {}
for mir in suffinps:
    eff = calculate_efficiency(mir)
    if eff>-999:
        effs[mir] = np.log2(eff/norms+1)
print len(effs), effs['hsa-mir-144']

1812 11.238305120172882


---
### Cleavage Homogeneity

In [44]:
# Thresholds for the homogeneity analysis. RCUTOFF, WINDOW and INVERTED repeat the
# values already set above.
RCUTOFF = 30      # minimum total reads of 5' fragments and of 3' fragments (get_dcs)
RCUTOFFH = 10     # minimum total reads of hairpin fragments
FCUTOFF = .01     # minimum fraction for a cleavage position to be considered
WINDOW = 3
INVERTED = 11

In [37]:
def get_frac_5frag(mir):
    """Rows of `clv5f` for `mir`, indexed by 'rend', each column divided by its column total."""
    by_end = clv5f.loc[[mir]].set_index('rend')
    column_totals = by_end.sum()
    return by_end/column_totals

In [38]:
def get_frac_3frag(mir):
    """Rows of `clv3f` for `mir`, indexed by 'rstart', each column divided by its column total."""
    by_start = clv3f.loc[[mir]].set_index('rstart')
    column_totals = by_start.sum()
    return by_start/column_totals

In [39]:
def get_frac_hairpin(mir):
    """Rows of `clvh` for `mir`, indexed by ('rstart', 'rend'), each column divided by its column total."""
    by_site_pair = clvh.loc[[mir]].set_index(['rstart','rend'])
    column_totals = by_site_pair.sum()
    return by_site_pair/column_totals

In [50]:
def filter_frac(frac):
    """Keep only the entries of `frac` that are at least FCUTOFF."""
    passes_cutoff = frac>=FCUTOFF
    return frac[passes_cutoff]

In [41]:
def add_specificity(row):
    """Store the mean of the '5frag', 'hairpin' and '3frag' entries under 'score'.

    `row` is modified in place and also returned.
    """
    frag5, hairpin, frag3 = row['5frag'], row['hairpin'], row['3frag']
    row['score'] = (frag5+hairpin+frag3)/3
    return row

In [42]:
def drop_duplicates(sortedtbl):
    """Greedily keep rows whose cleavage sites do not clash with a better-ranked row.

    Rows are visited in the given order (best first). A row is kept only if neither
    its 'hstart' nor its 'hend' equals that of a row kept before it.

    Parameters
    ----------
    sortedtbl : pandas.DataFrame
        Candidate sites with 'hstart' and 'hend' columns, already sorted by preference.

    Returns
    -------
    pandas.DataFrame
        The kept rows, in their original order and with their original index labels.
        Tables with at most one row are returned unchanged.
    """
    if len(sortedtbl)<=1:
        return sortedtbl
    # One pass replaces the earlier recursion, which called DataFrame.append
    # (removed in pandas 2.0) and re-copied the table at every level.
    used_starts, used_ends = set(), set()
    keep_positions = []
    site_pairs = zip(sortedtbl['hstart'].tolist(), sortedtbl['hend'].tolist())
    for position, (hs, he) in enumerate(site_pairs):
        if hs in used_starts or he in used_ends:
            continue
        used_starts.add(hs)
        used_ends.add(he)
        keep_positions.append(position)
    return sortedtbl.iloc[keep_positions]

In [43]:
# Pre-compute, for every construct in s2, the per-position fractions of its 5' fragments,
# 3' fragments and hairpin fragments.
frac5f = { mir:get_frac_5frag(mir) for mir in s2.index } 
frac3f = { mir:get_frac_3frag(mir) for mir in s2.index } 
frachpn = { mir:get_frac_hairpin(mir) for mir in s2.index }

In [45]:
# Column order of the cleavage-site tables built by get_dcs:
# pstart/pend = annotated sites, hstart/hend = observed hairpin ends,
# 5frag/3frag/hairpin = supporting fractions, score = their mean,
# diff5/diff3 = offsets of the observed from the annotated sites.
COLS = 'miRNA pstart pend hstart hend 5frag 3frag hairpin score diff5 diff3'.split()

In [48]:
def get_dcs(mir, lib):
    """List the candidate cleavage-site pairs of one construct, best first.

    Parameters
    ----------
    mir : construct name.
    lib : count column to use, e.g. 'cleaved'.

    Returns
    -------
    pandas.DataFrame with the columns in COLS. Each row is a (hstart, hend) pair
    together with the fractions of 5' fragments, 3' fragments and hairpin fragments
    that support it, and `score`, the mean of those three fractions. Only pairs
    within WINDOW nt of the annotated sites on both sides are kept, and
    drop_duplicates removes pairs sharing an end with a better-scoring pair.
    A single placeholder row (hstart=1, hend=125, score 0, diff5=diff3=99) is returned
    when the read-count cutoffs are not met or no candidate survives.

    NOTE(review): rows are added one at a time with DataFrame.append, which copies the
    whole table on every call (and was removed in pandas 2.0). The full run over all
    constructs further down reports about 25 minutes; collecting the rows in a list
    and building the frame once would likely be much faster, but must be checked
    against the saved CSV because it can change column dtypes.
    """
    ps, pe = preposition[mir]
    # placeholder row; it is also the first row of the candidate table `cs`
    null = pd.DataFrame({0:dict(zip(COLS,[mir,ps,pe,1,125,0,0,0,0,99,99]))}).T[COLS]
    cs = null
    sum5f = clv5f.loc[mir,lib].sum()
    sum3f = clv3f.loc[mir,lib].sum()
    sumh = clvh.loc[mir,lib].sum()
    # require enough reads in all three fragment classes
    if not (sum5f>=RCUTOFF and sum3f>=RCUTOFF and sumh>=RCUTOFFH):
        return null
    
    clv5, clv3, clvhpn = [ f[mir][lib] for f in [frac5f,frac3f,frachpn] ]
    # defaultdict copies so that positions without reads look up as fraction 0.0
    clv52 = defaultdict(float); clv52.update(clv5)
    clv32 = defaultdict(float); clv32.update(clv3)
    clvhpn2 = defaultdict(float); clvhpn2.update(clvhpn)
    # candidates implied by the fragments: a 5' fragment ending at c5 and a 3' fragment
    # starting at c3 correspond to a hairpin spanning c5+1 .. c3-1
    for c5, frac5 in filter_frac(clv5).items():
        for c3, frac3 in filter_frac(clv3).items():
            row = dict(zip(COLS,[mir,ps,pe,c5+1,c3-1,frac5,frac3,clvhpn2[(c5+1,c3-1)]]))
            cs = cs.append(row, ignore_index=True)
    # candidates observed directly as hairpin fragments; a pair found by both loops
    # appears twice and is collapsed later by drop_duplicates
    for (c5,c3), frach in filter_frac(clvhpn).items():
        row = dict(zip(COLS,[mir,ps,pe,c5,c3,clv52[c5-1],clv32[c3+1],frach]))
        cs = cs.append(row, ignore_index=True)
    cs['score'] = (cs['5frag']+cs['hairpin']+cs['3frag'])/3
    cs['diff5'] = cs['hstart']-cs['pstart']
    cs['diff3'] = cs['pend']-cs['hend']
    # keep only pairs within WINDOW nt of the annotated sites on both sides
    cs = cs[(abs(cs['diff5'])<=WINDOW)&(abs(cs['diff3'])<=WINDOW)]
    if len(cs)>=1:
        # highest score first
        return drop_duplicates(cs.sort_values('score').iloc[::-1])
    return null

In [51]:
# Example: cleavage-site table of a single construct.
mir = 'hsa-mir-10a'
get_dcs(mir, 'cleaved')

Unnamed: 0,miRNA,pstart,pend,hstart,hend,5frag,3frag,hairpin,score,diff5,diff3
14,hsa-mir-10a,30,92,30,92,0.472319,0.751985,0.494226,0.572843,0,0
25,hsa-mir-10a,30,92,29,93,0.217454,0.0896469,0.113164,0.140088,-1,-1
28,hsa-mir-10a,30,92,31,91,0.113288,0.132324,0.145497,0.13037,1,1


In [52]:
print time.ctime()
clvall = pd.DataFrame(columns=COLS)
for mir in suffinps:
    clvall = clvall.append(get_dcs(mir, 'cleaved'), ignore_index=True)
clvall = clvall[COLS]
print time.ctime()
print '# of all sites:\t%s' % len(clvall)
print '# of miRNAs:\t%s' % len(set(clvall['miRNA']))

Wed Nov 25 11:29:46 2020
Wed Nov 25 11:54:43 2020
# of all sites:	4161
# of miRNAs:	1815


In [53]:
# Cache the cleavage-site table; swap the two lines to reload it instead of
# recomputing it with the slow cell above.
clvall.to_csv('resources/201123_cleavage_sites_srsf3.csv')
#clvall = pd.read_csv('resources/201123_cleavage_sites_srsf3.csv',index_col=0)

In [54]:
# Cleavage homogeneity per construct = score of its best-scoring site
# (after the ascending sort, keep='last' retains the highest score per miRNA).
tmp = clvall.sort_values('score').drop_duplicates('miRNA',keep='last')
homs = tmp.set_index('miRNA')['score'].to_dict()

---
### Build table

In [55]:
# Number of cleavage sites per construct whose score reaches homcut.
homcut = .25
clvsig = clvall[clvall['score']>=homcut]
clvcnt = Counter(clvsig['miRNA'])

In [110]:
effcut = 2.5
overlap = [ m for m in suffinps if m in homs and m in effs ]
single = [ m for m in overlap if effs[m]>=effcut and clvcnt[m]==1 ]
alternative = [ m for m in overlap if effs[m]>=effcut and clvcnt[m]>=2 ]
print len(single), len(alternative)

692 57


In [116]:
# Empty output table with the columns of Supplementary Table 7, indexed by pri-miRNA;
# the following cells fill it in row by row.
cols = [ 'Pri-miRNA', 'Cleavage Efficiency', 'Cleavage Homogeneity',
         "5' miRBase site", "3' miRBase site", "5' cleavage site", "3' cleavage site", 
         "5' alternative site", "3' alternative site", "Cleavage ratio of alternative site",
         '5p nick processing', '3p nick processing', 'Inverted processing' ]
tbl = pd.DataFrame(columns=cols).set_index('Pri-miRNA')
tbl.head(1)

Unnamed: 0_level_0,Cleavage Efficiency,Cleavage Homogeneity,5' miRBase site,3' miRBase site,5' cleavage site,3' cleavage site,5' alternative site,3' alternative site,Cleavage ratio of alternative site,5p nick processing,3p nick processing,Inverted processing
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


In [117]:
# One row per construct: efficiency, homogeneity, annotated sites, the best-scoring
# cleavage-site pair and, when a second pair exists, that pair as the alternative site
# together with its score.
for pri in overlap:
    ps, pe = preposition[pri]
    tbl.loc[pri,'Cleavage Efficiency'] = effs[pri]
    tbl.loc[pri,'Cleavage Homogeneity'] = homs[pri]
    tbl.loc[pri,"5' miRBase site"] = ps
    tbl.loc[pri,"3' miRBase site"] = pe
    sub = clvall[clvall['miRNA']==pri].sort_values('score',ascending=False)
    # rows with score 0 (including get_dcs placeholder rows) are not reported as sites
    sub = sub[sub['score']>0]
    if len(sub)>=1:
        tbl.loc[pri,"5' cleavage site"] = sub['hstart'].values[0]
        tbl.loc[pri,"3' cleavage site"] = sub['hend'].values[0]
        if len(sub)>=2:
            tbl.loc[pri,"5' alternative site"] = sub['hstart'].values[1]
            tbl.loc[pri,"3' alternative site"] = sub['hend'].values[1]
            tbl.loc[pri,"Cleavage ratio of alternative site"] = sub['score'].values[1]

In [118]:
# Nick processing: reads cut at only one of the two sites, relative to reads cut at
# both (hairpin), each with pseudocount 1. Constructs without an annotated position
# or with fewer than RCUTOFFH hairpin reads are left empty.
for mir in overlap:
    ps, pe = preposition[mir]
    if ps<1 or pe>125:
        continue
    # fragments still attached to the construct start, ending within WINDOW of the 3' site
    unc5 = clvn5.loc[[mir]].reset_index().set_index(['rend'])
    unc5 = unc5.reindex(range(pe-WINDOW,pe+WINDOW+1)).sum()
    # fragments still attached to the construct end, starting within WINDOW of the 5' site
    unc3 = clvn3.loc[[mir]].reset_index().set_index(['rstart'])
    unc3 = unc3.reindex(range(ps-WINDOW,ps+WINDOW+1)).sum()
    # hairpin fragments with both ends within WINDOW of the annotated sites
    hpn = clvh.loc[[mir]]
    hpn = hpn[hpn.apply(lambda x: (x['rstart'] in range(ps-WINDOW,ps+WINDOW+1)) and 
                                  (x['rend'] in range(pe-WINDOW,pe+WINDOW+1)),axis=1)].sum()
    lib = 'cleaved'
    if hpn[lib].sum()>=RCUTOFFH:
        tbl.loc[mir,'5p nick processing'] = (unc3[lib]+1)/(hpn[lib]+1)
        tbl.loc[mir,'3p nick processing'] = (unc5[lib]+1)/(hpn[lib]+1)

In [119]:
# Inverted processing: hairpin reads whose ends lie INVERTED nt inside the annotated
# sites, relative to hairpin reads at the annotated sites (both within +/-WINDOW,
# pseudocount 1). Skipped when there is no annotated position or fewer than RCUTOFFH
# productive hairpin reads.
for mir in overlap:
    ps, pe = preposition[mir]
    if ps<1 or pe>125:
        continue
    hpn = clvh.loc[[mir]]
    # hairpins at the annotated sites
    prohpn = hpn[hpn.apply(lambda x: (x['rstart'] in range(ps-WINDOW,ps+WINDOW+1)) and 
                                     (x['rend'] in range(pe-WINDOW,pe+WINDOW+1)),axis=1)].sum()
    # hairpins at the sites shifted INVERTED nt towards the middle of the construct
    invhpn = hpn[hpn.apply(lambda x: (x['rstart'] in range(ps+INVERTED-WINDOW,
                                                           ps+INVERTED+WINDOW+1)) and 
                                     (x['rend'] in range(pe-INVERTED-WINDOW,
                                                         pe-INVERTED+WINDOW+1)),axis=1)].sum()
    lib = 'cleaved'
    if prohpn[lib]>=RCUTOFFH:
        tbl.loc[mir,'Inverted processing'] = (invhpn[lib]+1)/(prohpn[lib]+1)

In [120]:
# Preview of the finished table.
tbl.head()

Unnamed: 0_level_0,Cleavage Efficiency,Cleavage Homogeneity,5' miRBase site,3' miRBase site,5' cleavage site,3' cleavage site,5' alternative site,3' alternative site,Cleavage ratio of alternative site,5p nick processing,3p nick processing,Inverted processing
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
hsa-let-7a-1,11.6541,0.870736,25,96,25,96,,,,0.00125471,0.0276035,0.00752823
hsa-let-7a-2,11.3244,0.767239,28,94,28,94,27.0,95.0,0.0257018,0.109134,0.0118624,0.622183
hsa-let-7a-3,8.74483,0.636835,26,94,26,95,25.0,94.0,0.101442,0.58836,0.00952381,0.0126984
hsa-let-7b,10.4792,0.753999,23,98,23,97,,,,0.0136752,0.326496,0.0358974
hsa-let-7c,9.33761,0.666916,27,93,27,93,26.0,94.0,0.0336991,0.0441941,0.373484,1.14385


In [121]:
# Plain CSV of the table; the next cell copies it into the supplementary file
# underneath a title line.
tbl.to_csv('resources/201123_s7__DROSHA_processing_with_SRSF3.csv')

In [122]:
# Write the supplementary file: a title line and four empty lines, followed by the
# CSV saved in the previous cell, copied line by line.
# Both files are opened in `with` blocks so the handles are closed (and the output
# flushed) even if an error interrupts the copy; previously the source file was
# never closed explicitly.
description = 'Supplementary Table 7. DROSHA processing with SRSF3\n\n\n\n\n'
with open('supplementary/201123_s7__DROSHA_processing_with_SRSF3.csv', 'wt') as out:
    out.write(description)
    with open('resources/201123_s7__DROSHA_processing_with_SRSF3.csv', 'rt') as src:
        for l in src:
            out.write(l)