# Supplementary Table 8. Microprocessor IVP with SRSF3
1. List pri-miRNAs with redundant construct sequences
2. Build table
3. Add redundant pri-miRNAs

In [1]:
import time
# Stamp the notebook with the reviser's name and the date of this run.
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-10-13.


In [2]:
# Work from the project root: every data path used below ('supplementary/...',
# 'srsf3/...', 'results/...', 'resources/...') is relative to this directory.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
utilpath = '/casa/bsc/notebooks/'
import sys; sys.path.append(utilpath)
from util import *

In [4]:
from basic import gen_result_dir
# gen_result_dir comes from the local `basic` module; the recorded output shows it
# returning a dated sub-directory of 'results/' (e.g. 'results/201013/').
# NOTE(review): presumably it also creates that directory - confirm in basic.py.
resultpath = gen_result_dir('results/')
print 'resultpath:', resultpath

resultpath: results/201013/


In [5]:
from __future__ import division
import pandas as pd
import numpy as np
import re
import gzip
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
%matplotlib inline

### 1. List pri-miRNAs with redundant construct sequences

In [6]:
# Load supplementary tables S1 (mature sequences), S2 (construct info) and S6
# (cleavage patterns), each indexed by its first column.
# header=1: column names are taken from the second parsed line of each file.
# NOTE(review): presumably the first line is a table title, as in the S8 file
# written at the end of this notebook - confirm against the CSVs.
s1 = pd.read_csv('supplementary/201012_s1_pri-info.csv', header=1, index_col=0)
s2 = pd.read_csv('supplementary/201012_s2_pri-construct.csv', index_col=0, header=1)
s6 = pd.read_csv('supplementary/201012_s6_cleavage-patterns.csv', header=1, index_col=0)
# Quick sanity check of the column names that the rest of the notebook relies on.
print 's1:\t%s'%', '.join(list(s1.columns))
print 's2:\t%s'%', '.join(list(s2.columns)[:6])
print 's6:\t%s'%', '.join(list(s6.columns)[-4:])

s1:	5p mature, 5p sequence, 3p mature, 3p sequence, Note
s2:	Chr, Start, End, Strand, Construct sequence, 100way phyloP scores (pre-miRNA -/+ 100nt)
s6:	Relative position from miRBase site, Cleavage Specificity, Cleavage Imbalance, Cleavage type


In [7]:
singlemp = set(s6[s6['Cleavage type']=='single'].index)
multimp = set(s6[s6['Cleavage type']=='multiple'].index)
invertedmp = set(s6[s6['Cleavage type']=='inverted'].index)
nick5mp = set(s6[s6['Cleavage type']=='nick5'].index)
nick3mp = set(s6[s6['Cleavage type']=='nick3'].index)
nickmp = list(nick5mp) + list(nick3mp)
nonspecmp = set(s6[s6['Cleavage type']=='non-specific'].index)
dcsmirsmp = list(singlemp) + list(multimp)
nodcsmp = list(invertedmp) + list(nickmp) + list(nonspecmp)
print 'Productive: %s, Unproductive: %s' % (len(dcsmirsmp), len(nodcsmp))
print 'Single: %s, Multiple: %s, Inverted: %s, Nick: %s, Non-specific: %s'\
% (len(singlemp), len(multimp), len(invertedmp), len(nickmp), len(nonspecmp))
print "5' nick: %s, 3' nick: %s" % (len(nick5mp), len(nick3mp))

Productive: 512, Unproductive: 1304
Single: 445, Multiple: 67, Inverted: 156, Nick: 107, Non-specific: 1041
5' nick: 72, 3' nick: 35


In [8]:
def get_pre_position(pri):
    """Locate the annotated mature arms of `pri` inside its construct sequence.

    The construct sequence (table `s2`) is converted to RNA (T -> U) and searched
    for the 5p and 3p sequences listed in table `s1`.

    Returns a tuple (prestart, preend) of 1-based construct coordinates:
      * prestart - first base of the leftmost 5p-sequence match,
      * preend   - last base of the rightmost 3p-sequence match.
    A coordinate is 0 when the arm is 'n.a.' or is not found in the construct.
    """
    construct = s2.loc[pri, 'Construct sequence'].replace('T', 'U')
    arm5 = s1.loc[pri, '5p sequence']
    arm3 = s1.loc[pri, '3p sequence']

    prestart = 0
    if arm5 != 'n.a.':
        hit5 = construct.find(arm5)
        if hit5 != -1:
            prestart = hit5 + 1

    preend = 0
    if arm3 != 'n.a.' and arm3 in construct:
        preend = construct.rfind(arm3) + len(arm3)

    return prestart, preend

In [9]:
# All pri-miRNA constructs (S2 order) and their (prestart, preend) coordinates.
allpris = list(s2.index)
preposition = dict((pri, get_pre_position(pri)) for pri in allpris)

In [10]:
redund = s2[s2['Note'].apply(lambda x: str(x).find('Same construct')>=0)]
repreRedund = defaultdict(list)
for pri, note in dict(redund['Note']).items():
    repre = note.split()[-1]
    repreRedund[repre].append(pri)
print len(repreRedund)

25


### 2. Input

In [11]:
# Minimum read count: applied below to input reads (to select `suffinps`) and passed
# to get_dcs as the per-fragment-class read threshold.
rcutoff = 30

In [12]:
# Build an all-zero count table: one row per pri-miRNA, one column per input library.
allinputs = [ 'input' ]
inptbl = pd.DataFrame(columns=['Pri-miRNA']+allinputs)
inptbl['Pri-miRNA'] = allpris
inptbl = inptbl.set_index('Pri-miRNA').fillna(0)

In [13]:
# Count aligned input reads per pri-miRNA.
for lib in allinputs:
    txt = 'srsf3/alignments/%s.txt.gz' % lib
    # IPython shell capture: first space-delimited field of every line of the
    # alignment summary; these fields are used below as pri-miRNA names.
    mirs = !zcat $txt | cut -d" " -f1
    # Reads assigned to 'hsa-mir-1302-2' are counted under 'hsa-mir-1302-11'.
    # NOTE(review): reason not visible here - presumably a naming difference
    # between the alignment reference and S2; confirm.
    adjmirs = [ 'hsa-mir-1302-11' if m=='hsa-mir-1302-2' else m for m in mirs ]
    mircnts = pd.Series(Counter(adjmirs))
    # In-place update: only names already present in inptbl's index receive counts.
    inptbl[lib].update(mircnts)

In [14]:
# Copy each representative's input counts to the pri-miRNAs sharing its construct.
for pri in inptbl.index:
    for pri2 in repreRedund.get(pri, ()):
        inptbl.loc[pri2] = inptbl.loc[pri]

In [15]:
inptbl.head(3)

Unnamed: 0_level_0,input
Pri-miRNA,Unnamed: 1_level_1
hsa-let-7a-1,706
hsa-let-7a-2,2225
hsa-let-7a-3,2916


In [16]:
rcutoff = 30
suffinps = sorted([ mir for mir in allpris if inptbl.loc[mir, 'input']>=rcutoff ])
print len(suffinps)

1749


### 3. Cleavage products

In [17]:
def cigar_to_len(cigar):
    """Return the number of reference bases spanned by a CIGAR string.

    Lengths of every operation that consumes the reference are summed:
    M (match/mismatch), D (deletion), N (skipped region), = (match) and
    X (mismatch). Operations that do not consume the reference (I, S, H, P)
    are ignored. A CIGAR without any operation (e.g. '*') gives 0.

    The previous implementation split on '[A-Z]' only, so CIGARs using the
    extended '=' / 'X' operations were not tokenised and silently returned 0.
    """
    ref_consuming = frozenset('MDN=X')
    # Each token is (<count>, <op>); op membership is tested per character, not
    # as a substring of 'DM'.
    return sum(int(count) for count, op in re.findall(r'([0-9]+)([A-Z=])', cigar)
               if op in ref_consuming)

In [18]:
# Names of the two cleaved-product libraries (one alignment summary file each).
allclvs = 'cleaved-1 cleaved-2'.split()
# Offset subtracted from alignment coordinates when computing rstart/rend below.
# NOTE(review): presumably the length of an adapter flanking the construct - confirm.
adaptlen = 20

In [19]:
# Count reads per (pri-miRNA, start, end) for every cleaved-product library.
d = {}
for lib in allclvs:
    summaryfile = 'srsf3/alignments/%s.txt.gz' % lib
    # Use a context manager so the gzip handle is closed after parsing
    # (it was previously left open for each library).
    with gzip.open(summaryfile, 'rb') as handle:
        tbl = pd.read_table(handle, sep=' ',
                            names=['Pri-miRNA', 'start_R2', 'cigar', 'start_R1'])
    # Shift coordinates by adaptlen and clip them to the range 1..125.
    tbl['rstart'] = tbl['start_R1'].apply(lambda x: max(1, x-adaptlen))
    tbl['rlen'] = tbl['cigar'].apply(cigar_to_len)
    tbl['rend_tmp'] = tbl['start_R2']+tbl['rlen']-1
    tbl['rend'] = tbl['rend_tmp'].apply(lambda x: min(125, x-adaptlen))
    cnttbl = tbl.groupby(['Pri-miRNA', 'rstart', 'rend']).size()
    d[lib] = cnttbl

In [20]:
# One row per (pri-miRNA, rstart, rend) with a read-count column per library;
# fragments spanning the whole 1..125 range (uncleaved) are set aside in `unclv`.
clvtbl = pd.DataFrame(d).fillna(0).astype(int).reset_index()
_full_length = (clvtbl['rstart']==1)&(clvtbl['rend']==125)
unclv = clvtbl[_full_length]
clvtbl = clvtbl.drop(unclv.index)

In [21]:
# Duplicate each representative's fragment rows for the pri-miRNAs that share its
# construct sequence (see repreRedund).
for pri in set(clvtbl['Pri-miRNA']):
    if pri in repreRedund:
        for pri2 in repreRedund[pri]:
            new = clvtbl[clvtbl['Pri-miRNA']==pri].copy()
            new['Pri-miRNA'] = pri2
            # Appended copies keep the row labels of the rows they were copied from,
            # so clvtbl's index is no longer unique after this loop.
            clvtbl = clvtbl.append(new)
clvtbl = clvtbl.sort_values(['Pri-miRNA', 'rstart', 'rend'])
print len(set(clvtbl['Pri-miRNA']))
clvtbl.head()

1876


Unnamed: 0,Pri-miRNA,rstart,rend,cleaved-1,cleaved-2
0,hsa-let-7a-1,1,7,1,0
1,hsa-let-7a-1,1,10,2,1
2,hsa-let-7a-1,1,11,1,0
3,hsa-let-7a-1,1,12,0,2
4,hsa-let-7a-1,1,13,24,17


### 4. Cleavage productivity

In [22]:
# Half-width (nt) of the position window passed to get_clv_prod / get_clv_inv.
window = 3

In [23]:
# Pool the two cleaved libraries into a single 'cleaved' count column.
clvtbl['cleaved'] = clvtbl['cleaved-1'].add(clvtbl['cleaved-2'])

In [24]:
# Partition fragments by where they start/end relative to the construct ends
# (positions 1 and 125) and its midpoint (63).
start, mid, end = 1, 63, 125
# clv5f: fragments starting at position 1 and ending before the midpoint.
clv5f = clvtbl[(clvtbl['rstart']==start)&(clvtbl['rend']<mid)].set_index('Pri-miRNA')
# clv3f: fragments starting after the midpoint and ending at position 125.
clv3f = clvtbl[(clvtbl['rstart']>mid)&(clvtbl['rend']==end)].set_index('Pri-miRNA')
# clvhn: fragments spanning the midpoint ...
clvhn = clvtbl[(clvtbl['rstart']<mid)&(clvtbl['rend']>mid)]
# ... of which clvn5 start at position 1 and clvn3 end at position 125,
clvn5 = clvhn[clvhn['rstart']==start]
clvn3 = clvhn[clvhn['rend']==end]
# ... and clvh is the remainder. Rows are dropped by index label here.
clvh = clvhn.drop(clvn5.index).drop(clvn3.index).set_index('Pri-miRNA')
clvn5 = clvn5.set_index('Pri-miRNA'); clvn3 = clvn3.set_index('Pri-miRNA')

In [25]:
def fill_unfound(tbl):
    """Return `tbl` extended with an all-zero row for every absent pri-miRNA.

    Every name in `allpris` that is not already in `tbl.index` gets one row with
    0 in every column, appended after the existing rows.
    """
    present = set(tbl.index)
    missing = []
    for mir in allpris:
        if mir not in present:
            missing.append(mir)
    zero_rows = pd.DataFrame(index=missing, columns=tbl.columns).fillna(0)
    return tbl.append(zero_rows)

In [26]:
# Give every pri-miRNA at least one (all-zero) row in each fragment table, so that
# later .loc[mir] lookups never fail. clvtbl is re-indexed by pri-miRNA here.
clvtbl = fill_unfound(clvtbl.set_index('Pri-miRNA'))
clv5f, clv3f, clvh, clvn5, clvn3 = map(fill_unfound, (clv5f, clv3f, clvh, clvn5, clvn3))

In [27]:
def get_clv_all(mir):
    """Return the cleaved read count of `mir`.

    This is the rounded mean of its total 5' fragment count (clv5f) and its total
    3' fragment count (clv3f), taken from the pooled 'cleaved' column.
    """
    total5 = clv5f.loc[mir, 'cleaved'].sum()
    total3 = clv3f.loc[mir, 'cleaved'].sum()
    mean_count = (total5 + total3) / 2
    return int(round(mean_count, 0))

In [28]:
clvcnts = { mir:get_clv_all(mir) for mir in suffinps }
print clvcnts['hsa-let-7a-1']

6676


In [29]:
def get_clv_prod(mir, window):
    """Count fragments of `mir` cut within `window` nt of its annotated sites.

    5' fragments are counted when they end in [ps-1-window, ps-1+window];
    3' fragments when they start in [pe+1-window, pe+1+window], where (ps, pe)
    come from `preposition`. With both sites annotated the two counts are
    averaged and rounded; with only one annotated, that side alone is used.
    Returns -1 when neither site is annotated (ps == pe == 0).
    """
    ps, pe = preposition[mir]
    if ps==0 and pe==0:
        return -1

    ends5 = clv5f.loc[[mir]].reset_index().set_index('rend')['cleaved']
    starts3 = clv3f.loc[[mir]].reset_index().set_index('rstart')['cleaved']

    def near_5p_site():
        return ends5.reindex(range(ps-1-window, ps+window)).sum()

    def near_3p_site():
        return starts3.reindex(range(pe+1-window, pe+2+window)).sum()

    if ps and pe:
        prod = round((near_5p_site() + near_3p_site())/2, 0)
    elif ps:
        prod = near_5p_site()
    else:
        prod = near_3p_site()
    return int(prod)

In [30]:
prodcnts = { mir:get_clv_prod(mir,window) for mir in suffinps }
print prodcnts['hsa-let-7a-1']

6608


In [31]:
def get_clv_inv(mir, window, offset=11):
    """Count fragments of `mir` cut near the sites shifted inward by `offset`.

    Same counting as get_clv_prod, but around ps+offset (5' side) and pe-offset
    (3' side) instead of the annotated sites (ps, pe) from `preposition`.

    Parameters
    ----------
    mir : pri-miRNA name (key of `preposition`, row label of clv5f / clv3f).
    window : half-width (nt) of the accepted position window.
    offset : inward shift (nt) of the counted sites. Defaults to 11, the value
        that used to be hard-coded here and that is also used as `offset` for
        get_invert later in the notebook.

    Returns
    -------
    int : rounded mean of both sides when both sites are annotated, the single
        available side otherwise, or -1 when neither site is annotated.
    """
    ps, pe = preposition[mir]
    if ps==0 and pe==0:
        return -1
    invs = ps+offset
    inve = pe-offset
    clv5 = clv5f.loc[[mir]].reset_index().set_index('rend')['cleaved']
    clv3 = clv3f.loc[[mir]].reset_index().set_index('rstart')['cleaved']
    if ps and pe:
        prod5 = clv5.reindex(range(invs-1-window,invs+window)).sum()
        prod3 = clv3.reindex(range(inve+1-window,inve+2+window)).sum()
        prod = round((prod5+prod3)/2, 0)
    elif ps:
        prod = clv5.reindex(range(invs-1-window,invs+window)).sum()
    else:
        prod = clv3.reindex(range(inve+1-window,inve+2+window)).sum()
    return int(prod)

In [32]:
invcnts = { mir:get_clv_inv(mir,window) for mir in suffinps }
print invcnts['hsa-let-7a-1']

8


In [33]:
def get_productivity(mir, window):
    """Return the raw cleavage productivity of `mir`.

    productivity = (prod / input) * (prod / cleaved), where `prod` is
    get_clv_prod(mir, window), `cleaved` is get_clv_all(mir) and `input` is the
    input read count from `inptbl`.

    Returns -999 when no site is annotated (prod < 0) and 0 when there are no
    cleaved reads.
    """
    cleaved = get_clv_all(mir)
    productive = get_clv_prod(mir, window)
    if productive < 0:
        return -999
    if cleaved == 0:
        return 0
    input_reads = inptbl.loc[mir, 'input']
    efficiency = productive / input_reads
    accuracy = productive / cleaved
    return efficiency * accuracy

In [34]:
## norm
# Reference pri-miRNA: its raw productivity is the unit by which all other
# productivities are divided before the log2 transform in the next cell.
norm = 'hsa-mir-6788'
norms = get_productivity(norm, window)
print norms

0.00794248445143798


In [35]:
# Normalized productivity, log2(productivity / reference + 1), for every pri-miRNA
# that has an annotated site (get_productivity returns -999 otherwise).
normprods = {}
for mir in suffinps:
    raw = get_productivity(mir, window)
    if not raw > -999:
        continue
    normprods[mir] = np.log2(raw/norms+1)

### 5. Cleavage specificity

In [37]:
# Thresholds passed to get_dcs below:
# rcutoff - minimum reads a fragment class needs to be used,
# fcutoff - minimum fraction a cleavage position needs to be considered.
rcutoff = 30
fcutoff = .01

In [38]:
def get_frac_5frag(mir):
    """Return the 5' fragment rows of `mir`, indexed by 'rend', with every column
    divided by its own column total."""
    by_end = clv5f.loc[[mir]].set_index('rend')
    col_totals = by_end.sum()
    return by_end / col_totals

In [39]:
def get_frac_3frag(mir):
    """Return the 3' fragment rows of `mir`, indexed by 'rstart', with every column
    divided by its own column total."""
    by_start = clv3f.loc[[mir]].set_index('rstart')
    col_totals = by_start.sum()
    return by_start / col_totals

In [40]:
def get_frac_hairpin(mir):
    """Return the hairpin fragment rows of `mir`, indexed by ('rstart', 'rend'),
    with every column divided by its own column total."""
    by_span = clvh.loc[[mir]].set_index(['rstart', 'rend'])
    col_totals = by_span.sum()
    return by_span / col_totals

In [41]:
def filter_frac(frac, fcutoff):
    """Keep only the entries of `frac` whose value is at least `fcutoff`."""
    passes = frac >= fcutoff
    return frac[passes]

In [42]:
def add_specificity(row):
    """Set row['cleavage specificity'] = log2(mean(5frag, hairpin, 3frag) + 1).

    The row is modified in place and returned.
    """
    mean_fraction = (row['5frag'] + row['hairpin'] + row['3frag']) / 3
    row['cleavage specificity'] = np.log2(mean_fraction + 1)
    return row

In [43]:
def add_diff(row):
    """Set row['diff'] to the offset of the observed site from the annotated one.

    5' offset = hstart - pstart, 3' offset = pend - hend. With both annotated
    ends present (non-zero) the offset with the smaller absolute value is used
    (the 5' offset on ties); otherwise the offset of the single annotated end.
    The row is modified in place and returned.
    """
    has_5p = row['pstart']
    if has_5p and row['pend']:
        off5 = row['hstart'] - row['pstart']
        off3 = row['pend'] - row['hend']
        row['diff'] = off5 if abs(off5) <= abs(off3) else off3
        return row
    if has_5p:
        row['diff'] = row['hstart'] - row['pstart']
        return row
    row['diff'] = row['pend'] - row['hend']
    return row

In [44]:
def drop_duplicates(sortedtbl):
    """Greedily keep rows that share neither 'hstart' nor 'hend' with a kept row.

    The first row is always kept; every later row with the same 'hstart' or the
    same 'hend' is discarded, and the procedure recurses on the remainder.
    Tables with at most one row are returned unchanged.
    """
    if len(sortedtbl) > 1:
        head = sortedtbl.iloc[:1]
        top_start = sortedtbl['hstart'].tolist()[0]
        top_end = sortedtbl['hend'].tolist()[0]
        rest = sortedtbl.iloc[1:]
        distinct = (rest['hstart'] != top_start) & (rest['hend'] != top_end)
        return head.append(drop_duplicates(rest[distinct]))
    return sortedtbl

In [45]:
# Per-pri-miRNA fraction tables for 5' fragments, 3' fragments and hairpins.
frac5f = dict((mir, get_frac_5frag(mir)) for mir in allpris)
frac3f = dict((mir, get_frac_3frag(mir)) for mir in allpris)
frachpn = dict((mir, get_frac_hairpin(mir)) for mir in allpris)

In [46]:
# Column layout of the cleavage-site tables built by get_dcs.
COLS = [ 'miRNA', 'pstart', 'pend', 'hstart', 'hend', '5frag', '3frag', 'hairpin', 
         'cleavage specificity', 'diff' ]
def get_dcs(mir, rcutoff, fcutoff, lib):
    """Return the table of candidate cleavage-site pairs of `mir` (columns COLS).

    mir     : pri-miRNA name.
    rcutoff : minimum summed read count (column `lib`) a fragment class needs
              in order to be used.
    fcutoff : minimum fraction a cleavage position needs to seed a candidate.
    lib     : count/fraction column to use (e.g. 'cleaved').

    The result always contains a placeholder row (hstart=1, hend=125, all
    fractions 0). If fewer than two of the three fragment classes (5' fragment,
    3' fragment, hairpin) are usable, only that placeholder is returned (without
    recomputed specificity/diff). Otherwise rows are ranked by cleavage
    specificity (highest first), filtered with drop_duplicates, and annotated
    with 'diff' by add_diff.
    """
    ps, pe = preposition[mir]
    # Placeholder row: full-length span 1..125, zero fractions, specificity 0, diff 99.
    clvsites = pd.DataFrame({0:dict(zip(COLS,[mir,ps,pe,1,125,0,0,0,0,99]))}).T[COLS]
    # For each fragment class take its fraction column, or an empty Series when the
    # class has fewer than rcutoff reads in `lib`.
    clv5, clv3, clvhpn = [ f[mir][lib] if c.loc[mir,lib].sum()>=rcutoff else pd.Series()
                           for f,c in zip([frac5f,frac3f,frachpn], [clv5f,clv3f,clvh]) ]
    # Need at least two non-empty fragment classes to call sites.
    if sum(map(bool, [clv5.to_dict(),clv3.to_dict(),clvhpn.to_dict()]))<2:
        return clvsites
    # defaultdict copies so that positions without reads look up as 0.0.
    clv52 = defaultdict(float); clv52.update(clv5)
    clv32 = defaultdict(float); clv32.update(clv3)
    clvhpn2 = defaultdict(float); clvhpn2.update(clvhpn)
    # Candidates seeded by fragment ends: a 5' fragment ending at c5 paired with a
    # 3' fragment starting at c3 implies a hairpin spanning (c5+1, c3-1).
    # Only 8 values are zipped, so 'cleavage specificity' and 'diff' stay unset here.
    for c5, frac5 in filter_frac(clv5, fcutoff).items():
        for c3, frac3 in filter_frac(clv3, fcutoff).items():
            row = dict(zip(COLS,[mir,ps,pe,c5+1,c3-1,frac5,frac3,clvhpn2[(c5+1,c3-1)]]))
            clvsites = clvsites.append(row, ignore_index=True)
    # Candidates seeded by hairpins: the matching 5' fragment ends at c5-1 and the
    # matching 3' fragment starts at c3+1.
    for (c5,c3), frach in filter_frac(clvhpn, fcutoff).items():
        row = dict(zip(COLS,[mir,ps,pe,c5,c3,clv52[c5-1],clv32[c3+1],frach]))
        clvsites = clvsites.append(row, ignore_index=True)
    # Sort ascending by specificity, then reverse so the best site comes first.
    cs = clvsites.apply(add_specificity,axis=1).sort_values('cleavage specificity').iloc[::-1]
    return drop_duplicates(cs).apply(add_diff,axis=1)

In [47]:
get_dcs('hsa-let-7d', rcutoff, fcutoff, 'cleaved')

Unnamed: 0,miRNA,pstart,pend,hstart,hend,5frag,3frag,hairpin,cleavage specificity,diff
11,hsa-let-7d,23,98,23,97,0.855623,0.877309,0.660441,0.846226,0
1,hsa-let-7d,23,98,22,96,0.07135,0.0563,0.00034,0.060273,-1
9,hsa-let-7d,23,98,24,98,0.050041,0.046503,0.001358,0.046329,0
14,hsa-let-7d,23,98,45,76,0.000217,2.4e-05,0.019694,0.009556,22
0,hsa-let-7d,23,98,1,125,0.0,0.0,0.0,0.0,-22


In [48]:
# Call cleavage sites for every sufficiently covered pri-miRNA and stack the results.
# The two time stamps bracket the loop; the recorded run took about 27 minutes.
print time.ctime()
clvall = pd.DataFrame(columns=COLS)
for mir in suffinps:
    # NOTE(review): appending inside the loop re-copies the growing table on every
    # iteration; collecting the per-miRNA tables and concatenating once would avoid that.
    clvall = clvall.append(get_dcs(mir, rcutoff, fcutoff, 'cleaved'), ignore_index=True)
clvall = clvall[COLS]
print time.ctime()
print '# of all sites:\t%s' % len(clvall)
print '# of miRNAs:\t%s' % len(set(clvall['miRNA']))

Mon Oct 12 15:23:43 2020
Mon Oct 12 15:50:44 2020
# of all sites:	21625
# of miRNAs:	1749


In [36]:
# Cache of the slow site-calling loop above: un-comment the first line to (re)write
# the cache after recomputing clvall; the second line replaces clvall with the
# cached table.
# NOTE(review): the path is hard-coded to the 201013 results directory rather than
# `resultpath`, and the execution counter restarts at this cell in the recorded run,
# so the cells below were presumably run from the cached file - confirm.
#clvall.to_csv('results/201013/201013_cleavage_specificity_SRSF3.csv')
clvall = pd.read_csv('results/201013/201013_cleavage_specificity_SRSF3.csv', index_col=0)

In [37]:
clvall.head()

Unnamed: 0,miRNA,pstart,pend,hstart,hend,5frag,3frag,hairpin,cleavage specificity,diff
0,hsa-let-7a-1,25,96,25,96,0.938626,0.977131,0.706897,0.906288,0
1,hsa-let-7a-1,25,96,42,80,0.000229,0.0,0.017241,0.008377,16
2,hsa-let-7a-1,25,96,47,75,0.0,0.0,0.017241,0.008268,21
3,hsa-let-7a-1,25,96,1,125,0.0,0.0,0.0,0.0,-24
4,hsa-let-7a-2,28,94,28,94,0.844472,0.959026,0.446768,0.807428,0


### 6. Cleavage type

In [38]:
# Thresholds used by the cleavage-type classifiers below.
# window: maximum |diff| (nt) for a site to count as being at the annotated position.
window = 3
# offset: diff value around which sites are treated as inverted (get_invert).
offset = 11
# procut: minimum normalized productivity (normprods) of a productive pri-miRNA.
procut = 1.4
# specut: minimum cleavage specificity of the site that defines the class.
specut = .42
# altcut: minimum cleavage specificity for a site to count towards 'multiple'.
altcut = .26
# imbcut: minimum difference between the 5frag and 3frag fractions to call a nick.
imbcut = .2

In [39]:
overlap = [ mir for mir in set(clvall['miRNA']) if mir in normprods ]
print len(overlap)

1746


In [40]:
def get_dcs_all(procut, specut, window):
    """Return the productively processed pri-miRNAs.

    A pri-miRNA qualifies when it has a site within `window` nt of the annotated
    position (|diff| <= window) whose cleavage specificity is at least `specut`,
    and its normalized productivity (normprods) is at least `procut`.

    Pri-miRNAs present in clvall but absent from normprods (no productivity could
    be computed) are skipped instead of raising KeyError.
    """
    clvpro = clvall[abs(clvall['diff'])<=window]
    clvsig = clvpro[clvpro['cleavage specificity']>=specut]
    return [ mir for mir in set(clvsig['miRNA'])
             if mir in normprods and normprods[mir]>=procut ]

In [41]:
def get_dcs_multi(procut, specut, altcut, window):
    """Return the productive pri-miRNAs that have more than one cleavage site.

    Same criteria as get_dcs_all, plus: more than one site within `window` nt of
    the annotated position has a cleavage specificity of at least `altcut`.

    Pri-miRNAs present in clvall but absent from normprods are skipped instead
    of raising KeyError.
    """
    clvpro = clvall[abs(clvall['diff'])<=window]
    clvsig = clvpro[clvpro['cleavage specificity']>=specut]
    # Number of sites per pri-miRNA that pass the alternative-site cutoff.
    altcnts = Counter(clvpro[clvpro['cleavage specificity']>=altcut]['miRNA'])
    return [ mir for mir in set(clvsig['miRNA'])
             if mir in normprods and normprods[mir]>=procut and altcnts[mir]>1 ]

In [42]:
def get_dcs_single(procut, specut, altcut, window):
    """Return the productive pri-miRNAs (get_dcs_all) that are not in the
    multiple-site class (get_dcs_multi), in get_dcs_all order."""
    productive = get_dcs_all(procut, specut, window)
    multiple = set(get_dcs_multi(procut, specut, altcut, window))
    return [ mir for mir in productive if mir not in multiple ]

In [43]:
def get_invert(specut, offset, window, mirexc):
    """Return pri-miRNAs with a site whose diff lies within `window` of `offset`
    and whose cleavage specificity is at least `specut`, excluding those in
    `mirexc`."""
    near_offset = abs(clvall['diff'] - offset) <= window
    specific = clvall['cleavage specificity'] >= specut
    candidates = set(clvall[near_offset & specific]['miRNA'])
    return [ mir for mir in candidates if mir not in mirexc ]

In [44]:
def get_nick5(imbcut, window, mirexc):
    """Return pri-miRNAs with a site near the annotated position (|diff| <= window,
    3frag > 0) where 5frag exceeds 3frag by at least `imbcut`, excluding `mirexc`."""
    near_site = (abs(clvall['diff']) <= window) & (clvall['3frag'] > 0)
    sites = clvall[near_site]
    imbalanced = sites[(sites['5frag'] - sites['3frag']) >= imbcut]
    return [ mir for mir in set(imbalanced['miRNA']) if mir not in mirexc ]

In [45]:
def get_nick3(imbcut, window, mirexc):
    """Return pri-miRNAs with a site near the annotated position (|diff| <= window,
    5frag > 0) where 3frag exceeds 5frag by at least `imbcut`, excluding `mirexc`."""
    near_site = (abs(clvall['diff']) <= window) & (clvall['5frag'] > 0)
    sites = clvall[near_site]
    imbalanced = sites[(sites['3frag'] - sites['5frag']) >= imbcut]
    return [ mir for mir in set(imbalanced['miRNA']) if mir not in mirexc ]

In [46]:
dcsmirs = get_dcs_all(procut, specut, window)
single = get_dcs_single(procut, specut, altcut, window)
multi = get_dcs_multi(procut, specut, altcut, window)

nick5 = get_nick5(imbcut, window, dcsmirs)
nick3 = get_nick3(imbcut, window, dcsmirs+nick5)
nick = list(set(nick5+nick3))

inverted = get_invert(specut, offset, window, dcsmirs+nick)
nodcs = [ mir for mir in overlap if mir not in dcsmirs ]
nonspec = [ mir for mir in nodcs if mir not in inverted+nick ]

print 'Productive: %s, Unproductive: %s' % (len(dcsmirs), len(nodcs))
print 'Single: %s, Multiple: %s, Inverted: %s, Nick: %s, Non-specific: %s'\
% (len(single), len(multi), len(inverted), len(nick), len(nonspec))
print "5' nick: %s, 3' nick: %s" % (len(nick5), len(nick3))

Productive: 681, Unproductive: 1065
Single: 589, Multiple: 92, Inverted: 102, Nick: 52, Non-specific: 911
5' nick: 26, 3' nick: 26


In [47]:
# Spot checks: class membership of a few individual pri-miRNAs.
print 'let-7a-1 in unique:', 'hsa-let-7a-1' in single
print 'mir-342 in multiple:', 'hsa-mir-342' in multi
print 'mir-17 in single:', 'hsa-mir-17' in single
print 'mir-15a in single:', 'hsa-mir-15a' in single

let-7a-1 in unique: True
mir-342 in multiple: True
mir-17 in single: True
mir-15a in single: True


In [48]:
clvtypes = {}
clvtypes.update({mir:'single' for mir in single})
clvtypes.update({mir:'multiple' for mir in multi})
clvtypes.update({mir:'inverted' for mir in inverted})
clvtypes.update({mir:'nick5' for mir in nick5})
clvtypes.update({mir:'nick3' for mir in nick3})
clvtypes.update({mir:'non-specific' for mir in nonspec})
print len(clvtypes)

1746


### 7. Comparison with data without SRSF3

In [51]:
# Back-transform the log2(x+1) productivities to linear scale, without (S6) and
# with SRSF3 (this notebook). S6 is reduced to its first row per pri-miRNA.
s6uniq = s6[~s6.index.duplicated()]
_unlog2 = lambda value: 2**value-1
productive_wo_srsf3 = dict((m, _unlog2(cp)) for m, cp in s6uniq['Cleavage Productivity'].items())
productive_w_srsf3 = dict((m, _unlog2(cp)) for m, cp in normprods.items())

In [52]:
def _second_best(scores):
    """Back-transformed (2**x - 1) second-highest score, or 0 with fewer than two scores."""
    ranked = sorted(scores)
    return 2**ranked[-2]-1 if len(ranked)>1 else 0

# Alternative processing: strength of the second-best site, without and with SRSF3.
s6clv = s6[s6['Cleavage Specificity']>0].reset_index()
alternative_wo_srsf3 = dict((m, _second_best(grp['Cleavage Specificity']))
                            for m, grp in s6clv.groupby('Pri-miRNA'))
s8clv = clvall[(clvall['diff'].apply(abs)<=window)&(clvall['cleavage specificity']>0)]
alternative_w_srsf3 = dict((m, _second_best(grp['cleavage specificity']))
                           for m, grp in s8clv.groupby('miRNA'))

In [53]:
# Nick processing: summed 5'/3' imbalance over the sites of each pri-miRNA.
nick_wo_srsf3 = dict((m, grp['Cleavage Imbalance'].sum()) for m, grp in s6clv.groupby('Pri-miRNA'))
nick_w_srsf3 = dict((m, (grp['5frag']-grp['3frag']).sum()) for m, grp in s8clv.groupby('miRNA'))

In [54]:
# Inverted processing: inversely cleaved reads as a fraction of all cleaved reads.
s6det = s6uniq[s6uniq['Cleaved']>0]
inv_wo_srsf3 = (s6det['Inversely cleaved']/s6det['Cleaved']).to_dict()
inv_w_srsf3 = dict((m, invcnts[m]/clvcnts[m]) for m in suffinps if clvcnts[m]>0)

In [55]:
ovpro = set(productive_wo_srsf3.keys())&set(productive_w_srsf3.keys())
ovalt = set(alternative_wo_srsf3.keys())&set(alternative_w_srsf3.keys())
ovnic = set(nick_wo_srsf3.keys())&set(nick_w_srsf3.keys())
ovinv = set(inv_wo_srsf3.keys())&set(inv_w_srsf3.keys())
ovall = ovpro&ovalt&ovnic&ovinv
print len(ovall)

1502


### 8. Build table

In [56]:
prosmp = s6['Cleavage Productivity'].to_dict()
proclv = s6[abs(s6['Relative position from miRBase site'])<=window]\
.sort_values('Cleavage Specificity')
spesmp = proclv[~proclv.index.duplicated(keep='last')]['Cleavage Specificity'].to_dict()
print len(prosmp), len(spesmp)

1816 1816


In [57]:
clvpro = clvall[(clvall['diff'].apply(abs)<=window)&(clvall['cleavage specificity']>0)]
clvmax = clvpro.sort_values('cleavage specificity', ascending=False).drop_duplicates('miRNA')\
.set_index('miRNA')
print len(clvmax)
clvmax.head()

1571


Unnamed: 0_level_0,pstart,pend,hstart,hend,5frag,3frag,hairpin,cleavage specificity,diff
miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
hsa-mir-144,33,89,33,90,0.968697,0.980164,0.951562,0.975856,0
hsa-mir-7-3,31,125,31,90,0.981457,0.973863,0.918292,0.969286,0
hsa-mir-125a,31,90,31,90,0.980695,0.963694,0.925631,0.968403,0
hsa-mir-514b,33,90,33,90,0.967912,0.972678,0.917978,0.965585,0
hsa-mir-302b,32,90,32,90,0.937693,0.978191,0.94198,0.965412,0


In [58]:
mlist = [ mir for mir in overlap if mir in prosmp.keys() ]
mlistpro = [ mir for mir in mlist if mir in clvmax.index ]
mlistnp = [ mir for mir in mlist if mir not in clvmax.index ]
print len(mlist), len(mlistpro), len(mlistnp)

1745 1570 175


In [59]:
def get_clv_info(mir, hstart, hend):
    """Convert construct positions of the two cleavage sites to genomic coordinates.

    Uses 'Chr', 'Start', 'End' and 'Strand' of `mir` from table `s2`: on the '+'
    strand a position p maps to Start + p - 1, otherwise to End - p + 1.
    Returns (chrom, 5' site, 3' site, strand).
    """
    chrom, pstart, pend, strand = s2.loc[mir, ['Chr', 'Start', 'End', 'Strand']]
    if strand == '+':
        to_genome = lambda pos: pstart + pos - 1
    else:
        to_genome = lambda pos: pend - pos + 1
    return chrom, to_genome(hstart), to_genome(hend), strand

In [60]:
# Column order of Supplementary Table 8. The first ten columns are filled for every
# reported pri-miRNA; the remaining eight (Productive_... to Inverted_...) only for
# pri-miRNAs in `ovall`.
cols = [ 'Pri-miRNA', 'Chr', "5' cleavage site", "3' cleavage site", 'Strand', 'Cleavage type',
         'CP_wo_SRSF3', 'CP_w_SRSF3', 'CS_wo_SRSF3', 'CS_w_SRSF3', 'Productive_wo_SRSF3',
         'Productive_w_SRSF3', 'Alternative_wo_SRSF3', 'Alternative_w_SRSF3', 'Nick_wo_SRSF3',
         'Nick_w_SRSF3', 'Inverted_wo_SRSF3', 'Inverted_w_SRSF3' ]

In [61]:
# Rows for pri-miRNAs with a called site near the annotated position.
tbl = pd.DataFrame(columns=cols)
for mir in mlistpro:
    hs, he, mcs = clvmax.loc[mir,['hstart','hend','cleavage specificity']]
    chrom, cs5, cs3, strand = get_clv_info(mir, hs, he)
    # Values for the first ten columns (identity, site coordinates, type, CP and CS).
    values = [mir, chrom, cs5, cs3, strand, clvtypes[mir],
              prosmp[mir], normprods[mir], spesmp[mir], mcs]
    # The eight comparison metrics are only available for pri-miRNAs in ovall;
    # zip() stops at the shorter list, leaving those columns empty otherwise.
    if mir in ovall:
        values += [productive_wo_srsf3[mir], productive_w_srsf3[mir],
                   alternative_wo_srsf3[mir], alternative_w_srsf3[mir],
                   nick_wo_srsf3[mir], nick_w_srsf3[mir],
                   inv_wo_srsf3[mir], inv_w_srsf3[mir]]
    tbl = tbl.append(dict(zip(cols, values)), ignore_index=True)

In [62]:
# Rows for pri-miRNAs without a called site: coordinates are 'n.a.' and the
# with-SRSF3 cleavage specificity is 0.
for mir in mlistnp:
    cp_wo_srsf3, cp_w_srsf3 = prosmp[mir], normprods[mir]
    cs_wo_srsf3 = spesmp[mir]
    ctype = clvtypes[mir]
    values = [mir] + ['n.a.']*4 + [ctype, cp_wo_srsf3, cp_w_srsf3, cs_wo_srsf3, 0]
    tbl = tbl.append(dict(zip(cols, values)), ignore_index=True)

In [63]:
tbl = tbl.sort_values('Pri-miRNA').set_index('Pri-miRNA')
print len(set(tbl.index))

1745


In [64]:
tbl.head()

Unnamed: 0_level_0,Chr,5' cleavage site,3' cleavage site,Strand,Cleavage type,CP_wo_SRSF3,CP_w_SRSF3,CS_wo_SRSF3,CS_w_SRSF3,Productive_wo_SRSF3,Productive_w_SRSF3,Alternative_wo_SRSF3,Alternative_w_SRSF3,Nick_wo_SRSF3,Nick_w_SRSF3,Inverted_wo_SRSF3,Inverted_w_SRSF3
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
hsa-let-7a-1,chr9,94176000.0,94176000.0,+,single,5.0472,10.189134,0.912688,0.906288,32.064234,1166.440704,0.017713,0.0,-0.009062,-0.038505,0.000719,0.001198
hsa-let-7a-2,chr11,122147000.0,122147000.0,-,single,4.199327,9.115665,0.838326,0.807428,17.370601,553.73893,0.0,0.0261171,-0.184113,-0.059823,0.015811,0.040072
hsa-let-7a-3,chr22,46112800.0,46112800.0,+,single,2.40594,6.810394,0.584654,0.709306,4.299808,111.236207,0.095455,0.0986152,0.001902,0.009714,0.022172,0.005435
hsa-let-7b,chr22,46113700.0,46113800.0,+,single,3.906021,9.640643,0.821828,0.817137,13.990958,797.220308,0.009933,0.0,-0.030746,-0.02585,0.015024,0.003285
hsa-let-7c,chr21,16539800.0,16539900.0,+,single,0.553681,8.199906,0.16319,0.738939,0.467826,293.047554,0.008762,0.029409,0.050677,-0.080467,0.561295,0.026231


In [65]:
# Plain CSV (no title line); the next cell copies it into the supplementary file.
tbl.to_csv('resources/201012_s8_cleavage-patterns-srsf3.csv')

In [66]:
# Write the supplementary file: a title line, blank lines, then the CSV produced in
# the previous cell, copied line by line.
description = 'Supplementary Table 8. Microprocessor IVP with SRSF3 \n\n\n\n\n'
# Context managers close both files even if a write fails; previously the source
# handle was never closed and `out` was only closed when no error occurred.
with open('supplementary/201012_s8_cleavage-patterns-srsf3.csv', 'wt') as out:
    out.write(description)
    with open('resources/201012_s8_cleavage-patterns-srsf3.csv', 'rt') as src:
        for l in src:
            out.write(l)