# Supplementary Table 7. Cleavage sites
1. Load information
2. Cleavage site
3. Productive Cleavage Specificity
4. Build table

In [1]:
import time
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-10-13.


In [2]:
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
from __future__ import division
import sys; sys.path.append('/casa/bsc/notebooks/')
from util import *
from basic import gen_result_dir
resultpath = gen_result_dir('results/')
print 'resultpath:', resultpath

resultpath: results/201013/


In [4]:
import pandas as pd
import numpy as np
import re
import gzip
from matplotlib import pyplot as plt
from matplotlib import gridspec, lines
%matplotlib inline

In [5]:
LIBRARIES = [ 'set1', 'set2', 'set3', 'set4', 'set5' ]

#### Import supplementary tables

In [6]:
s1 = pd.read_csv('supplementary/201012_s1_pri-info.csv', header=1, index_col=0)
s2 = pd.read_csv('supplementary/201012_s2_pri-construct.csv', header=1, index_col=0)
s4 = pd.read_csv('supplementary/201012_s4_cleavage-product.csv', header=1)
s5 = pd.read_csv('supplementary/201012_s5_pri-structure.csv', header=1, index_col=0)
s6 = pd.read_csv('supplementary/201012_s6_cleavage-patterns.csv', header=1, index_col=0)
print 's1:\t%s'%', '.join(list(s1.columns))
print 's2:\t%s'%', '.join(list(s2.columns)[:6])
print 's4:\t%s'%', '.join(list(s4.columns)[:6])
print 's5:\t%s'%', '.join(list(s5.columns)[-10:])
print 's6:\t%s'%', '.join(list(s6.columns)[-4:])

s1:	5p mature, 5p sequence, 3p mature, 3p sequence, Note
s2:	Chr, Start, End, Strand, Construct sequence, 100way phyloP scores (pre-miRNA -/+ 100nt)
s4:	Pri-miRNA, rstart, rend, pilot-1, pilot-2, set1-1
s5:	117, 118, 119, 120, 121, 122, 123, 124, 125, symbolized structure
s6:	Relative position from miRBase site, Cleavage Specificity, Cleavage Imbalance, Cleavage type


In [7]:
def get_pre_position(pri):
    constructseq = s2.loc[pri, 'Construct sequence'].replace('T','U')
    seq5p = s1.loc[pri, '5p sequence']
    seq3p = s1.loc[pri, '3p sequence']
    if seq5p=='n.a.' or constructseq.find(seq5p)==-1:
        prestart = 0
    else:
        prestart = constructseq.find(seq5p)+1
    if seq3p=='n.a.' or constructseq.find(seq3p)==-1:
        preend = 0
    else:
        preend = constructseq.rfind(seq3p)+len(seq3p)
    return prestart, preend

In [8]:
allpris = s1.index
preposition = { pri:get_pre_position(pri) for pri in allpris }
print len(preposition)

1881


In [11]:
mirtrons = [ pri for pri in allpris if s1.loc[pri,'Note'].find('Mirtron')>=0 ]
capped = [ pri for pri in allpris if s1.loc[pri,'Note'].find('Capped miRNA')>=0 ]
drosensitive = [ pri for pri in allpris if s1.loc[pri,'Note'].find('DROSHA KO-sensitive')>=0 ]
fclipdetected = [ pri for pri in allpris if s1.loc[pri,'Note'].find('fCLIP site-determined')>=0 ]
dbmirs = [ pri for pri in allpris if s1.loc[pri,'Note'].find('MirGeneDB')>=0 ]
print len(mirtrons), len(capped), len(drosensitive), len(fclipdetected), len(dbmirs)

239 11 157 281 519


In [12]:
priseqs = dict(s2['Construct sequence'])
print len(priseqs)

1881


### 1. Load information

In [14]:
pros = s6['Cleavage Productivity'].to_dict()
s6sort = s6.sort_values('Cleavage Specificity')
spes = s6sort[~s6sort.index.duplicated(keep='last')]['Cleavage Specificity'].to_dict()
overlap = [ mir for mir in pros if mir in spes ]
print len(overlap)

1816


In [15]:
single = set(s6[s6['Cleavage type']=='single'].index)
multi = set(s6[s6['Cleavage type']=='multiple'].index)
inverted = set(s6[s6['Cleavage type']=='inverted'].index)
nick5 = set(s6[s6['Cleavage type']=='nick5'].index)
nick3 = set(s6[s6['Cleavage type']=='nick3'].index)
nick = list(nick5) + list(nick3)
nonspec = set(s6[s6['Cleavage type']=='non-specific'].index)
dcsmirs = list(single) + list(multi)
nodcs = list(inverted) + list(nick) + list(nonspec)
print 'Productive: %s, Unproductive: %s' % (len(dcsmirs), len(nodcs))
print 'Single: %s, Multiple: %s, Inverted: %s, Nick: %s, Non-specific: %s'\
% (len(single), len(multi), len(inverted), len(nick), len(nonspec))
print "5' nick: %s, 3' nick: %s" % (len(nick5), len(nick3))

Productive: 512, Unproductive: 1304
Single: 445, Multiple: 67, Inverted: 156, Nick: 107, Non-specific: 1041
5' nick: 72, 3' nick: 35


In [16]:
fclipexcel = pd.ExcelFile('supplementary/Kim_2017_s2.xlsx')
fclip293t = fclipexcel.parse('HEK293T_miRBase', index_col=0)
fcliphela = fclipexcel.parse('HeLa_miRBase', index_col=0)
both293t = fclip293t[(fclip293t['Group5p']!='Unidentified')
                     &(fclip293t['Group3p']!='Unidentified')]
bothhela = fcliphela[(fcliphela['Group5p']!='Unidentified')
                     &(fcliphela['Group3p']!='Unidentified')]

### 2. Cleavage site

In [17]:
altcut = .26

In [19]:
s7 = pd.DataFrame()
s6sort = s6.sort_values('Cleavage Specificity',ascending=False)
s6sig = s6sort[s6sort['Cleavage Specificity']>=altcut]
for mir in sorted(dcsmirs):
    ch, st, en, strand = get_pre_annot(mir)
    ivp5cs = s6sig.loc[[mir],"5' cleavage site"].tolist()
    ivp3cs = s6sig.loc[[mir],"3' cleavage site"].tolist()
    if mir in both293t.index:
        fcl5cs = map(int, str(both293t.loc[mir, 'fCLIPPosition5p']).split(';'))
        fcl3cs = map(int, str(both293t.loc[mir, 'fCLIPPosition3p']).split(';'))
    elif mir in bothhela.index:
        fcl5cs = map(int, str(bothhela.loc[mir, 'fCLIPPosition5p']).split(';'))
        fcl3cs = map(int, str(bothhela.loc[mir, 'fCLIPPosition3p']).split(';'))
    else:
        fcl5cs, fcl3cs = [], []
    
    s7.loc[mir, 'Chr'] = ch
    if strand=='+':
        s7.loc[mir, "miRBase 5' site"] = str(st)
        s7.loc[mir, "miRBase 3' site"] = str(en)
    else:
        s7.loc[mir, "miRBase 5' site"] = str(en)
        s7.loc[mir, "miRBase 3' site"] = str(st)
    s7.loc[mir, "IVP 5' site"] = ', '.join(map(str, ivp5cs))
    s7.loc[mir, "IVP 3' site"] = ', '.join(map(str, ivp3cs))
    s7.loc[mir, "fCLIP 5' site"] = ', '.join(map(str, fcl5cs))
    s7.loc[mir, "fCLIP 3' site"] = ', '.join(map(str, fcl3cs))
    s7.loc[mir, 'Strand'] = strand
    
    if ivp5cs and fcl5cs:
        if set(ivp5cs)==set(fcl5cs):
            t = 'I'
        elif set(ivp5cs)>=set(fcl5cs):
            t = 'II'
        elif set(fcl5cs)>=set(ivp5cs):
            t = 'III'
        else:
            t = 'IV'
    else: t = ''
    s7.loc[mir, 'Type'] = t
s7.head()

Unnamed: 0,Chr,miRBase 5' site,miRBase 3' site,IVP 5' site,IVP 3' site,fCLIP 5' site,fCLIP 3' site,Strand,Type
hsa-let-7a-1,chr9,94175962,94176033,94175962,94176033,94175962,94176033,+,I
hsa-let-7a-2,chr11,122146589,122146523,122146589,122146523,122146589,122146523,-,I
hsa-let-7a-3,chr22,46112752,46112820,46112752,46112821,46112752,"46112820, 46112821",+,I
hsa-let-7b,chr22,46113691,46113766,46113691,46113765,"46113691, 46113699",46113765,+,III
hsa-let-7d,chr9,94178841,94178916,"94178841, 94178840","94178915, 94178916",94178841,94178915,+,II


### 3. Productive Cleavage Specificity

In [20]:
window = 3
prorange = 5
fclrcut = 7
ivprcut = 30
fcutoff = .01
cells = [ '293T','Hela' ]

In [21]:
s4.columns = ['Pri-miRNA','start','end']+list(s4.columns[3:])
clvraw = s4.set_index(['Pri-miRNA','start','end'])
clvtbl = pd.DataFrame()
for lib in LIBRARIES:
    clvtbl[lib] = clvraw[['%s-1'%lib,'%s-2'%lib]].sum(axis=1)
clvtbl.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,set1,set2,set3,set4,set5
Pri-miRNA,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hsa-let-7a-1,1,8,4,4,0,0,0


In [22]:
fcols = [ 'chr', 'start', 'end', 'miRNA', 'strand' ]
fcliptbl = pd.DataFrame()
for cell in cells:
    bedfile = 'supplementary/fCLIP/%s/pri_%s.small.bed.gz'%(cell,cell)
    ftbl = pd.read_table(gzip.open(bedfile,'rb'), usecols=[0,1,2,3,5], names=fcols)
    ftbl['start'] = ftbl['start']+1
    ftbl['end'] = ftbl['end']
    fcliptbl[cell] = ftbl.groupby(['miRNA', 'start', 'end']).size()
fcliptbl = fcliptbl.fillna(0).astype(int)
fcliptbl.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,293T,Hela
miRNA,start,end,Unnamed: 3_level_1,Unnamed: 4_level_1
hsa-let-7a-1,93920077,94175961,1,0


In [23]:
def select_lib(mir):
    return max(LIBRARIES, key=lambda x: clvtbl.loc[mir,x].sum())

In [24]:
def get_relative_pos(mir, c5, c3):
    ps, pe, strand = s2.loc[mir, ['Start','End','Strand']]
    if strand=='+':
        return c5-ps+1, c3-ps+1
    return pe-c3+1, pe-c5+1

In [25]:
def get_prod_fracs(sub, lib, ps, pe, strand, rcut, fcut, prorange):
    frac5, frac3, frach = {}, {}, {}
    if strand=='+':
        clv5 = sub[(sub['end']-ps+1).apply(abs)<=prorange].astype(int)
        clv3 = sub[(sub['start']-pe-1).apply(abs)<=prorange].astype(int)
        clvh = sub[(abs(sub['start']-ps)<=prorange)&
                   (abs(sub['end']-pe)<=prorange)].astype(int)
        if clv5[lib].sum()>=rcut: 
            frac5 = { e-ps:grp[lib].sum()/clv5[lib].sum() for e,grp in clv5.groupby('end')}
        if clv3[lib].sum()>=rcut: 
            frac3 = { s-pe:grp[lib].sum()/clv3[lib].sum() for s,grp in clv3.groupby('start')}
        if clvh[lib].sum()>=rcut: 
            frach = { (s-ps,e-pe):grp[lib].sum()/clvh[lib].sum() for
                      (s,e),grp in clvh.groupby(['start','end']) }
    else:
        clv5 = sub[(sub['start']-pe-1).apply(abs)<=prorange].astype(int)
        clv3 = sub[(sub['end']-ps+1).apply(abs)<=prorange].astype(int)
        clvh = sub[(abs(sub['start']-ps)<=prorange)
                   &(abs(sub['end']-pe)<=prorange)].astype(int)
        if clv5[lib].sum()>=rcut: 
            frac5 = { pe-s:grp[lib].sum()/clv5[lib].sum() for s,grp in clv5.groupby('start')}
        if clv3[lib].sum()>=rcut: 
            frac3 = { ps-e:grp[lib].sum()/clv3[lib].sum() for e,grp in clv3.groupby('end')}
        if clvh[lib].sum()>=rcut: 
            frach = { (pe-e,ps-s):grp[lib].sum()/clvh[lib].sum() for
                      (s,e),grp in clvh.groupby(['start','end']) }
    frac5 = { p:f for p,f in frac5.items() if f>=fcut }
    frac3 = { p:f for p,f in frac3.items() if f>=fcut }
    frach = { p:f for p,f in frach.items() if f>=fcut }    
    return (frac5, frac3, frach)

In [26]:
mir = 'hsa-mir-142'
sub = fcliptbl.loc[mir].reset_index()
pres, pree, strand = get_pre_annot(mir)[1:4]
f5, f3, fh = get_prod_fracs(sub, '293T', pres, pree, strand, fclrcut, fcutoff, prorange)
print f5
print f3
print fh

{0: 0.2857142857142857, -4: 0.14285714285714285, -1: 0.5714285714285714}
{0: 0.1, 1: 0.6, 4: 0.1, 5: 0.2}
{}


In [27]:
sub = clvtbl.loc[mir].reset_index()
pres, pree = get_relative_pos(mir, get_pre_annot(mir)[1], get_pre_annot(mir)[2])
f5, f3, fh = get_prod_fracs(sub, select_lib(mir), pres, pree, '+', ivprcut, fcutoff, prorange)
print f5
print f3
print fh

{3: 0.026426174496644295, -4: 0.0854026845637584, -3: 0.8416107382550335, -1: 0.015939597315436243, -5: 0.014681208053691275}
{1: 0.011968734733756717, 3: 0.8685067578570266, 4: 0.030695326494056343, -3: 0.06497313141182218}
{(1, 2): 0.010861132660977503, (0, 0): 0.04809930178432894, (-2, 2): 0.8572536850271528, (2, 2): 0.017067494181536073}


In [28]:
fclipmirs = sorted(list(set(fcliptbl.index.get_level_values(0))))
fclipfracs = {}
for mir in fclipmirs:
    sub = fcliptbl.loc[mir].reset_index()
    ps, pe, strand = get_pre_annot(mir)[1:4]
    frac5, frac3, frach = get_prod_fracs(sub, '293T', ps, pe, strand, fclrcut, fcutoff, prorange)
    if sum(map(bool, [frac5,frac3,frach]))>=2:
        fclipfracs[mir] = (frac5, frac3, frach)
        continue
    frac5, frac3, frach = get_prod_fracs(sub, 'Hela', ps, pe, strand, fclrcut, fcutoff, prorange)
    if sum(map(bool, [frac5,frac3,frach]))>=2:
        fclipfracs[mir] = (frac5, frac3, frach)
print len(fclipfracs)

248


In [29]:
ivpfracs = {}
ivpmirs = [ mir for mir in set(s4['Pri-miRNA']) if mir in fclipfracs.keys() ]
for mir in ivpmirs:
    sub = clvtbl.loc[mir].reset_index()
    ps, pe = get_relative_pos(mir, get_pre_annot(mir)[1], get_pre_annot(mir)[2])
    lib = select_lib(mir)
    frac5, frac3, frach = get_prod_fracs(sub, lib, ps, pe, '+', ivprcut, fcutoff, prorange)
    if sum(map(bool, [frac5,frac3,frach]))>=2:
        ivpfracs[mir] = (frac5, frac3, frach)
print len(ivpfracs)

244


In [30]:
def add_specificity(row):
    row['cleavage specificity'] = np.log2((row['5frag']+row['hairpin']+row['3frag'])/3+1)
    return row

In [31]:
def drop_duplicates(sortedtbl):
    if len(sortedtbl)<=1:
        return sortedtbl
    hs = sortedtbl['hstart'].tolist()[0]
    he = sortedtbl['hend'].tolist()[0]
    sub = sortedtbl.iloc[1:]
    sub = sub[(sub['hstart']!=hs)&(sub['hend']!=he)]
    return sortedtbl.iloc[:1].append(drop_duplicates(sub))

In [32]:
COLS = [ 'miRNA', 'hstart', 'hend', '5frag', '3frag', 'hairpin', 'cleavage specificity' ] 
def get_dcs(mir, clv5, clv3, clvhpn):
    clvsites = pd.DataFrame({0:dict(zip(COLS,[mir,-99,99,0,0,0,0]))}).T[COLS]
    clv52 = defaultdict(float); clv52.update(clv5)
    clv32 = defaultdict(float); clv32.update(clv3)
    clvhpn2 = defaultdict(float); clvhpn2.update(clvhpn)
    for c5, frac5 in clv5.items():
        for c3, frac3 in clv3.items():
            row = dict(zip(COLS,[mir,c5+1,c3-1,frac5,frac3,clvhpn2[(c5+1,c3-1)]]))
            clvsites = clvsites.append(row, ignore_index=True)
    for (c5,c3), frach in clvhpn.items():
        row = dict(zip(COLS,[mir,c5,c3,clv52[c5-1],clv32[c3+1],frach]))
        clvsites = clvsites.append(row, ignore_index=True)
    cs = clvsites.apply(add_specificity,axis=1).sort_values('cleavage specificity').iloc[::-1]
    return drop_duplicates(cs)

In [33]:
frac5, frac3, frach = fclipfracs['hsa-mir-142']
get_dcs('hsa-mir-142', frac5, frac3, frach)

Unnamed: 0,miRNA,hstart,hend,5frag,3frag,hairpin,cleavage specificity
10,hsa-mir-142,0,0,0.571429,0.6,0.0,0.475579
4,hsa-mir-142,1,4,0.285714,0.2,0.0,0.216492
7,hsa-mir-142,-3,3,0.142857,0.1,0.0,0.112303
0,hsa-mir-142,-99,99,0.0,0.0,0.0,0.0


In [34]:
frac5, frac3, frach = ivpfracs['hsa-mir-142']
get_dcs('hsa-mir-142', frac5, frac3, frach).head()

Unnamed: 0,miRNA,hstart,hend,5frag,3frag,hairpin,cleavage specificity
23,hsa-mir-142,-2,2,0.841611,0.868507,0.857254,0.892034
8,hsa-mir-142,-3,-4,0.085403,0.064973,0.0,0.070561
13,hsa-mir-142,0,0,0.01594,0.011969,0.048099,0.036097
3,hsa-mir-142,4,3,0.026426,0.030695,0.0,0.027211
0,hsa-mir-142,-99,99,0.0,0.0,0.0,0.0


In [35]:
fclipclv = pd.DataFrame(columns=COLS)
ivpclv = pd.DataFrame(columns=COLS)
for mir in ivpfracs.keys():
    frac5, frac3, frach = fclipfracs[mir]
    fclipclv = fclipclv.append(get_dcs(mir, frac5, frac3, frach), ignore_index=True)
    frac5, frac3, frach = ivpfracs[mir]
    ivpclv = ivpclv.append(get_dcs(mir, frac5, frac3, frach), ignore_index=True)

fclipclv = fclipclv[COLS]
fclipclv['Position'] = fclipclv[['hstart','hend']].apply(abs,axis=1).min(axis=1)
fclipclv = fclipclv[fclipclv['Position']<=window]
ivpclv = ivpclv[COLS]
ivpclv['Position'] = ivpclv[['hstart','hend']].apply(abs,axis=1).min(axis=1)
ivpclv = ivpclv[ivpclv['Position']<=window]

In [36]:
fspes = dict(fclipclv.drop_duplicates('miRNA').set_index('miRNA')['cleavage specificity'])
ispes = dict(ivpclv.drop_duplicates('miRNA').set_index('miRNA')['cleavage specificity'])

In [37]:
mirs = [m for m in fspes if m in ispes ]
print len(mirs)

243


### 4. Build table

In [38]:
for mir in sorted(mirs):
    s7.loc[mir, 'IVP PCS'] = ispes[mir]
    s7.loc[mir, 'fCLIP PCS'] = fspes[mir]

In [39]:
s7.fillna('').to_csv('resources/201012_s7_cleavage-sites.csv')

In [40]:
out = open('supplementary/201012_s7_cleavage-sites.csv', 'wt')
description = 'Supplementary Table 7. Cleavage sites \n\n\n\n\n'
out.write(description)
for l in open('resources/201012_s7_cleavage-sites.csv', 'rt'):
    out.write(l)
out.close()