# Supplementary Table 4. Pri-miRNA features
1. structure information (ct format)  
2. symbolized structure (M-match, S-symmetric internal loop (IL), A-asymmetric IL/bulge, L-hairpin loop, F-flanking segment, X-internal stem loop) 

In [2]:
import time
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-11-25.


In [3]:
# Project root. Every relative path used below ('supplementary/...',
# 'resources/...') is resolved against this directory.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook elsewhere.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
# IPython magic: change the kernel's working directory to `home`.
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [46]:
from __future__ import division
import pandas as pd
import sys; sys.path.append('/casa/bsc/notebooks/')
import re
from util import *

In [15]:
s1 = pd.read_csv('supplementary/201123_s1__DROSHA_dependency.csv', header=1, index_col=0)
s2 = pd.read_csv('supplementary/201123_s2__Construct.csv', header=1, index_col=0).fillna('')
s3 = pd.read_csv('supplementary/201123_s3__DROSHA_processing_result.csv', header=1, index_col=0)
allpris = s1.index
print len(allpris)

1881


In [19]:
constseqs = dict(s2['Construct sequence (125 nt)'])
print len(constseqs)

1881


In [6]:
preposition = {}
for mir in s2.index:
    flank5, flank3 = s2.loc[mir,["5' flanking segment","3' flanking segment"]]
    if flank5 and flank3:
        preposition[mir] = (len(flank5)+1,125-len(flank3))
    else:
        preposition[mir] = (0,125)
print len(preposition), preposition['hsa-mir-142']

1881 (32, 90)


In [17]:
EFFCUT = 2.5
HOMCUT = .25
effs = s3['Cleavage Efficiency'].to_dict()
homs = s3['Cleavage Homogeneity'].to_dict()
dcsmirs = [ m for m in s3.index if effs[m]>=EFFCUT and homs[m]>=HOMCUT ]
nodcs = [ m for m in s3.index if m not in dcsmirs ]
print len(s3.index), len(dcsmirs), len(nodcs)

1816 534 1282


In [12]:
# Empty output table: one row per pri-miRNA (the index), one column per feature.
cols = [
    'Pri-miRNA',
    'Secondary structure', 'Stable lower stem',
    'UG', 'UGU/G', 'mGHG', 'CNNC',
]
empty_features = pd.DataFrame(columns=cols)
tbl = empty_features.set_index('Pri-miRNA')
tbl.head(1)

Unnamed: 0_level_0,Secondary structure,Stable lower stem,UG,UGU/G,mGHG,CNNC
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


### Secondary structure

In [9]:
def get_ct_info(mir):
    """Read the CT structure file of one pri-miRNA.

    Parses 'supplementary/structures/<mir>.ct'. The first line (header) is
    skipped and blank lines are ignored, so the result no longer depends on
    whether the file ends with a newline.

    Returns a list with one tuple of ints per record, in file order. Each
    tuple is (sixth field, fifth field) of the whitespace-separated record;
    callers in this notebook read it as (position, pairing partner), where a
    partner of 0 means the position is unpaired.
    """
    ct = 'supplementary/structures/%s.ct' % mir
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(ct, 'rt') as handle:
        records = handle.read().splitlines()[1:]
    return [ tuple(map(int, rec.split()[4:6][::-1])) for rec in records if rec.strip() ]

In [10]:
def get_sym_str(mir):
    """Build the 125-character symbolized structure of a pri-miRNA.

    Symbols written by this function:
      F - positions outside the outermost pair of the selected stem
      M - paired positions of the selected stem
      L - positions between the two bases of the selected loop-closing pair
      S/A - unpaired positions between two consecutive stem pairs; per gap,
            S is used for as many positions as are unpaired on both strands
            and A for the excess on the current strand

    The selected loop-closing pair is the one, among hairpin-closing pairs
    lying strictly inside the window from `preposition`, whose position sum
    is closest to 120. If there is no such pair, the result is all 'F'.
    """
    ps, pe = preposition[mir]
    paired = [ entry for entry in get_ct_info(mir) if entry[1]>0 ]

    # A pair closes a hairpin when the next paired position is its own partner.
    closing = [ cur for cur, nxt in zip(paired[:-1], paired[1:])
                if nxt==cur[::-1] and cur[0]>ps and cur[1]<pe ]
    if not closing:
        return 'F'*125
    loopst, loopen = min(closing, key=lambda pr: abs(120-pr[0]-pr[1]))

    # Outermost pair enclosing the selected loop.
    enclosing = [ (i,p) for i,p in paired if i<loopst and p>loopen ]
    stemst, stemen = enclosing[0]

    # Pairs of the stem seen from the 5' strand, then mirrored for the 3' strand
    # so that the combined list runs 5'->3' along the whole hairpin.
    strand5 = [ (i,p) for i,p in paired if stemst<=i<=loopst and loopen<=p<=stemen ]
    strand3 = [ (p,i) for i,p in reversed(strand5) ]
    pairs = strand5 + strand3

    pieces = []
    for (l5,l3), (u5,u3) in zip(pairs[:-1], pairs[1:]):
        if l5==loopst:
            pieces.append('M'+'L'*(loopen-loopst-1))
        elif u5-l5==1:
            pieces.append('M')
        else:
            gap = u5-l5-1
            symmet = min(gap, l3-u3-1)
            pieces.append('M' + (gap-symmet)*'A' + symmet*'S')
    # The final 'M' is the 3' base of the outermost pair.
    return 'F'*(stemst-1) + ''.join(pieces) + 'M' + 'F'*(125-stemen)

In [13]:
# Fill the 'Secondary structure' column for every pri-miRNA in table 1.
# Assigning through .loc with a label that is not yet in the index adds the
# row to `tbl`; the other feature columns of that row are not set here.
for pri in allpris:
    tbl.loc[pri, 'Secondary structure'] = get_sym_str(pri)

### Sequence motifs

In [18]:
majorcs = {}
for mir in dcsmirs:
    c5, c3 = s3.loc[[mir],["5' cleavage site","3' cleavage site"]].iloc[0]
    majorcs[mir] = (int(c5),int(c3))
print len(majorcs), majorcs['hsa-let-7a-1']

534 (25, 96)


In [61]:
# Empty per-position tables. Columns are relative positions:
# -20..29 for the 5' tables and -25..29 for the 3' tables.
# seq*tbl will hold sequence characters, str*tbl the 0/1 match flags.
pos5_cols, pos3_cols = range(-20,30), range(-25,30)
seq5tbl = pd.DataFrame(columns=pos5_cols)
seq3tbl = pd.DataFrame(columns=pos3_cols)
str5tbl = pd.DataFrame(columns=pos5_cols)
str3tbl = pd.DataFrame(columns=pos3_cols)

In [62]:
# For every entry of s3, cut a window of sequence and of match flags around a
# 5' and a 3' reference position and store it, keyed by relative position, as
# one row of seq5tbl/seq3tbl/str5tbl/str3tbl.
for mir in s3.index:
    # Reference positions: the cleavage sites for entries in `dcsmirs`,
    # otherwise the pair stored in `preposition`.
    if mir in dcsmirs:
        ps, pe = majorcs[mir]
    else:
        ps, pe = preposition[mir]
    priseq = constseqs[mir].replace('T','U')
    # 1 where the symbolized structure has 'M', 0 for every other symbol.
    strt = [ 1 if s=='M' else 0 for s in tbl.loc[mir,'Secondary structure'] ]
    # 5' window, reversed so that it runs from the highest index downwards.
    # Zipped with the descending pos5 below, column k receives priseq[ps+k-1];
    # if the window is clipped at index 0, zip stops early and the most
    # negative columns get no value.
    # NOTE(review): if ps+29 exceeded the sequence length, the slice would be
    # clipped at the other end and the zip alignment would shift - confirm this
    # cannot happen for the data.
    seq5, str5 = priseq[max(0,ps-21):ps+29][::-1], strt[max(0,ps-21):ps+29][::-1]
    # 3' window, NOT reversed: zipped with the descending pos3, column k
    # receives priseq[pe-1-k], so columns decrease along the sequence. Clipping
    # at index 125 leaves the most negative columns without a value.
    # NOTE(review): pe-30 is not clamped; a pe below 30 would make the slice
    # start negative - confirm this cannot happen for the data.
    seq3, str3 = priseq[pe-30:min(125,pe+25)], strt[pe-30:min(125,pe+25)]
    pos5, pos3 = range(-20,30)[::-1], range(-25,30)[::-1]
    # Each row is assigned from a Series keyed by relative position.
    seq5tbl.loc[mir] = pd.Series(dict(zip(pos5,seq5)[::-1]))
    seq3tbl.loc[mir] = pd.Series(dict(zip(pos3,seq3)[::-1]))
    str5tbl.loc[mir] = pd.Series(dict(zip(pos5,str5)[::-1]))
    str3tbl.loc[mir] = pd.Series(dict(zip(pos3,str3)[::-1]))
    
# Missing values in the sequence tables become empty strings so the cells can
# be joined as text later; the str*tbl tables are left as they are.
seq5tbl = seq5tbl.fillna('')
seq3tbl = seq3tbl.fillna('')

In [25]:
# Load the 'DODGERS scores' sheet of the Kwon 2018 workbook. Column names are
# taken from row 5 (header=5) and the third column (index_col=2) is the index;
# get_ghgs looks rows up by a 6-character sequence string, so the index is
# presumably that sequence - confirm against the workbook.
ghgexcel = pd.ExcelFile('supplementary/Kwon_2018_s4.xlsx')
scotbl = ghgexcel.parse('DODGERS scores',header=5,index_col=2)

In [20]:
def matching_bases(st, en, symstr):
    """Pair up positions of a symbolized structure from both ends inwards.

    Walks forward from 1-based position `st` and backward from 1-based
    position `en` simultaneously. Positions marked 'A' are skipped on
    whichever strand they occur (the forward strand is checked first); any
    other symbol on both strands is recorded as a match. The walk stops at the
    first 'L' on either strand or when either strand is exhausted.

    Returns a dict mapping each matched forward position to its backward
    partner, both 1-based.
    """
    forward = symstr[st-1:]
    backward = symstr[:en][::-1]
    partners = {}
    a = b = 0
    while a<len(forward) and b<len(backward):
        sym5, sym3 = forward[a], backward[b]
        if 'L' in (sym5, sym3):
            break
        if sym5=='A':
            a += 1
        elif sym3=='A':
            b += 1
        else:
            partners[st+a] = en-b
            a += 1
            b += 1
    return partners

In [49]:
def get_ghgs(mir):
    """Collect DODGERS scores for 3-position stretches of the stem of `mir`.

    The stem is taken between the first and last 'M' of the symbolized
    structure and paired up with matching_bases. For every start position
    `st` whose next two positions are matched to the two positions directly
    preceding its partner, a 6-character key is built (3 characters from the
    forward strand followed by the 3 partner characters, read backwards) and
    looked up in `scotbl`.

    Returns a dict {relative position + 2: score}. The relative position is
    measured from the 5' reference position (cleavage site for entries in
    `dcsmirs`, else `preposition`) with count_len from `util`, which is not
    shown here - presumably it counts positions of the structure substring;
    confirm. Returns {} when fewer than three positions are matched.
    """
    symstr = tbl.loc[mir, 'Secondary structure']
    bjs = symstr.find('M')+1
    bje = symstr.rfind('M')+1
    matches = matching_bases(bjs,bje,symstr)
    priseq = constseqs[mir].replace('T','U')
    ps = majorcs[mir][0] if mir in dcsmirs else preposition[mir][0]
    if len(matches)<3:
        return {}

    # Third-largest matched position: the last start that can still have two
    # matched positions after it.
    last_start = sorted(matches)[-3]
    scores = {}
    for st in range(bjs, last_start+1):
        if st not in matches:
            continue
        partner = matches[st]
        # st, st+1, st+2 must map to partner, partner-1, partner-2.
        if not all(matches.get(st+k)==partner-k for k in range(3)):
            continue
        if st<ps:
            pos = -count_len(symstr[st-1:ps-1])
        else:
            pos = count_len(symstr[ps-1:st-1])
        ghgseq = priseq[st-1:st+2] + priseq[partner-3:partner][::-1]
        scores[pos+2] = scotbl.loc[ghgseq,'DODGERS score']
    return scores

In [50]:
# One row per construct in s2, one column per relative position returned by
# get_ghgs; cells hold the looked-up scores.
ghg_by_mir = dict((mir, get_ghgs(mir)) for mir in s2.index)
ghgtbl = pd.DataFrame(ghg_by_mir).T

In [35]:
def find_ug(mir, pos):
    """Return True when columns pos and pos+1 of seq5tbl spell 'UG' for `mir`."""
    window = seq5tbl.loc[mir, [pos, pos+1]].fillna('')
    return ''.join(window)=='UG'

In [36]:
def find_ugu(mir, pos):
    """Return True when columns pos..pos+2 of seq5tbl spell 'UGU' or 'GUG'.

    Missing cells are replaced by '' before joining, matching find_ug and
    find_cnnc, so a NaN in the row cannot make ''.join raise.
    """
    seq = ''.join(seq5tbl.loc[mir,range(pos,pos+3)].fillna(''))
    return (seq=='UGU' or seq=='GUG')

In [37]:
def find_cnnc(mir, pos):
    """Return True when columns pos-3..pos of seq3tbl contain C, any two
    uppercase letters, then C for `mir`."""
    window = seq3tbl.loc[mir, [pos-3, pos-2, pos-1, pos]].fillna('')
    return re.search('C[A-Z][A-Z]C', ''.join(window)) is not None

In [38]:
# Minimum score in ghgtbl for a position to count as a hit.
GHGCUT = 38
def find_ghg(mir, pos):
    """Return whether the ghgtbl score of `mir` at column `pos` reaches GHGCUT."""
    score = ghgtbl.loc[mir,pos]
    return score>=GHGCUT

In [44]:
# Motifs in the order they are evaluated and reported.
motifs = [ 'CNNC', 'UG', 'GHG', 'UGU' ]
# Finder function per motif; each takes (mir, pos).
funcs = {
    'UG': find_ug,
    'UGU': find_ugu,
    'GHG': find_ghg,
    'CNNC': find_cnnc,
}
# Values of `pos` tried for each motif.
ranges = {
    'UG': [-14],
    'UGU': range(20,25),
    'GHG': [-5],
    'CNNC': range(-18,-15),
}

In [51]:
motmirs = {}
for motif in motifs:
    ml = []
    for mir in s3.index:
        for pos in ranges[motif]:
            if funcs[motif](mir,pos):
                ml.append(mir)
                break
    print motif, len(ml)
    motmirs[motif] = ml

CNNC 617
UG 248
GHG 647
UGU 455


In [63]:
# Mark every feature found for an entry of s3 with 'O' in the output table.
# (motif key in `motmirs`, column in `tbl`). 'CNNC' is included: the column is
# declared in `tbl` and its hits are computed in `motmirs`, but it was never
# written before, so the exported column was always empty.
motif_columns = [ ('GHG','mGHG'), ('UG','UG'), ('UGU','UGU/G'), ('CNNC','CNNC') ]
# Sets make the per-entry membership checks constant time.
motif_hits = dict((motif, set(motmirs[motif])) for motif, column in motif_columns)
for mir in s3.index:
    # Stable lower stem: at least 10 of the 13 positions -13..-1 flagged as
    # matched in str5tbl and at least 8 of the 11 positions -11..-1 in str3tbl.
    if str5tbl.loc[mir,range(-13,0)].sum()>=10 and str3tbl.loc[mir,range(-11,0)].sum()>=8:
        tbl.loc[mir,'Stable lower stem'] = 'O'
    for motif, column in motif_columns:
        if mir in motif_hits[motif]:
            tbl.loc[mir,column] = 'O'

In [68]:
# Write the plain feature table; the next cell copies this file and prepends a
# description line to it.
tbl.to_csv('resources/201123_s4__MiRNA_features.csv')

In [69]:
# Copy the plain feature table into the supplementary folder, preceded by a
# description line and blank lines.
# NOTE(review): the description says "Table 5" while the notebook title and
# both file names refer to Table 4 / s4 - confirm which number is intended.
description = 'Supplementary Table 5. Pri-miRNA features\n\n\n\n\n'
# Both files are opened in context managers so they are closed even if the
# copy fails part-way (the source handle was previously never closed).
with open('supplementary/201123_s4__MiRNA_features.csv', 'wt') as out, \
     open('resources/201123_s4__MiRNA_features.csv', 'rt') as src:
    out.write(description)
    for l in src:
        out.write(l)