# Supplementary Table 1. Pri-miRNA information
1. mature miRNAs that derive from a pri-miRNA
2. mature miRNA sequences
3. DROSHA dependency: DROSHA KO-sensitive, DROSHA fCLIP-detected, mirtrons, capped miRNAs
4. MirGeneDB  
5. Build table

In [2]:
# Stamp the notebook with the reviser's name and today's date (YYYY-MM-DD).
import time
today = time.strftime('%Y-%m-%d')
name = 'Seungchan Baek'
print 'Last revised by %s at %s.' % (name, today)

Last revised by Seungchan Baek at 2020-10-12.


In [3]:
# Project root directory; the relative paths used in later cells are resolved
# against it once the working directory has been changed below.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
# IPython magic: change the working directory to `home`.
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [4]:
from __future__ import division
from Bio import SeqIO
from collections import defaultdict
import pandas as pd
import numpy as np

### 1. mature miRNAs that derive from a pri-miRNA

In [5]:
def parse_line(row):
    """Expand the ';'-separated ``tag=value`` string in ``row['attr']``.

    Every ``tag=value`` pair is stored back into ``row`` under ``tag``.

    Args:
        row: a mapping-like record (e.g. one pandas row) with an ``'attr'``
            entry holding the attribute string.

    Returns:
        The same ``row`` object, updated in place with one entry per tag.

    Raises:
        ValueError: if a non-empty fragment contains no ``'='``.
    """
    for field in row['attr'].split(';'):
        # A trailing ';' (or ';;') yields an empty fragment; it carries no
        # information, so skip it instead of failing on the unpacking below.
        if not field.strip():
            continue
        # Split on the first '=' only so that values may contain '='.
        tag, value = field.split('=', 1)
        row[tag] = value
    return row

In [6]:
# miRBase v21 GFF3 annotation file
gff = 'supplementary/human_mirbase-v21.gff3'
# Keep six of the tab-separated columns and give them short names.
# NOTE(review): header=12 presumably skips the comment preamble of the GFF3
# file so that parsing starts at the first feature row -- confirm against the file.
anntbl = pd.read_table(gff, header=12, sep='\t', usecols=[0,2,3,4,6,8], 
                       names=['chr','kind','start','end','strand','attr'])
# Expand the 'attr' string into one column per tag, then drop the raw string.
anntbl = anntbl.apply(parse_line, axis=1).drop(['attr'], axis=1)
anntbl.head(3)

Unnamed: 0,Alias,Derives_from,ID,Name,chr,end,kind,start,strand
0,MI0022705,,MI0022705,hsa-mir-6859-1,chr1,17436,miRNA_primary_transcript,17369,-
1,MIMAT0027618,MI0022705,MIMAT0027618,hsa-miR-6859-5p,chr1,17431,miRNA,17409,-
2,MIMAT0027619,MI0022705,MIMAT0027619,hsa-miR-6859-3p,chr1,17391,miRNA,17369,-


In [7]:
# Rows annotated as primary transcripts only, indexed by their 'Name'.
annpri = anntbl[anntbl['kind']=='miRNA_primary_transcript'].set_index('Name')
# Sorted list of all pri-miRNA names; later cells iterate over it.
allpris = sorted(annpri.index)
print len(allpris)

1881


In [8]:
# pri-mature matching
# Group the mature names by the ID in 'Derives_from' in a single pass over the
# annotation table, instead of re-filtering the whole table once per pri-miRNA.
# File order is kept within each group.
matures_by_parent = defaultdict(list)
for parent, mature_name in zip(anntbl['Derives_from'], anntbl['Name']):
    matures_by_parent[parent].append(mature_name)

# primat: pri-miRNA name -> list of mature names derived from it
# (an empty list when nothing derives from that pri-miRNA).
primat = {}
for pri, row in annpri.iterrows():
    # .get() avoids creating entries in the defaultdict for childless IDs.
    primat[pri] = list(matures_by_parent.get(row['ID'], []))

In [9]:
# hairpin sequence
# priseqs maps each FASTA record id to its sequence as a plain string.
# Note: `pri` holds the file path here; later cells rebind it as a loop variable.
pri = 'supplementary/hairpin_mirbase-v21.fa'
priseqs = { s.id:str(s.seq) for s in SeqIO.parse(pri, 'fasta') }
print len(priseqs)

28645


### 2. mature miRNA sequences

In [10]:
# miRBase v21 mature sequence fasta file
# matureseqs maps each FASTA record id to its sequence as a plain string.
mature = 'supplementary/mature_mirbase-v21.fa'
matureseqs = { s.id:str(s.seq) for s in SeqIO.parse(mature, 'fasta') }
# Print the number of records and one example sequence as a spot check.
print len(matureseqs), matureseqs['hsa-miR-1-5p']

35828 ACAUACUUCUUUAUAUGCCCAU


### 3. DROSHA dependency

In [11]:
## DROSHA-independent pri-miRNAs: mirtrons & capped
# Kim et al. (2017) table s1.
grpexcel = pd.ExcelFile('supplementary/Kim_2017_s1.xlsx')
grptbl = grpexcel.parse('Sheet1', index_col=0)
# Strip whitespace from the index labels and prefix them with 'hsa-'.
# NOTE(review): presumably this makes them match the miRBase pri-miRNA names -- confirm.
grptbl.index = [ 'hsa-%s'%i.strip() for i in grptbl.index ]
# Index labels of the rows in each 'DROSHA dependency' class.
capped = grptbl[grptbl['DROSHA dependency']=='Capped miRNA'].index
mirtrons = grptbl[grptbl['DROSHA dependency']=='Mirtron'].index
print len(capped), len(mirtrons)

11 239


In [12]:
## DROSHA-dependent pri-miRNAs: DROSHA KO-sensitive
# Kim et al. (2016) PNAS
koexcel = pd.ExcelFile('supplementary/Kim_2016_s1.xlsx')
droko = koexcel.parse('DroKO', index_col=0)
drosensi = droko[droko['Group'].apply(lambda x: x in [1,2,4])].index
drosensitive = [ pri for pri in allpris if set.intersection(set(primat[pri]),set(drosensi)) ]
print len(drosensitive)

157


In [13]:
## DROSHA-dependent pri-miRNAs: DROSHA fCLIP-detected
# Kim et al. (2017) Mol. Cell
fclipexcel = pd.ExcelFile('supplementary/Kim_2017_s2.xlsx')
f293t = fclipexcel.parse('HEK293T_miRBase', index_col=0)
fhela = fclipexcel.parse('HeLa_miRBase', index_col=0)
# Per sheet: keep rows where neither 'Group5p' nor 'Group3p' is 'Unidentified'.
mirs293t = f293t[(f293t['Group5p']!='Unidentified')&(f293t['Group3p']!='Unidentified')].index
mirshela = fhela[(fhela['Group5p']!='Unidentified')&(fhela['Group3p']!='Unidentified')].index
# pri-miRNAs (in allpris order) that pass the filter in at least one sheet.
fclipmirs = [ mir for mir in allpris if mir in mirs293t or mir in mirshela ]
print len(fclipmirs)

281


### 4. MirGeneDB

In [14]:
dbv1f = 'resources/mirgenedb_v1.gff'
dbv1 = pd.read_table(dbv1f, sep='\t', header=2, usecols=[0,2,3,4,6,8], 
                     names=['chr','type','start','end','strand','info'])
dbv1['db id'] = dbv1['info'].apply(lambda x:x.split(';')[0].split('=')[1]) # Hsa-Let-7-P1_pre
dbv1['db name'] = dbv1['db id'].apply(lambda x:x.split('_')[0]) # Hsa-Let-7-P1
dbv1pre = dbv1[dbv1['type']=='miRNA_precursor']
dbv1pre = dbv1pre[dbv1pre['info'].apply(lambda x:x.find('Alias')>=0)]
dbv1pre['mb name'] = dbv1pre['info'].apply(lambda x:x.split(';')[1].split('=')[1])
dbmirs = dbv1pre['mb name'].tolist()
print len(dbmirs)

519


### 5. Build table

In [15]:
## encode miRBase structure
# strinfos: record name -> list of the lines that follow its '>' header line
# in the structure file (newlines removed).
strfile = 'supplementary/mirbase-v21.str'
strinfos = defaultdict(list)
mir = ''
# Use a context manager so the file handle is closed deterministically.
with open(strfile, 'rt') as strhandle:
    for l in strhandle:
        if l.startswith('>'):
            # Header line: the record name is the first token without the '>'.
            mir = l.split()[0][1:]
        else:
            strinfos[mir].append(l.replace('\n',''))

In [16]:
def concat_seq(infos, nucs):
    """Flatten a five-row hairpin drawing into one linear string.

    Args:
        infos: five strings ``(ss5, ds5, match, ds3, ss3)``: the unpaired and
            paired rows of the upper strand, the middle row, and the paired
            and unpaired rows of the lower strand.
        nucs: characters that count as residues when testing whether a paired
            row starts in column 0.

    Returns:
        The whitespace-separated segments of the upper rows interleaved left
        to right, then the last character of ``match``, then the segments of
        the lower rows interleaved right to left with each segment reversed;
        all '-', '|' and ' ' characters are removed.
    """
    upper_free, upper_paired, match, lower_paired, lower_free = infos

    # When a paired row begins in column 0 there is no unpaired segment in
    # front of it; prepend a placeholder token so the segments still pair up
    # one-to-one in zip().
    if upper_paired[0] in nucs:
        upper_free = '- ' + upper_free
    if lower_paired[0] in nucs:
        lower_free = '- ' + lower_free

    pieces = [free + paired
              for free, paired in zip(upper_free.split(), upper_paired.split())]
    pieces.append(match[-1])
    # The lower strand is drawn right-to-left, so walk its segments backwards
    # and reverse the characters of each one.
    lower_segments = zip(lower_paired.split()[::-1], lower_free.split()[::-1])
    pieces.extend(paired[::-1] + free[::-1] for paired, free in lower_segments)

    flat = ''.join(pieces)
    for filler in ('-', '|', ' '):
        flat = flat.replace(filler, '')
    return flat

In [17]:
def get_new_str(mir):
    """Encode the stored structure drawing of ``mir`` as a linear string.

    Reads lines 1-5 of ``strinfos[mir]`` and replaces every nucleotide
    character (``ACGUacgu``) with a one-letter code:

    * unpaired rows: 'F' left of the first '|' column, 'L' right of the last
      '|' column, otherwise 'S' when the opposite unpaired row also has a
      nucleotide in that column, else 'A';
    * paired rows: 'L' right of the last '|' column, otherwise 'M';
    * middle row: a trailing nucleotide becomes 'L'.

    Other characters are kept as they are. The five encoded rows are then
    flattened with ``concat_seq(..., 'FLSAM')`` and that string is returned.
    """
    bases = 'ACGUacgu'
    ss5, ds5, match, ds3, ss3 = strinfos[mir][1:6]
    first_pair = match.find('|')
    last_pair = match.rfind('|')

    def encode_unpaired(strand, opposite):
        # Encode one unpaired row; `opposite` is the other unpaired row.
        symbols = []
        for col, ch in enumerate(strand):
            if ch not in bases:
                symbols.append(ch)
            elif col < first_pair:
                symbols.append('F')
            elif col > last_pair:
                symbols.append('L')
            elif opposite[col] in bases:
                symbols.append('S')
            else:
                symbols.append('A')
        return ''.join(symbols)

    def encode_paired(strand):
        # Encode one paired row.
        symbols = []
        for col, ch in enumerate(strand):
            if ch not in bases:
                symbols.append(ch)
            elif col > last_pair:
                symbols.append('L')
            else:
                symbols.append('M')
        return ''.join(symbols)

    new_ss5 = encode_unpaired(ss5, ss3)
    new_ss3 = encode_unpaired(ss3, ss5)
    new_ds5 = encode_paired(ds5)
    new_ds3 = encode_paired(ds3)
    if match[-1] in bases:
        new_match = match[:-1] + 'L'
    else:
        new_match = match
    return concat_seq([new_ss5, new_ds5, new_match, new_ds3, new_ss3], 'FLSAM')

In [18]:
def determine_arm(pri, priseq, matseq):
    """Decide on which arm of ``pri`` the mature sequence starts.

    Args:
        pri: pri-miRNA name, passed to ``get_new_str``.
        priseq: hairpin sequence.
        matseq: mature sequence to locate in ``priseq``.

    Returns:
        'error' if ``matseq`` does not occur in ``priseq``; '5p' if its first
        occurrence starts before the first 'L' of the encoded structure;
        otherwise '3p'.
    """
    loopstart = get_new_str(pri).find('L')
    matstart = priseq.find(matseq)
    if matstart == -1:
        return 'error'
    return '5p' if matstart < loopstart else '3p'

In [19]:
# sanity check
for pri in primat:
    for mat in primat[pri]:
        priseq = priseqs[pri]
        matseq = matureseqs[mat]
        if determine_arm(pri, priseq, matseq)=='error':
            print pri, mat

In [36]:
# Empty result table: one row per pri-miRNA, indexed and sorted by name.
cols = [ 'Pri-miRNA','5p mature','5p sequence','3p mature','3p sequence' ]
tbl = pd.DataFrame(columns=cols)
tbl['Pri-miRNA'] = allpris
# Cells with no value yet are filled with 'n.a.'.
tbl = tbl.set_index('Pri-miRNA').sort_index().fillna('n.a.')

In [37]:
# Fill the 5p/3p mature name and sequence columns for every pri-miRNA.
for pri in primat:
    for mat in primat[pri]:
        priseq = priseqs[pri]
        matseq = matureseqs[mat]
        # The arm named in the mature's own suffix takes precedence; the
        # position-based call is only a fallback for names without a suffix.
        # Previously a '-3p' name could land in the 5p columns whenever
        # determine_arm() returned '5p', while a '-5p' name always won.
        if mat.endswith('5p'):
            arm = '5p'
        elif mat.endswith('3p'):
            arm = '3p'
        else:
            arm = determine_arm(pri, priseq, matseq)
        # An 'error' result (mature not found in the hairpin) leaves the row as is.
        if arm in ('5p', '3p'):
            tbl.loc[pri, '%s mature' % arm] = mat
            tbl.loc[pri, '%s sequence' % arm] = matseq

In [38]:
# notes: pri-miRNA name -> list of labels, appended in the order listed below.
notes = defaultdict(list)

labelled_groups = [
    # MirGeneDB
    ('MirGeneDB', dbmirs),
    # DROSHA-independent
    ('Mirtron', mirtrons),
    ('Capped miRNA', capped),
    # DROSHA-dependent
    ('DROSHA KO-sensitive', drosensitive),
    ('fCLIP site-determined', fclipmirs),
]
for label, members in labelled_groups:
    for pri in members:
        notes[pri].append(label)

In [39]:
# Write the comma-joined labels of each pri-miRNA into the 'Note' column.
# Only names that already have a row are labelled: assigning through .loc
# with a label missing from the index would append a new row whose other
# columns are empty. Skipped names are kept in `unmatched` for inspection.
unmatched = []
for pri in notes:
    if pri in tbl.index:
        tbl.loc[pri, 'Note'] = ', '.join(notes[pri])
    else:
        unmatched.append(pri)
# Rows without any note get a single space instead of NaN.
tbl = tbl.fillna(' ')

In [40]:
# Write the table to an intermediate CSV; the final cell re-reads this file
# and copies it into supplementary/ below a title line.
tbl.to_csv('resources/201012_s1_pri-info.csv')

In [41]:
# Final supplementary file: a title line followed by four blank lines, then
# the intermediate CSV copied verbatim.
description = 'Supplementary Table 1. Pri-miRNA information\n\n\n\n\n'
# Context managers close both handles even if the copy fails part-way; the
# input handle was previously never closed.
with open('supplementary/201012_s1_pri-info.csv', 'wt') as out, \
        open('resources/201012_s1_pri-info.csv', 'rt') as src:
    out.write(description)
    out.writelines(src)