# Table S1. Pri-miRNAs selected for SHAPE-MaP.
- (construct sequence, 5p sequence & 3p sequences) X (pri-miRNAs & controls)
- based on MirGeneDB
---
1. Parse data
2. Build table

In [1]:
# Stamp the notebook with the reviser's name and the date of this run.
import time
today = time.strftime('%Y-%m-%d')  # date the cell is executed, formatted YYYY-MM-DD
name = 'S. Chan Baek'
# Call form of print with a single pre-formatted string: same output under the
# Python 2 print statement, and also valid syntax on Python 3.
print('Last revised by %s at %s.' % (name, today))

Last revised by S. Chan Baek at 2024-01-12.


In [2]:
# Project root. Every relative path used below ('publication/...', 'resources/...')
# is resolved against this directory.
# NOTE(review): this is a hard-coded absolute path; the notebook only runs
# unchanged on a machine where this directory exists.
HOME = '/casa/bsc/projects/2_Structure-of-pri/2007_paper_prep'
# IPython magic: change the kernel's working directory to HOME.
%cd $HOME

/casa/bsc/projects/2_Structure-of-pri/2007_paper_prep


In [3]:
from __future__ import division
from Bio import SeqIO
import pandas as pd

## 1. Parse data
- The miRNA list and annotation information were retrieved from [MirGeneDB v1.1](https://old.mirgenedb.org/gff/hsa?node=0&all=1&sort_desc=False&seed=&query=&sorted_by=name&fnode=0).
- miRNA annotation information was adjusted by [MirGeneDB v2](https://mirgenedb.org/gff/ALL?sort=pos&all=1).

#### miRNA list (mirgenedb v1)

In [4]:
# Load the MirGeneDB v1 GFF, keeping six of its columns under explicit names.
# With header=2 and names=..., rows up to and including row 2 are not read as data.
dbv1f = 'publication/mirgenedb_v1.gff'
dbv1 = pd.read_table(dbv1f, sep='\t', header=2, usecols=[0,2,3,4,6,8], 
                     names=['chr','type','start','end','strand','info'])
# 'db id' is the value of the first attribute in the info column;
# 'db name' is that identifier up to the first underscore.
dbv1['db id'] = dbv1['info'].apply(lambda x:x.split(';')[0].split('=')[1]) # Hsa-Let-7-P1_pre
dbv1['db name'] = dbv1['db id'].apply(lambda x:x.split('_')[0]) # Hsa-Let-7-P1
# Keep precursor rows whose info column mentions an Alias.
dbv1pre = dbv1[dbv1['type']=='miRNA_precursor']
# .copy() makes the filtered frame independent of dbv1, so the column added
# below is a plain assignment rather than a write into a slice of another
# frame (the pattern pandas reports as SettingWithCopyWarning).
dbv1pre = dbv1pre[dbv1pre['info'].apply(lambda x:'Alias' in x)].copy()
# The value of the second attribute is used as the miRBase-style name
# (e.g. hsa-let-7a-2); this assumes Alias is the second attribute.
dbv1pre['mb name'] = dbv1pre['info'].apply(lambda x:x.split(';')[1].split('=')[1])
# Master list of miRNAs that defines the rows of the final table.
allmirs = dbv1pre['mb name'].tolist()
print(len(allmirs))

519


In [5]:
# Lookup table: 'mb name' -> 'db name' for the MirGeneDB v1 precursors.
# If an 'mb name' occurs more than once, the last row wins.
mbdbv1name = dict(zip(dbv1pre['mb name'], dbv1pre['db name']))
print(mbdbv1name['hsa-let-7a-2'])

Hsa-Let-7-P1


#### miRBase information (for parsing mirgenedb v2)

In [8]:
def parse_line(row):
    """Expand the ';'-separated 'tag=value' pairs in row['attr'] into entries of row.

    Parameters
    ----------
    row : mapping-like object (e.g. a pandas Series) with an 'attr' string entry.

    Returns
    -------
    The same `row` object, modified in place, with one extra entry per tag.

    Raises
    ------
    ValueError
        If a non-blank segment contains no '='.
    """
    for segment in row['attr'].split(';'):
        # A trailing (or doubled) ';' yields a blank segment; there is nothing
        # to record for it, so skip it instead of failing to unpack.
        if not segment.strip():
            continue
        # Split on the first '=' only, so a value may itself contain '='.
        tag, value = segment.split('=', 1)
        row[tag] = value
    return row

def load_mirbase_annot(gff):
    """Read a tab-separated annotation file and expand its attribute column.

    Six columns of the file (positions 0, 2, 3, 4, 6 and 8) are loaded as
    'chr', 'kind', 'start', 'end', 'strand' and 'attr'. Each row is then passed
    through `parse_line`, which adds one column per attribute tag, and the raw
    'attr' column is dropped from the result.

    Parameters
    ----------
    gff : path of the annotation file; with header=12, rows up to and
          including row 12 are not read as data.

    Returns
    -------
    pandas.DataFrame with the positional columns plus one column per tag.
    """
    column_names = ['chr','kind','start','end','strand','attr']
    raw = pd.read_table(gff, header=12, sep='\t',
                        usecols=[0,2,3,4,6,8], names=column_names)
    expanded = raw.apply(parse_line, axis=1)
    return expanded.drop(['attr'], axis=1)

In [12]:
# miRBase v21 annotation, indexed by the ID attribute and restricted to the
# rows of kind 'miRNA_primary_transcript'.
gff = 'publication/human_mirbase-v21.gff3'
anntbl = load_mirbase_annot(gff).set_index('ID')
is_primary = anntbl['kind']=='miRNA_primary_transcript'
anntbl = anntbl[is_primary]
anntbl.head()

Unnamed: 0_level_0,Alias,Derives_from,Name,chr,end,kind,start,strand
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MI0022705,MI0022705,,hsa-mir-6859-1,chr1,17436,miRNA_primary_transcript,17369,-
MI0006363,MI0006363,,hsa-mir-1302-2,chr1,30503,miRNA_primary_transcript,30366,+
MI0026420,MI0026420,,hsa-mir-6859-2,chr1,187958,miRNA_primary_transcript,187891,-
MI0022558,MI0022558,,hsa-mir-6723,chr1,632413,miRNA_primary_transcript,632325,-
MI0000342,MI0000342,,hsa-mir-200b,chr1,1167198,miRNA_primary_transcript,1167104,+


#### miRNA annotation information (mirgenedb v2)

In [13]:
# Load the MirGeneDB v2 GFF with the same column selection and names as the
# v1 file; with header=2, rows up to and including row 2 are not read as data.
dbv2f = 'resources/mirgenedb_v2.gff'
dbv2 = pd.read_table(
    dbv2f,
    sep='\t',
    header=2,
    usecols=[0,2,3,4,6,8],
    names=['chr','type','start','end','strand','info'],
)
dbv2.head(3)

Unnamed: 0,chr,type,start,end,strand,info
0,chr1,pre_miRNA,1167124,1167182,+,ID=Hsa-Mir-8-P2a_pre;Alias=MI0000342
1,chr1,miRNA,1167124,1167145,+,ID=Hsa-Mir-8-P2a_5p*;Alias=MIMAT0004571
2,chr1,miRNA,1167160,1167182,+,ID=Hsa-Mir-8-P2a_3p;Alias=MIMAT0000318


In [14]:
# 'db id' is the value of the first attribute in the info column
# (e.g. Hsa-Mir-8-P2a_pre); 'db name' is that identifier up to the first '_'.
dbv2['db id'] = dbv2['info'].apply(lambda x:x.split(';')[0].split('=')[1])
dbv2['db name'] = dbv2['db id'].apply(lambda x:x.split('_')[0])
# Precursor rows only, ordered by 'db name', restricted to rows whose info
# column mentions an Alias.
dbv2pre = dbv2[dbv2['type']=='pre_miRNA'].sort_values('db name')
# .copy() makes the filtered frame independent of its parent, so the columns
# added below are plain assignments rather than writes into a slice of another
# frame (the pattern pandas reports as SettingWithCopyWarning).
dbv2pre = dbv2pre[dbv2pre['info'].apply(lambda x:'Alias' in x)].copy()
# The value of the second attribute is used as the miRBase accession
# (e.g. MI0000342); this assumes Alias is the second attribute.
dbv2pre['mb id'] = dbv2pre['info'].apply(lambda x:x.split(';')[1].split('=')[1])
# Translate the accession into a miRBase name through anntbl (indexed by ID).
# An accession that is absent from anntbl raises KeyError here.
dbv2pre['mb name'] = dbv2pre['mb id'].apply(lambda x: anntbl.loc[x,'Name'])
# Keep one row per miRBase name: the first in 'db name' order.
dbv2pre = dbv2pre.drop_duplicates('mb name')

In [15]:
# Lookup table: 'mb name' -> 'db name' for the MirGeneDB v2 precursors.
mbdbv2name = dict(zip(dbv2pre['mb name'], dbv2pre['db name']))
print(mbdbv2name['hsa-let-7a-1'])

Hsa-Let-7-P2a1


In [16]:
# Sequence dictionaries keyed by FASTA record id (e.g. 'Hsa-Let-7-P10_pre'),
# one per MirGeneDB release.
seqsv1 = dict((rec.id, str(rec.seq))
              for rec in SeqIO.parse('resources/mirgenedb_v1.fa', 'fasta'))
seqsv2 = dict((rec.id, str(rec.seq))
              for rec in SeqIO.parse('resources/mirgenedb_v2.fa', 'fasta'))
# Show the first few ids as a sanity check.
print(sorted(seqsv1)[:4])

['Hsa-Let-7-P10_3p*', 'Hsa-Let-7-P10_5p', 'Hsa-Let-7-P10_loop', 'Hsa-Let-7-P10_pre']


In [18]:
# Worked example for a single miRNA: use the v2 annotation when the name is
# known there, otherwise fall back to v1, then fetch the matching sequences.
mir = 'hsa-mir-99a'
if mir not in mbdbv2name:
    names = dbv1.loc[dbv1['db name']==mbdbv1name[mir], 'db id'].tolist()
    seqs = [ seqsv1[name] for name in names ]
else:
    names = dbv2.loc[dbv2['db name']==mbdbv2name[mir], 'db id'].tolist()
    seqs = [ seqsv2[name] for name in names ]
print('%s %s' % (names, seqs))

['Hsa-Mir-10-P2c_pre', 'Hsa-Mir-10-P2c_5p', 'Hsa-Mir-10-P2c_3p*'] ['AACCCGUAGAUCCGAUCUUGUGGUGAAGUGGACCGCACAAGCUCGCUUCUAUGGGUCUGU', 'AACCCGUAGAUCCGAUCUUGUG', 'CAAGCUCGCUUCUAUGGGUCUGU']


## 2. Build table

In [20]:
# Build the table: one row per miRNA in allmirs, holding the precursor
# sequence, the 5p and 3p arm sequences, and which arm is the guide.
tbl = pd.DataFrame(columns=['Precursor','5p','3p','Guide'], index=allmirs)
# Arm suffix of a 'db id' (with any trailing '*' removed) -> table column.
arm_columns = {'pre': 'Precursor', '5p': '5p', '3p': '3p'}
for mir in allmirs:
    # Use the v2 annotation and sequences when the miRNA is known there,
    # otherwise fall back to v1.
    if mir in mbdbv2name:
        names = dbv2[dbv2['db name']==mbdbv2name[mir]]['db id'].tolist()
        seqsrc = seqsv2
    else:
        names = dbv1[dbv1['db name']==mbdbv1name[mir]]['db id'].tolist()
        seqsrc = seqsv1
    # Assign each sequence by the arm named in its id instead of by its
    # position in the list, so row order or extra entries in the annotation
    # cannot put a sequence into the wrong column.
    found = {}
    guide = None
    for db_id in names:
        parts = db_id.split('_')
        if len(parts) < 2:
            continue
        arm = parts[1]                      # 'pre', '5p', '3p*', ...
        col = arm_columns.get(arm.rstrip('*'))
        if col is None or col in found:     # unrecognised or repeated arm
            continue
        found[col] = seqsrc[db_id]
        # The guide is the first mature arm whose id is not starred.
        if guide is None and col != 'Precursor' and not arm.endswith('*'):
            guide = arm
    # Report miRNAs lacking a precursor, 5p or 3p entry; their row stays empty.
    if len(found)<3:
        print(mir); continue
    for col, seq in found.items():
        tbl.loc[mir, col] = seq
    # When both arms are starred there is no guide to report; leave the cell
    # empty rather than raising IndexError.
    if guide is not None:
        tbl.loc[mir, 'Guide'] = guide
tbl.head(1)

Unnamed: 0,Precursor,5p,3p,Guide
hsa-let-7a-2,UGAGGUAGUAGGUUGUAUAGUUUAGAAUUACAUCAAGGGAGAUAAC...,UGAGGUAGUAGGUUGUAUAGUU,CUGUACAGCCUCCUAGCUUUCC,5p


In [21]:
# Write the table, rows ordered by miRNA name, to the publication folder.
tbl_sorted = tbl.sort_index()
tbl_sorted.to_csv('publication/TableS1__Pri-miRNAs_selected_for_SHAPE-MaP.csv')