# Supplementary Table 2. Pri-miRNA constructs

In [1]:
import time

# Stamp the notebook with the reviser's name and today's date (YYYY-MM-DD).
name = 'Seungchan Baek'
today = time.strftime('%Y-%m-%d')
# Parenthesized single-argument print: same output on Python 2, valid on Python 3.
print('Last revised by %s at %s.' % (name, today))

Last revised by Seungchan Baek at 2021-01-20.


In [2]:
# Project root directory. The relative paths used by later cells
# ('supplementary/...', 'resources/...') are resolved against it.
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
# IPython magic: switch the kernel's working directory to `home`.
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
from __future__ import division
from Bio import SeqIO
from collections import defaultdict
from os import listdir
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import gzip
%matplotlib inline

In [31]:
# Column layout of the output table; 'Pri-miRNA' becomes the row index.
cols = [
    'Pri-miRNA',
    'Chr',
    'Start',
    'End',
    'Strand',
    'Construct sequence (125 nt)',
    "5' flanking segment",
    'Pre-miRNA',
    "3' flanking segment",
]
# Start from an empty frame; rows are filled in later, one hairpin at a time.
tbl = pd.DataFrame(columns=cols)
tbl = tbl.set_index('Pri-miRNA')
tbl.head(1)

Unnamed: 0_level_0,Chr,Start,End,Strand,Construct sequence (125 nt),5' flanking segment,Pre-miRNA,3' flanking segment
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


---

In [5]:
## encode miRBase structure
# Read the structure file into {record name: [lines that follow its '>' header]}.
# Lines seen before any header are collected under the empty-string key.
strfile = 'supplementary/mirbase-v21.str'
strinfos = defaultdict(list)
mir = ''
# `with` guarantees the handle is closed once parsing finishes (or fails).
with open(strfile, 'rt') as strhandle:
    for l in strhandle:
        if l.startswith('>'):
            # Header: the record name is the first token minus the leading '>'.
            mir = l.split()[0][1:]
        else:
            # Only the newline is removed; spaces are kept because later code
            # compares characters of different lines column by column.
            strinfos[mir].append(l.replace('\n',''))

In [6]:
def concat_seq(infos, nucs):
    """Flatten a five-line hairpin drawing into one linear 5'->3' string.

    Parameters
    ----------
    infos : sequence of five strings
        ``(ss5, ds5, match, ds3, ss3)``: the outer 5' line, the inner 5' line,
        the pairing line, the inner 3' line and the outer 3' line.
    nucs : str
        Characters that count as residues (e.g. ``'ACGUacgu'``).

    Returns
    -------
    str
        Whitespace-separated tokens of the 5' lines interleaved left to right
        (outer first), then the last character of ``match``, then the tokens of
        the 3' lines interleaved right to left with each token reversed. All
        '-', '|' and ' ' characters are removed from the result.
    """
    ss5, ds5, match, ds3, ss3 = infos

    # When an inner line starts with a residue at column 0, the outer line has
    # no token in front of it; a '-' placeholder keeps the token pairing aligned.
    if ds5[0] in nucs:
        ss5 = '- ' + ss5
    if ds3[0] in nucs:
        ss3 = '- ' + ss3

    pieces = []
    for outer, inner in zip(ss5.split(), ds5.split()):
        pieces.append(outer)
        pieces.append(inner)

    # Last column of the pairing line (a residue there belongs to the sequence;
    # anything else is stripped below).
    pieces.append(match[-1])

    # The 3' side is drawn right-to-left, so walk the tokens backwards and
    # reverse each one.
    for inner, outer in zip(reversed(ds3.split()), reversed(ss3.split())):
        pieces.append(inner[::-1])
        pieces.append(outer[::-1])

    flattened = ''.join(pieces)
    for filler in ('-', '|', ' '):
        flattened = flattened.replace(filler, '')
    return flattened

In [7]:
# Sanity check: flatten the stored structure drawing of one hairpin back into
# its linear sequence. Entries 1..5 of the record are the five drawing lines.
mir = 'hsa-mir-16-1'
info = strinfos[mir]
# Parenthesized single-argument print: same output on Python 2, valid on Python 3.
print(concat_seq(info[1:6], 'ACGUacgu'))

gucagcagugccuUAGCAGCACGUAAAUAUUGGCGuuaagauucuaaaauuaucuCCAGUAUUAACUGUGCUGCUGAaguaagguugac


In [8]:
def get_new_str(mir):
    """Return a per-nucleotide label string for the hairpin named ``mir``.

    The five drawing lines stored in ``strinfos[mir][1:6]`` are relabelled
    character by character and then flattened with ``concat_seq``. Labels:

    * outer (single-stranded) lines:
      'F' = residue left of the first '|' column,
      'L' = residue right of the last '|' column,
      'S' = residue with a residue in the same column of the opposite outer line,
      'A' = residue with no residue opposite;
    * inner (double-stranded) lines:
      'L' = residue right of the last '|' column, 'M' otherwise;
    * pairing line: a residue in its last column becomes 'L'.

    Non-residue characters (spaces, '-', '|') are carried over unchanged.
    """
    nucs = 'ACGUacgu'
    ss5, ds5, match, ds3, ss3 = strinfos[mir][1:6]

    # Columns of the first and last drawn base pair; both are loop-invariant.
    first_pair = match.find('|')
    last_pair = match.rfind('|')

    def label_outer(strand, opposite):
        labelled = []
        for col, ch in enumerate(strand):
            if ch not in nucs:
                labelled.append(ch)
            elif col < first_pair:
                labelled.append('F')
            elif col > last_pair:
                labelled.append('L')
            elif opposite[col] in nucs:
                labelled.append('S')
            else:
                labelled.append('A')
        return ''.join(labelled)

    def label_inner(strand):
        labelled = []
        for col, ch in enumerate(strand):
            if ch not in nucs:
                labelled.append(ch)
            elif col > last_pair:
                labelled.append('L')
            else:
                labelled.append('M')
        return ''.join(labelled)

    new_ss5 = label_outer(ss5, ss3)
    new_ss3 = label_outer(ss3, ss5)
    new_ds5 = label_inner(ds5)
    new_ds3 = label_inner(ds3)

    if match[-1] in nucs:
        new_match = match[:-1] + 'L'
    else:
        new_match = match

    return concat_seq([new_ss5, new_ds5, new_match, new_ds3, new_ss3], 'FLSAM')

In [9]:
# Sanity check: show the per-nucleotide label string for one hairpin.
mir = 'hsa-mir-16-1'
# Parenthesized single-argument print: same output on Python 2, valid on Python 3.
print(get_new_str(mir))

MMMMMMSSMMMSMMMMMMMMMMMMSMMMMMMMMSMSMMMLLLLLLLLLLMMMSMSMMMMMMMMSMMAMMMMMMMMMMSMMMSSMMMMMM


In [10]:
## Find pre-miRNA annotation
def parse_line(row):
    """Expand the ``attr`` entry of one annotation row into separate entries.

    ``row['attr']`` is split on ';' into ``tag=value`` fields and each value is
    stored back on ``row`` under its tag. ``row`` is modified in place and also
    returned, so the function can be used with ``DataFrame.apply(..., axis=1)``.

    Empty fields (e.g. produced by a trailing ';') are skipped, and only the
    first '=' separates tag from value, so values may themselves contain '='.

    Raises
    ------
    ValueError
        If a non-empty field contains no '=' at all.
    """
    for field in row['attr'].split(';'):
        if not field.strip():
            # Trailing or doubled ';' -- nothing to record.
            continue
        tag, sep, value = field.partition('=')
        if not sep:
            raise ValueError('attribute field without "=": %r' % field)
        row[tag] = value
    return row

In [11]:
# miRBase v21 GFF3 annotation file (human)
gff = 'supplementary/human_mirbase-v21.gff3'
# Only six of the file's columns are read, selected by position and renamed.
gff_usecols = [0,2,3,4,6,8]
gff_names = ['chr','kind','start','end','strand','attr']
anntbl = pd.read_table(gff, header=12, sep='\t', usecols=gff_usecols, names=gff_names)
# parse_line spreads each row's 'attr' string into its own columns;
# the raw string column is dropped afterwards.
anntbl = anntbl.apply(parse_line, axis=1)
anntbl = anntbl.drop(['attr'], axis=1)
anntbl.head(3)

Unnamed: 0,Alias,Derives_from,ID,Name,chr,end,kind,start,strand
0,MI0022705,,MI0022705,hsa-mir-6859-1,chr1,17436,miRNA_primary_transcript,17369,-
1,MIMAT0027618,MI0022705,MIMAT0027618,hsa-miR-6859-5p,chr1,17431,miRNA,17409,-
2,MIMAT0027619,MI0022705,MIMAT0027619,hsa-miR-6859-3p,chr1,17391,miRNA,17369,-


In [12]:
# Split the annotation by feature kind: hairpin (primary transcript) rows are
# keyed by name, mature miRNA rows by accession ID.
kinds = anntbl['kind']
annpri = anntbl[kinds=='miRNA_primary_transcript'].set_index('Name')
annmat = anntbl[kinds=='miRNA'].set_index('ID')
# Sorted hairpin names drive the order of the final table.
allpris = sorted(annpri.index)
# Parenthesized single-argument print: same output on Python 2, valid on Python 3.
print(len(allpris))

1881


In [13]:
# pri-mature matching
# Map each hairpin name to the list of mature IDs whose 'Derives_from' equals
# the hairpin's ID (in annotation-table order; empty list when none).
# The annotation table is indexed once by parent ID instead of being
# re-filtered in full for every hairpin.
derived_ids = defaultdict(list)
for parent, matid in zip(anntbl['Derives_from'], anntbl['ID']):
    # Rows without a parent (missing 'Derives_from') never match any hairpin.
    if pd.notnull(parent):
        derived_ids[parent].append(matid)

primat = {}
for pri, priid in zip(annpri.index, annpri['ID']):
    # .get() avoids inserting empty entries into the defaultdict;
    # list() hands out an independent copy per hairpin.
    primat[pri] = list(derived_ids.get(priid, []))

In [14]:
# hairpin sequence
# Load every record of the hairpin FASTA into {record id: sequence string}.
pri = 'supplementary/hairpin_mirbase-v21.fa'
priseqs = { record.id:str(record.seq) for record in SeqIO.parse(pri, 'fasta') }
# Parenthesized single-argument print: same output on Python 2, valid on Python 3.
print(len(priseqs))

28645


In [15]:
def count_len(strt):
    """Return how many 'M' and 'S' labels occur in the label string ``strt``."""
    return sum(1 for label in strt if label == 'M' or label == 'S')

In [16]:
def get_pre_seq(pri, relativepos, arm):
    """Return a slice of hairpin ``pri`` anchored at one annotated end.

    ``relativepos`` is a 1-based position within the hairpin sequence.
    The opposite end is chosen from the label string of ``get_new_str`` by
    balancing the number of 'M'/'S' positions (see ``count_len``) on both
    sides, offset by a fixed 2-position overhang.

    * ``arm == '5p'``: the slice starts at ``relativepos``; it ends at the first
      index whose remaining 'M'/'S' count is at most the count found before
      ``relativepos`` minus the overhang (never below zero).
    * any other ``arm`` (3p): the slice ends at ``relativepos``; it starts at
      the first position preceded by at least as many 'M'/'S' labels as follow
      ``relativepos`` plus the overhang. Raises IndexError if no position
      qualifies.
    """
    overhang3 = 2
    priseq = priseqs[pri]
    pristr = get_new_str(pri)

    if arm == '5p':
        # Number of M/S labels that may remain 3' of the slice end.
        allowed_after = max(0, count_len(pristr[:relativepos-1]) - overhang3)
        end_candidates = [ i for i in range(len(pristr)+1)
                           if count_len(pristr[i:]) <= allowed_after ]
        preend = end_candidates[0]
        return priseq[relativepos-1:preend]

    # 3p: number of M/S labels required 5' of the slice start.
    needed_before = count_len(pristr[relativepos:]) + overhang3
    start_candidates = [ i+1 for i in range(len(pristr))
                         if count_len(pristr[:i]) >= needed_before ]
    prestart = start_candidates[0]
    return priseq[prestart-1:relativepos]

In [17]:
def get_pre_annot(pri):
    """Return ``(chrom, start, end, strand)`` for the span derived from hairpin ``pri``.

    * Two mature entries: the span runs from the smaller 'start' to the larger
      'end' of the two.
    * Otherwise only the first mature entry is used. Whichever hairpin end it
      lies closer to decides which coordinate is kept as the anchor; the other
      coordinate follows from the length returned by ``get_pre_seq``. Arm and
      hairpin-relative position are mirrored for any strand other than '+'.
    """
    matids = primat[pri]
    chrom, strand = annpri.loc[pri, ['chr','strand']]

    if len(matids) == 2:
        start = min(annmat.loc[matids, 'start'])
        end = max(annmat.loc[matids, 'end'])
        return chrom, start, end, strand

    pristart, priend = annpri.loc[pri, ['start','end']]
    matstart, matend = annmat.loc[matids[0], ['start','end']]

    # True when the mature entry sits nearer the low-coordinate end of the hairpin.
    near_low_end = (matstart - pristart) < (priend - matend)
    anchor = matstart if near_low_end else matend

    if strand == '+':
        arm = '5p' if near_low_end else '3p'
        relativepos = anchor - pristart + 1
    else:
        # Reverse strand: low genomic coordinates correspond to the 3' side,
        # and positions are counted from the hairpin's high-coordinate end.
        arm = '3p' if near_low_end else '5p'
        relativepos = priend - anchor + 1

    prelen = len(get_pre_seq(pri, relativepos, arm))
    if near_low_end:
        start = matstart
        end = matstart + prelen - 1
    else:
        end = matend
        start = matend - prelen + 1
    return chrom, start, end, strand

In [18]:
# Sanity check: coordinates derived for one hairpin.
pri = 'hsa-mir-147b'
# Parenthesized single-argument print: on Python 2 this still prints the
# tuple's repr, exactly as the print statement did.
print(get_pre_annot(pri))

('chr15', 45433060, 45433119, '+')


In [19]:
def rev_com_dna(seq):
    """Return the upper-cased reverse complement of the DNA string ``seq``.

    A<->T and G<->C are swapped, as are the IUPAC ambiguity pairs R<->Y,
    K<->M, B<->V and D<->H. Every other character (N, S, W, gaps, ...) is its
    own complement or is not a nucleotide code and is passed through
    unchanged, only reversed.

    A lookup table replaces the earlier chained ``str.replace`` calls, which
    used 'X' and 'Y' as temporary placeholders and therefore rewrote a genuine
    'X' in the input to 'A' and a genuine 'Y' to 'C'.
    """
    complement = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
        'R': 'Y', 'Y': 'R', 'K': 'M', 'M': 'K',
        'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
    }
    return ''.join(complement.get(base, base) for base in reversed(seq.upper()))

In [20]:
# Load the gzipped genome FASTA fully into memory as {record id: sequence string}.
genome = {}
genomef = 'supplementary/hg38.fa.gz'
# `with` closes the gzip handle once all records have been read (or on error).
with gzip.open(genomef, 'rb') as genomehandle:
    for g in SeqIO.parse(genomehandle, 'fasta'):
        genome[g.id] = str(g.seq)

In [21]:
# Per-hairpin integer offsets, read from a two-column whitespace-separated
# file: hairpin name, offset. They are added to the construct midpoint later.
adjust = {}
# `with` closes the file handle once reading finishes (or fails).
with open('supplementary/200414_construct_adjust.txt', 'rt') as adjhandle:
    for l in adjhandle:
        if not l.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        pri, adj = l.split()
        adjust[pri] = int(adj)

In [24]:
# Hand-curated (ps, pe) slice bounds within the construct sequence.
# For these hairpins they replace the automatically located bounds when the
# construct is split into 5' flank / pre-miRNA / 3' flank.
manualannot = {
    'hsa-mir-103b-1': (30, 94),
    'hsa-mir-103b-2': (30, 94),
    'hsa-mir-130a': (26, 88),
    'hsa-mir-452': (28, 89),
}

In [32]:
# Build one table row per hairpin: a fixed-length construct window around the
# (optionally shifted) midpoint of the derived span, split into 5' flank,
# pre-miRNA and 3' flank.
extend5, extend3 = 60, 65
for pri in allpris:
    chrom, pstart, pend, strand = get_pre_annot(pri)
    # Coordinates are used as 1-based inclusive, hence the -1 on slice starts.
    preseq = genome[chrom][pstart-1:pend].upper()
    if strand=='+':
        pmid = (pstart+pend)//2 + adjust[pri]
        cstart, cend = pmid-extend5+1, pmid+extend3
        constructseq = genome[chrom][cstart-1:cend].upper()
    else:
        # Reverse strand: the 5'/3' extensions swap sides on the genome and
        # both sequences are reverse-complemented into transcript orientation.
        pmid = (pstart+pend+1)//2 + adjust[pri]
        cstart, cend = pmid-extend3+1, pmid+extend5
        constructseq = rev_com_dna(genome[chrom][cstart-1:cend].upper())
        preseq = rev_com_dna(preseq)

    # Locate the pre-miRNA inside the construct (searched once).
    ps = constructseq.find(preseq)
    if ps>=0:
        pe = ps+len(preseq)
    else:
        # Not contained in the construct: report the hairpin and leave both
        # flanks empty by spanning the whole window.
        print(pri)
        ps,pe = 0,extend5+extend3
    if pri in manualannot:
        ps,pe = manualannot[pri]
        # Keep the Pre-miRNA column consistent with the manually set bounds so
        # that 5' flank + Pre-miRNA + 3' flank equals the construct sequence.
        preseq = constructseq[ps:pe]

    # Sequences are reported as RNA.
    flank5 = constructseq[:ps].replace('T','U')
    flank3 = constructseq[pe:].replace('T','U')
    preseq = preseq.replace('T','U')

    tbl.loc[pri,'Chr'] = chrom
    tbl.loc[pri,'Start'] = cstart
    tbl.loc[pri,'End'] = cend
    tbl.loc[pri,'Strand'] = strand
    # NOTE(review): the construct column is stored with T, the three segments with U.
    tbl.loc[pri,'Construct sequence (125 nt)'] = constructseq
    tbl.loc[pri,"5' flanking segment"] = flank5
    tbl.loc[pri,"3' flanking segment"] = flank3
    tbl.loc[pri,"Pre-miRNA"] = preseq

hsa-mir-3648-1
hsa-mir-3648-2
hsa-mir-3652
hsa-mir-3976
hsa-mir-6753


In [33]:
# Save the construct table as CSV; the 'Pri-miRNA' index is written as the first column.
tbl.to_csv('resources/200120_s2__Construct.csv')

In [34]:
# Write the supplementary table: a title line followed by four blank lines,
# then the saved construct CSV copied line by line.
description = 'Supplementary Table 2. Pri-miRNA construct\n\n\n\n\n'
# `with` closes both handles even if copying fails part-way.
with open('supplementary/TableS2__Oligonucleotides_used_in_this_study.csv', 'wt') as out, \
     open('resources/200120_s2__Construct.csv', 'rt') as construct_csv:
    out.write(description)
    for l in construct_csv:
        out.write(l)