# Supplementary Table 3. Input
1. List pri-miRNAs with redundant construct sequences
2. Build table
3. Add redundant pri-miRNAs

In [1]:
# Stamp the notebook with the reviser's name and the date of this run.
import time
today = time.strftime('%Y-%m-%d')  # current local date, formatted YYYY-MM-DD
name = 'Seungchan Baek'
# Parenthesised single-argument print: same output on Python 2, and no
# longer a SyntaxError when the notebook is opened with a Python 3 kernel.
print('Last revised by %s at %s.' % (name, today))

Last revised by Seungchan Baek at 2020-10-13.


In [2]:
home = '/casa/bsc/projects/1_DCS/2004_paper_prep/'
%cd $home

/casa/bsc/projects/1_DCS/2004_paper_prep


In [3]:
from __future__ import division
import pandas as pd
import numpy as np
import re
import gzip
from matplotlib import pyplot as plt
from collections import defaultdict, Counter
%matplotlib inline

### 1. List pri-miRNAs with redundant construct sequences

In [4]:
## Pri-miRNAs of identical 125mer sequences
# Rows of Supplementary Table 2 whose 'Note' contains 'Same construct' are
# treated as redundant; the last whitespace-separated token of that note is
# taken as the name of the representative pri-miRNA.
s2 = 'supplementary/201012_s2_pri-construct.csv'
# header=1: the column names are on the second line of the file.
s2tbl = pd.read_csv(s2, index_col=0, header=1)
# str() guards against missing notes (NaN), which would otherwise not be strings.
isRedundant = s2tbl['Note'].apply(lambda note: 'Same construct' in str(note))
s2redund = s2tbl[isRedundant]
redundRepre = {}  # redundant pri-miRNA -> representative pri-miRNA
for pri, note in dict(s2redund['Note']).items():
    repre = note.split()[-1]
    redundRepre[pri] = repre
# Parenthesised single-argument print: valid on both Python 2 and Python 3.
print(len(redundRepre))

30


In [5]:
# Invert redundRepre: representative pri-miRNA -> list of the pri-miRNAs
# that point to it.
repreRedund = defaultdict(list)
for redundant, representative in redundRepre.items():
    repreRedund[representative].append(redundant)

### 2. Build table

In [6]:
# One column per input library, one row per pri-miRNA listed in s2tbl;
# every count starts at 0 and is filled in by the next cell.
allinputs = ['set1-1', 'set1-2', 'set2', 'set3-1', 'set3-2', 'set4', 'set5-1', 'set5-2']
allpris = s2tbl.index.tolist()
inptbl = pd.DataFrame(columns=['Pri-miRNA'] + allinputs)
inptbl['Pri-miRNA'] = allpris
inptbl = inptbl.set_index('Pri-miRNA')
inptbl = inptbl.fillna(0)
inptbl.head()

Unnamed: 0_level_0,set1-1,set1-2,set2,set3-1,set3-2,set4,set5-1,set5-2
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
hsa-let-7a-1,0,0,0,0,0,0,0,0
hsa-let-7a-2,0,0,0,0,0,0,0,0
hsa-let-7a-3,0,0,0,0,0,0,0,0
hsa-let-7b,0,0,0,0,0,0,0,0
hsa-let-7c,0,0,0,0,0,0,0,0


In [7]:
for lib in allinputs:
    txt = 'input/alignments/%s.txt.gz' % lib
    mirs = !zcat $txt | cut -d" " -f1
    adjmirs = [ 'hsa-mir-1302-11' if m=='hsa-mir-1302-2' else m for m in mirs ]
    mircnts = pd.Series(Counter(adjmirs))
    inptbl[lib].update(mircnts)

In [8]:
# Number of pri-miRNA rows, then a preview of the filled count table.
# Parenthesised single-argument print: valid on both Python 2 and Python 3.
print(len(inptbl.index))
inptbl.head()

1881


Unnamed: 0_level_0,set1-1,set1-2,set2,set3-1,set3-2,set4,set5-1,set5-2
Pri-miRNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
hsa-let-7a-1,591,782,474,0,0,2,0,0
hsa-let-7a-2,969,1254,1252,0,0,1,0,0
hsa-let-7a-3,1401,1570,2158,0,0,8,0,0
hsa-let-7b,1046,1212,1146,0,0,1,0,0
hsa-let-7c,739,843,856,0,0,5,0,0


### 3. Add redundant pri-miRNAs

In [9]:
# Each redundant pri-miRNA receives a copy of its representative's counts.
# Representatives are visited in table order and skipped when they are not
# rows of inptbl.
representatives = [pri for pri in inptbl.index if pri in repreRedund]
for repre in representatives:
    for redundant in repreRedund[repre]:
        inptbl.loc[redundant] = inptbl.loc[repre]

In [10]:
# Save the count table, rows ordered by pri-miRNA name, as a plain CSV.
s3sorted = inptbl.sort_index()
s3sorted.to_csv('resources/201012_s3_input.csv')

In [11]:
# Build the supplementary file: a title line followed by four blank lines,
# then the plain CSV copied line by line.
description = 'Supplementary Table 3. Input\n\n\n\n\n'
# The context managers close both handles even if a write fails. The input is
# opened first so that a missing resources file raises before the
# supplementary file is truncated.
with open('resources/201012_s3_input.csv', 'rt') as src, \
     open('supplementary/201012_s3_input.csv', 'wt') as out:
    out.write(description)
    for l in src:
        out.write(l)