# In vitro processing results of pri-miRNA variants
- Kwon et al., 2019

### Dictionary
`zcat ../rawdata/dict.fastq.gz | cutadapt -j 32 -a TGGAATTCTCGGGTGCCAAGG -m 130 - | grep 'AGGGTCTACCGGGCCACCGC' - | gzip -c - > dict.txt.gz`
- Total reads processed:              42,140,068
- Reads with adapters:                   567,440 (1.3%)
- Reads that were too short:           1,168,017 (2.8%)
- Reads written (passing filters):    40,972,051 (97.2%)

`zcat dict.txt.gz | cut -c -155 | sort -u > dict_uniq.txt`  
`sed -n -e 's/\(^.*\)\(\(AGGGTCTACCGGGCCACCGC\).*\)/\1/p' dict_uniq.txt > dict_1barcode.txt`   
`sed -n -e 's/^.*AGGGTCTACCGGGCCACCGC\(.*\)/\1/p' dict_uniq.txt > dict_2pri.txt`  
`paste -d '\t' dict_1barcode.txt dict_2pri.txt > dict_split.txt`

### Input
`zcat ../rawdata/input_1.fastq.gz| cutadapt -j 32 -a AGGGTCTACCGGGCCACCGC --trimmed-only -m 25 -M 35 - | gzip -c - > input_barc_only.fastq.gz`
- Total reads processed:              36,893,479
- Reads with adapters:                36,718,291 (99.5%)
- Reads that were too short:             162,193 (0.4%)
- Reads that were too long:              178,143 (0.5%)
- Reads written (passing filters):    36,553,143 (99.1%)

`zcat input_barc_only.fastq.gz | awk 'NR % 4 == 2' | sort | uniq -c > input_counts.txt`

### Cleaved
`zcat ../rawdata/clv_1.fastq.gz| cutadapt -j 32 -a AGGGTCTACCGGGCCACCGC --trimmed-only -m 25 -M 35 - | gzip -c - > clv_barc_only.fastq.gz`
- Total reads processed:              45,121,437
- Reads with adapters:                44,850,336 (99.4%)
- Reads that were too short:             196,752 (0.4%)
- Reads that were too long:              283,947 (0.6%)
- Reads written (passing filters):    44,640,738 (98.9%)

`zcat clv_barc_only.fastq.gz | awk 'NR % 4 == 2' | sort | uniq -c > clv_counts.txt`

In [1]:
import time

# Stamp the notebook with the reviser's name and the date of this run
# (local time, formatted YYYY-MM-DD).
name = 'Seungchan Baek'
today = time.strftime('%Y-%m-%d')
# A single parenthesised argument prints exactly like the Python 2 print statement.
print('Last revised by %s at %s.' % (name, today))

Last revised by Seungchan Baek at 2021-12-28.


In [2]:
# Project root. The relative paths used by later cells ('results/',
# 'publication/Kwon2019/...') are resolved against this directory.
# NOTE(review): absolute, machine-specific path -- must be edited to run elsewhere.
home = '/casa/bsc/projects/2_Structure-of-pri/2007_paper_prep'
# IPython magic: change the kernel's working directory to `home`.
%cd $home

/casa/bsc/projects/2_Structure-of-pri/2007_paper_prep


In [3]:
# Put the shared helper directory on the import path, then load the
# project-local helper that lives there.
import sys
sys.path.append('/casa/bsc/notebooks/')
from basic import gen_result_dir

# gen_result_dir is defined outside this notebook; on the recorded run it
# returned 'results/211228/' (presumably a dated sub-directory -- verify in basic.py).
resultpath = gen_result_dir('results/')
print(' '.join(('resultpath:', str(resultpath))))

resultpath: results/211228/


In [4]:
from __future__ import division
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import seaborn as sns
import re
import gzip

In [5]:
# Constant 20-nt sequences.
# COMMON1 is the same 20-mer that the preprocessing commands in the notebook
# header use as the cutadapt adapter and as the barcode/pri separator.
COMMON1 = 'AGGGTCTACCGGGCCACCGC'
# NOTE(review): COMMON2 is not referenced by any cell visible here -- confirm it is still needed.
COMMON2 = 'CCCCAGGTGTGTGGTTTTTA'

# Two-letter pair codes written in the DNA alphabet (5' base followed by 3' base).
# WCPAIRS: Watson-Crick pairs.
WCPAIRS = [
    'GC',
    'CG',
    'AT',
    'TA',
]
# WBPAIRS: G-T pairs in both orientations (presumably the G-U wobble pairs).
WBPAIRS = [
    'GT',
    'TG',
]

In [6]:
# Pair code -> structure state. Watson-Crick pairs map to 'D'; every other
# two-letter code falls back to 'S' through the defaultdict factory.
# Wobble pairs (WBPAIRS) are currently NOT registered, so 'GT'/'TG' yield 'S'.
# Note that looking up an unseen key stores it in the dict (defaultdict behaviour).
STRS = defaultdict(lambda: 'S')
STRS.update(dict.fromkeys(WCPAIRS, 'D'))

# 9-nt signature -> template name; any other signature yields 'unknown'.
templates = defaultdict(lambda: 'unknown', {
    'AAAGGGACC': 'watson',
    'TGAGGCCTC': 'crick',
    'TGAGACCCT': 'mir-125a',
})

In [7]:
# Load the barcode -> pri-miRNA variant dictionary (two tab-separated columns).
cols = ['barcode', 'pri']
# Open the gzip stream in a `with` block so the file handle is closed as soon
# as parsing finishes (the handle was previously left open for the whole session).
with gzip.open('publication/Kwon2019/dict_split.txt.gz', 'rb') as handle:
    dicttbl = pd.read_table(handle, sep='\t', names=cols)

# Classify each variant by the 9-nt window pri[27:36]; windows that match none
# of the known signatures are labelled 'unknown' by the `templates` defaultdict.
dicttbl['template'] = dicttbl['pri'].apply(lambda x: templates[x[27:36]])
# Keep only variants that could be assigned to a known template.
dicttbl = dicttbl[dicttbl['template'] != 'unknown']
print(len(dicttbl))

13979388


In [8]:
# Split the dictionary by template and drop variants whose 'pri' sequence is
# shorter than the last coordinate sliced from it in the next cell
# (99 nt for watson, 95 nt for crick and mir-125a).
watson = dicttbl.loc[dicttbl['template'] == 'watson']
watson = watson.loc[watson['pri'].map(len) >= 99]

crick = dicttbl.loc[dicttbl['template'] == 'crick']
crick = crick.loc[crick['pri'].map(len) >= 95]

mir125 = dicttbl.loc[dicttbl['template'] == 'mir-125a']
mir125 = mir125.loc[mir125['pri'].map(len) >= 95]

print(' '.join(str(len(tbl)) for tbl in (watson, crick, mir125)))

5757557 4634000 3586044


In [9]:
# Columns extracted from 'pri', in the order they are added to each table.
_FEATURES = ('ghg', 'bj5', 'bj3', 'aj5', 'aj3')

# Half-open (start, stop) slice of 'pri' for every feature, per template,
# listed in _FEATURES order.
# NOTE(review): mir-125a uses 46:49 for aj5 while sharing every other
# coordinate with crick -- confirm this offset is intended.
_SPANS = {
    'watson':   ((20, 23), (13, 15), (97, 99), (48, 51), (61, 64)),
    'crick':    ((16, 19), (9, 11), (93, 95), (44, 47), (57, 60)),
    'mir-125a': ((16, 19), (9, 11), (93, 95), (46, 49), (57, 60)),
}


def _add_features(tbl, spans):
    """Add one column per entry of _FEATURES to `tbl`, sliced out of tbl['pri'].

    The 3' features ('bj3', 'aj3') are stored reversed, so that position i of a
    3' column lines up with position i of the matching 5' column.
    """
    for feature, (start, stop) in zip(_FEATURES, spans):
        if feature in ('bj3', 'aj3'):
            extract = lambda seq, a=start, b=stop: seq[a:b][::-1]
        else:
            extract = lambda seq, a=start, b=stop: seq[a:b]
        tbl[feature] = tbl['pri'].apply(extract)


_add_features(watson, _SPANS['watson'])
_add_features(crick, _SPANS['crick'])
_add_features(mir125, _SPANS['mir-125a'])

# Stack the three tables back together in watson, crick, mir-125a order.
dicttbl = pd.concat([watson, crick, mir125])
# Rebind the per-template names so the large intermediate frames can be freed.
watson = crick = mir125 = 0
print(len(dicttbl))

13977601


In [10]:
# Keep only variants whose junction bases fall in the accepted alphabets:
#   bj5, aj5            -> no 'A' anywhere
#   bj3                 -> neither 'T' nor 'G' anywhere
#   aj3 (first 2 nt)    -> neither 'T' nor 'G'
# bj3/aj3 are stored reversed, so aj3[:2] is the last two bases of the original slice.
# NOTE(review): presumably these mirror the degenerate bases used in the library
# design and remove synthesis/sequencing errors -- confirm against the design.
dicttbl = dicttbl[
    dicttbl['bj5'].apply(lambda seq: seq.count('A') == 0)
    & dicttbl['bj3'].apply(lambda seq: not ('T' in seq or 'G' in seq))
    & dicttbl['aj5'].apply(lambda seq: seq.count('A') == 0)
    & dicttbl['aj3'].apply(lambda seq: not ('T' in seq[:2] or 'G' in seq[:2]))
]
print(len(dicttbl))

13621598


In [11]:
# Collapse all barcodes that encode the same variant onto one representative.
# A "variant" is a unique (template, ghg, bj5, bj3, aj5, aj3) combination.
# repbarcs maps every barcode -> the first barcode listed in its variant group,
# so the choice of representative depends on the current row order of dicttbl.
repbarcs = {}
for info,grp in dicttbl.groupby(['template','ghg','bj5','bj3','aj5','aj3']):
    barcs = grp['barcode'].tolist()
    rep = barcs[0]
    # If a barcode occurs in more than one group, the group visited last wins.
    repbarcs.update({b:rep for b in barcs})
    
# Re-key the table by barcode.
dicttbl = dicttbl.set_index('barcode')
# keep=False drops EVERY row whose barcode occurs more than once, not just the extras.
dicttbl = dicttbl[~dicttbl.index.duplicated(keep=False)]
# Keep one row per representative. A representative that was removed as a
# duplicate above comes back from reindex() as an all-NaN row and is then
# discarded by dropna(), so that variant disappears from dicttbl entirely.
# NOTE(review): repbarcs is built BEFORE duplicated barcodes are removed, so
# ambiguous barcodes still map to a representative and their reads are counted
# in the following cells -- confirm this is intended.
# NOTE(review): row order after reindex() follows set iteration order.
dicttbl = dicttbl.reindex(set(repbarcs.values())).dropna()
print len(dicttbl)

72652


In [12]:
# Sum the input-library read counts per representative barcode.
# Each line of the counts file is "<count> <barcode>" (output of `uniq -c`).
inpcnts = defaultdict(int)
# The `with` block guarantees the gzip handle is closed once the file has been
# read; it was previously left open.
with gzip.open('publication/Kwon2019/input_counts.txt.gz', 'rb') as handle:
    for line in handle:
        cnt, barc = line.strip().split()
        # Barcodes missing from the dictionary are skipped; an explicit
        # membership test replaces the try/except used for flow control.
        if barc in repbarcs:
            inpcnts[repbarcs[barc]] += int(cnt)

In [13]:
# Sum the cleaved-library read counts per representative barcode.
# Each line of the counts file is "<count> <barcode>" (output of `uniq -c`).
clvcnts = defaultdict(int)
# The `with` block guarantees the gzip handle is closed once the file has been
# read; it was previously left open.
with gzip.open('publication/Kwon2019/clv_counts.txt.gz', 'rb') as handle:
    for line in handle:
        cnt, barc = line.strip().split()
        # Barcodes missing from the dictionary are skipped; an explicit
        # membership test replaces the try/except used for flow control.
        if barc in repbarcs:
            clvcnts[repbarcs[barc]] += int(cnt)

In [14]:
# Minimum number of input reads a variant needs to be analysed.
cut = 20

# Only representatives seen in BOTH libraries are considered, so a variant with
# no cleaved reads at all is excluded rather than given an efficiency of 0.
# NOTE(review): confirm that dropping zero-cleavage variants is intended.
both = set(inpcnts) & set(clvcnts)
barcs = [b for b in both if not inpcnts[b] < cut]
print(len(barcs))

35707


In [15]:
def _pair_states(joined):
    """Return one STRS state per position for a 5' string joined to its 3' partner.

    `joined` is the 5' sequence immediately followed by the equally long
    (already reversed) 3' sequence; position i of the first half is paired with
    position i of the second half and looked up in STRS.
    """
    half = len(joined) // 2
    return ''.join([STRS[joined[i] + joined[half + i]] for i in range(half)])


def _truncate_at_first_unpaired(states):
    """Keep the leading run of 'D' and set everything from the first 'S' on to 'S'.

    The output is always three characters long, e.g. 'DSD' -> 'DSS'.
    """
    paired = (states + 'S').find('S')
    return 'D' * paired + 'S' * (3 - paired)


# Two-position state string for the bj junction, e.g. 'SD'.
dicttbl['bj str'] = (dicttbl['bj5'] + dicttbl['bj3']).apply(_pair_states)
# Three-position state string for the aj junction, where only the pairs before
# the first unpaired position count as 'D'.
dicttbl['aj str'] = ((dicttbl['aj5'] + dicttbl['aj3'])
                     .apply(_pair_states)
                     .apply(_truncate_at_first_unpaired))
# The full variant sequence is no longer needed downstream.
dicttbl = dicttbl.drop(['pri'], axis=1)
dicttbl.head(3)

Unnamed: 0_level_0,template,ghg,bj5,bj3,aj5,aj3,bj str,aj str
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CTTCTGTTCTGAAAGACGGGTCTTCGAGA,mir-125a,TAT,TG,CC,TCG,CAG,SD,SSS
AAACTATGATGAAGGGTGTGTCGGATTATGC,crick,TTT,GG,CC,TTT,ACA,DD,DSS
AACTCTGGGATACAGAAGTAATCACGCCTGA,watson,GTG,TC,CC,TGG,CCT,SS,SSS


In [16]:
# 0/1 flags for partial or complete 'TG' in the two variable bj5 bases:
#   UG1 -> first base is 'G', UG2 -> bj5 is exactly 'TG', UG3 -> second base is 'T'.
# NOTE(review): presumably UG1/UG3 complete a UG motif together with the
# constant flanking bases (registers -1/+1) -- confirm against the templates.
dicttbl['UG1'] = (dicttbl['bj5'].str[0] == 'G').astype(int)
dicttbl['UG2'] = (dicttbl['bj5'] == 'TG').astype(int)
dicttbl['UG3'] = (dicttbl['bj5'].str[1] == 'T').astype(int)

# Same idea for 'TGT' in the three variable aj5 bases:
#   UGU1 -> starts with 'GT', UGU2 -> aj5 is exactly 'TGT', UGU3 -> ends with 'TG'.
dicttbl['UGU1'] = (dicttbl['aj5'].str[:2] == 'GT').astype(int)
dicttbl['UGU2'] = (dicttbl['aj5'] == 'TGT').astype(int)
dicttbl['UGU3'] = (dicttbl['aj5'].str[1:] == 'TG').astype(int)

# Number of flags set per variant (more than one can be set at once, e.g. bj5 == 'GT').
dicttbl['UG'] = dicttbl[['UG1', 'UG2', 'UG3']].sum(axis=1)
dicttbl['UGU'] = dicttbl[['UGU1', 'UGU2', 'UGU3']].sum(axis=1)

In [17]:
# Per-variant read counts for the barcodes that passed the input threshold;
# both count Series are aligned to `barcs` when the frame is built.
efftbl = pd.DataFrame(
    {'input': pd.Series(inpcnts), 'cleaved': pd.Series(clvcnts)},
    index=barcs,
    columns=['input', 'cleaved'],
)
# Processing efficiency = cleaved reads / input reads (raw ratio; library
# sizes are not normalised here).
efftbl['eff'] = efftbl['cleaved'] / efftbl['input']
efftbl.head(3)

Unnamed: 0,input,cleaved,eff
AAGAGCTATTCCTATCGTGTTGATTAACGC,220,311,1.413636
AAACTATGATGAAGGGTGTGTCGGATTATGC,1425,618,0.433684
AAATTGTGTTAACGGTGAACTAGACTTTT,64,147,2.296875


In [18]:
# Attach counts and efficiency to the variant annotations. The inner join on
# the barcode index keeps only variants present in both tables; rows with any
# missing value are then dropped.
# Not idempotent: running this cell twice fails because the columns already exist.
dicttbl = (dicttbl
           .join(efftbl, how='inner')
           .dropna())
print(len(dicttbl))

25831


In [19]:
# Write the final per-variant table (barcode index + annotations + counts) to CSV.
dicttbl.to_csv(path_or_buf='publication/Kwon2019/var_processing.csv')

---