In [1]:
import os
import sys
import subprocess
import epicBarcoder as eb

In [2]:
#Environment handed to the usearch subprocess calls in later cells
env = os.environ
#Working directory for the raw reads and every numbered intermediate file
#(keep the trailing slash: later cells build paths by string concatenation)
dataDir = '/home/ubuntu/users/sjspence/170214_OM8/03_jupyter/'
#Locations of the external command-line tools
pearPath = '/usr/local/bin/pear'
#usearchPath = '/home/ubuntu/bin/usearch8'
usearchPath = '/home/ubuntu/users/sjspence/tools/usearch9.2.64_i86linux32'
#NOTE(review): sinaPath and fasttreePath are not referenced by any cell visible in this part of the notebook
sinaPath = '/home/ubuntu/bin/sina-1.2.11/sina'
fasttreePath = '/home/ubuntu/bin/FastTree_dd'

In [None]:
#Join paired-end reads
#Assemble the PEAR command first so the forward/reverse/output arguments are easy to read
pearCommand = [
    pearPath,
    '-f', dataDir + '170214Alm_D17-2046_1_sequence.fastq',
    '-r', dataDir + '170214Alm_D17-2046_2_sequence.fastq',
    '-o', dataDir + '01_pear',
]
subprocess.call(pearCommand)

In [16]:
#Break up file into pieces that usearch can use (5 million lines each)
#Run this to completion before running next section
#Piece size is a multiple of 4 so a 4-line fastq record never straddles two pieces
linesPerSplit = 5000000
splitDir = dataDir + '02_pearSplits/'
i = 0  #lines copied so far
j = 1  #index of the piece currently being written
with open(dataDir + '01_pear.assembled.fastq', 'r') as inFile:
    if not os.path.exists(splitDir):
        os.makedirs(splitDir)
    partFile = open(splitDir + 'pear_' + str(j) + '.fastq', 'w')
    try:
        for line in inFile:
            #Roll over to the next piece once the current one holds linesPerSplit lines
            if i >= j*linesPerSplit:
                partFile.close()
                j += 1
                partFile = open(splitDir + 'pear_' + str(j) + '.fastq', 'w')
            partFile.write(line)
            i += 1
    finally:
        #Close the piece being written even if reading or writing fails part-way
        partFile.close()
print(i)

71812424


In [23]:
#Quality filter with usearch 9 max-error rate
def qualFilter(inFile, outFile):
    """Run usearch -fastq_filter on one fastq file.

    inFile:  path of the fastq file to filter
    outFile: path handed to -fastqout for the filtered reads

    The call passes -fastq_minlen 100 and -fastq_maxee_rate 0.01 and uses the
    notebook-level usearchPath and env. The exit status is not inspected and
    nothing is returned.
    """
    filterCommand = [usearchPath, "-fastq_filter", inFile]
    filterCommand += ["-fastq_minlen", '100']
    filterCommand += ['-fastq_maxee_rate', '0.01']
    filterCommand += ["-fastqout", outFile]
    subprocess.call(filterCommand, env=env)
#Filter every split piece. Outputs land in the same directory as '<name>filt.fastq', so skip
#anything already carrying 'filt' in its name: otherwise re-running this cell would filter the
#previous run's outputs again and leave '...filtfilt.fastq' files for the join step to pick up
splitDir = dataDir + '02_pearSplits/'
for filename in os.listdir(splitDir):
    if 'filt' in filename:
        continue
    qualFilter(splitDir + filename, splitDir + filename.replace('.fastq','filt.fastq'))

In [25]:
#Join quality-filtered files back into a single file for processing
splitDir = dataDir + '02_pearSplits/'
with open(dataDir + '02_pear_filt.fastq', 'w') as outfile:
    #os.listdir returns names in arbitrary order; sort so the joined file (and the read
    #numbering derived from it later) is the same on every run
    for fname in sorted(os.listdir(splitDir)):
        if 'filt' in fname:
            #Both handles are closed by their with-blocks, no explicit close() needed
            with open(splitDir + fname, 'r') as infile:
                for line in infile:
                    outfile.write(line)

In [8]:
#Import and edit mapping file
#sampIDs:    sample IDs in file order, with '_' replaced by 's'
#mapping:    second tab-separated column (the barcode matched against read headers later) -> sample ID
#readCounts: same keys as mapping, initialised to 0 for the demultiplexing step
sampIDs = []
mapping = {}
readCounts = {}
with open(dataDir + 'OM8_map.txt', 'r') as inFile:
    for line in inFile:
        #Skip blank lines (e.g. a trailing newline), which would otherwise fail on fields[1],
        #as well as any line containing '#'
        if not line.strip() or '#' in line:
            continue
        fields = line.strip().split('\t')
        sampID = fields[0].replace('_','s')
        barcode = fields[1]
        mapping[barcode] = sampID
        readCounts[barcode] = 0
        sampIDs.append(sampID)

In [38]:
#Demultiplex: check for barcodes and relabel sequences
#Use mapping file to keep barcoded sequences, prepare fasta file
#Start from zero so that re-running this cell does not add to the counts of an earlier run
for bc in mapping:
    readCounts[bc] = 0
with open(dataDir + '02_pear_filt.fastq', 'r') as inFile:
    with open(dataDir + '03_pear_filt.fasta', 'w') as outFile:
        i = 0            #line index in the fastq; every 4th line (i%4 == 0) is a header
        j = 0            #running index of reads written to the fasta
        nextSeq = False  #True when the previous header matched, so this line is its sequence
        for line in inFile:
            if nextSeq:
                outFile.write(line)
                nextSeq = False
            if i%4 == 0:
                for bc in mapping:
                    if bc in line:
                        readCounts[bc] += 1
                        #Swap only the leading '@' for the new '>sample_index ' prefix
                        newLine = line.strip().replace('@','>' + mapping[bc] + '_' + str(j) + ' ', 1)
                        newLine = newLine + ' orig_bc=' + bc + ' new_bc=' + bc + ' bc_diffs=0\n'
                        outFile.write(newLine)
                        nextSeq = True
                        j += 1
                        #One header per read: stop at the first matching barcode, otherwise a
                        #second match would write an extra header that has no sequence line
                        break
            i += 1
#Summarize read mapping after quality filtering and zero-error barcode matching
#One 'sample<TAB>count' line per sample in mapping-file order, then a final 'Total' line
total = 0
#with-block guarantees the summary file is flushed and closed even if a lookup fails
with open(dataDir + '03_quality_summary.txt', 'w') as summaryFile:
    for s in sampIDs:
        for bc in mapping:
            if mapping[bc] == s:
                summaryFile.write(s + '\t' + str(readCounts[bc]) + '\n')
                total += readCounts[bc]
    summaryFile.write('Total\t' + str(total))

In [3]:
#Primer check and removal, placing droplet barcode into header
#NOTE: this takes a while
qualFastaPath = dataDir + '03_pear_filt.fasta'
noPrimerFastaPath = dataDir + '04_pear_noPrimers.fasta'
qualReads = eb.importFasta(qualFastaPath)
noPrimerReads = eb.filtBarcodePrimers(qualReads, 20, 'GATCATGACCCATTTGGAGAAGATG', 'GGACTACHVGGGTWTCTAAT')
eb.exportFasta(noPrimerReads, noPrimerFastaPath)
#Read counts before and after primer filtering
for readList in (qualReads, noPrimerReads):
    print(len(readList))
#Spot-check the first surviving read
firstRead = noPrimerReads[0]
print(firstRead.header)
print(firstRead.seq)

In [3]:
#Collapse identical reads and maintain the provenance to reduce the uclust file size
#uniqueDict maps a unique sequence to a list of read objects which contain it
#NOTE: slow. uniqueDict is held only in kernel memory and the otuToHeaders step reads it,
#so this cell has to be rerun after the notebook/kernel has been closed
noPrimerFastaPath = dataDir + '04_pear_noPrimers.fasta'
uniqueFastaPath = dataDir + '05_unique_seqs.fasta'
noPrimerReads = eb.importFasta(noPrimerFastaPath)
uniqueDict = eb.getUniqueSeqs(noPrimerReads, uniqueFastaPath)

In [3]:
#Use the usearch unoise algorithm to create zero radius OTUs (zOTUs), while also discarding chimeras, phiX sequences,
#and low complexity DNA
#Input: unique sequences collapsed from quality- and primer- filtered data
#Output: Denoised file with true biological reads
#        Database file with true amplicon reads including chimeras
unoiseCommand = [usearchPath, '-unoise2', dataDir + '05_unique_seqs.fasta']
unoiseCommand += ['-fastaout', dataDir + '06_denoised.fa']
unoiseCommand += ['-otudbout', dataDir + '06_db.fa']
unoiseCommand += ['-minampsize', '1']
subprocess.call(unoiseCommand, env=env)

0

**Unoise output**  
00:02 332Mb   100.0% Reading 05_unique_seqs.fasta  
01:04 637Mb   100.0% 27163 amplicons, 2096896 bad (size >= 1)  
46:06 650Mb   100.0% 14693 good, 12470 chimeras

245334 corrected amplicon sequences (including chimeras) in 06_db.fa  
14693 output biological sequences in 06_denoised.fa

In [7]:
#Format fasta database for input to SINTAX
#>AB008314;tax=d:Bacteria,p:Firmicutes,c:Bacilli,o:Lactobacillales,f:Streptococcaceae,g:Streptococcus;
#Maintained HOMD HOT strain ID in header following the taxonomic information

#Reference ID -> taxonomy string, with '__' rewritten to ':' and ';' to ','
taxDict = {}
with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.qiime_spike.taxonomy', 'r') as t:
    for line in t:
        #Skip blank lines (e.g. a trailing newline), which have no tab-separated fields
        if not line.strip():
            continue
        fields = line.strip().split('\t')
        taxID = fields[0]
        tax = fields[1].strip().replace('__',':')
        tax = tax.replace(';',',')
        taxDict[taxID] = tax

#Rewrite every header as '>ID;tax=...;' plus any space-separated token containing 'HOT';
#sequence lines are copied through unchanged. Both files sit in with-blocks so the output
#is closed even when a reference ID is missing from taxDict
with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_spike.fasta', 'r') as f:
    with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.fasta', 'w') as outFile:
        for line in f:
            if '>' in line:
                headerParts = line.strip().split(' ')
                taxInfo = taxDict[headerParts[0].replace('>','')]
                outLine = headerParts[0] + ';tax=' + taxInfo + ';'
                for part in headerParts:
                    if 'HOT' in part:
                        outLine += part + ';'
                outFile.write(outLine + '\n')
            else:
                outFile.write(line)
#Build the usearch .udb database from the reformatted reference fasta
sintaxDbPrefix = dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike'
makeDbCommand = [usearchPath, '-makeudb_sintax', sintaxDbPrefix + '.fasta', '-output', sintaxDbPrefix + '.udb']
subprocess.call(makeDbCommand, env=env)

0

**Database formatting output**  
00:00 14Mb   1020 names, tax levels min 7, avg 7.0, max 7  
WARNING: 25 taxonomy nodes have >1 parent  
00:00 14Mb   Buffers (892 seqs)

In [8]:
#Run SINTAX to determine denoised read taxonomic information
#Default is to run one thread per CPU core, or 10 threads if there are > 10 cores
#(this call asks for 4 threads explicitly)
sintaxCommand = [usearchPath, '-sintax', dataDir + '06_denoised.fa']
sintaxCommand += ['-db', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.udb']
sintaxCommand += ['-tabbedout', dataDir + '07_denoised.sintax']
sintaxCommand += ['-strand', 'plus', '-sintax_cutoff', '0.8', '-threads', '4']
subprocess.call(sintaxCommand, env=env)

0

In [4]:
#Combine taxonomic information to export final file with droplet barcodes and taxonomies
#06_denoised.fa: header matches the first tabbed column of sintax output (minus '>'), sequence follows
denoisedFastaPath = dataDir + '06_denoised.fa'
sintaxTablePath = dataDir + '07_denoised.sintax'
taggedFastaPath = dataDir + '08_all_seqs_tax.fa'

#Import list of read objects from unoise2 denoised file
denoised = eb.importFasta(denoisedFastaPath)

#Import Otu header:[tax probabilities, taxonomy] dictionary from SINTAX output
taxDict = eb.importSintax(sintaxTablePath)

#Take denoised zOTUs and taxonomic information, then map back to original reads and rewrite original read file with
#zOTU and taxonomic information in the headers
#(uniqueDict comes from the getUniqueSeqs cell and must be in memory)
eb.otuToHeaders(denoised, taxDict, uniqueDict, taggedFastaPath)

**Read loss from unoise2**

24,490,106 lines in 04_pear_noPrimers.fasta = 12,245,053 reads prior to unoise2  
18,257,786 lines in 08_all_seqs_tax.fa = 9,128,893 reads after unoise2  

Approximately 25% read loss.

In [5]:
#Within each sample, group by barcode; quantify unique barcode pairings
#Need a structure like [sampID1, sampID2, etc.]
#Then each sampID maps to a dictionary of droplet barcodes:[otu1, otu2]
def createBarcodeDict(inFileName):
    """Group the reads of an annotated fasta file by sample and droplet barcode.

    Only lines containing '>' are read. Each such line is split on ';' and must look like
        >SAMPLE_index ... droplet_bc=BARCODE;OTU;tax=TAXONOMY;...
    where SAMPLE is the text before the first '_' of the first field.

    inFileName: path of the fasta file to read

    Returns {sample: {droplet barcode: [[otu, tax], ...]}} with one [otu, tax]
    entry per read, in file order.

    Raises IndexError when a header has no 'droplet_bc=' or fewer than three
    ';'-separated fields.
    """
    barcodeSamples = {}
    #with-block closes the file even when a malformed header raises part-way through
    with open(inFileName, 'r') as inFile:
        for line in inFile:
            if '>' in line:
                fields = line.strip().split(';')
                samp = fields[0].split('_')[0].replace('>','')
                bc = fields[0].split('droplet_bc=')[1]
                otu = fields[1]
                tax = fields[2].replace('tax=','')
                #Create the sample and barcode levels on first sight, then record the read
                barcodeSamples.setdefault(samp, {}).setdefault(bc, []).append([otu, tax])
    return barcodeSamples

In [7]:
#Build the sample -> droplet barcode -> [[otu, tax], ...] lookup from the taxonomy-annotated read file
barcodeDict = createBarcodeDict(dataDir + '08_all_seqs_tax.fa')

In [10]:
#Print number of barcodes per sample
#Samples from the mapping file that have no reads in barcodeDict are left out
for s in sampIDs:
    sample = barcodeDict.get(s)
    if sample is None:
        continue
    print('{}\t{}'.format(s, len(sample)))

OM8s01	31613
OM8s02	53727
OM8s03	39763
OM8s04	56228
OM8s05	42798
OM8s06	32556
OM8s07	41151
OM8s08	15048
OM8s09	26675
OM8s10	19429
OM8s11	49765
OM8s12	25560
OM8s13	24283
OM8s14	37869
OM8s15	5625
OM8s16	11355
OM8s17	69740
OM8s18	28834
OM8s19	40
OM8s20	83
OM8s21	221776
OM8s22	71263
OM8s23	85362
OM8s24	58431
OM8s25	145900
OM8s26	82931
OM8s27	7
OM8s28	29651
OM8s29	146
OM8s30	68
OM8s31	9
OM8s32	1
OM8s34	2


In [3]:
#Count, per sample, how many droplet barcodes carry each combination of distinct OTUs
#pairDicts: sample ID -> {'otuA_otuB...': number of barcodes with exactly that OTU combination}
pairDicts = {}
for s in sampIDs:
    if s in barcodeDict:
        uniquePairs = {}
        for bc in barcodeDict[s]:
            if len(barcodeDict[s][bc]) != 1:
                #Each entry is an [otu, tax] list; collect the distinct OTU names
                uniqueOTUs = set()
                for otu in barcodeDict[s][bc]:
                    uniqueOTUs.add(otu[0])
                #Barcodes whose reads all belong to one OTU are not pairings
                if len(uniqueOTUs) != 1:
                    #Sort so the same combination always produces the same key
                    pairString = '_'.join(sorted(uniqueOTUs))
                    uniquePairs[pairString] = uniquePairs.get(pairString, 0) + 1
        pairDicts[s] = uniquePairs

In [9]:
#Quantify unique pairings
#pairDicts: sample ID -> {'otuA_otuB...': number of droplet barcodes with exactly that OTU combination}
#Reads barcodeDict (the notebook-level result of createBarcodeDict)
pairDicts = {}
for s in sampIDs:
    if s in barcodeDict:
        uniquePairs = {}
        for bc in barcodeDict[s]:
            otuEntries = barcodeDict[s][bc]
            if len(otuEntries) != 1:
                #Entries are [otu, tax] lists, which cannot go into a set directly; use the OTU name
                uniqueOTUs = set(entry[0] for entry in otuEntries)
                if len(uniqueOTUs) != 1:
                    #Sort so the same combination always produces the same key
                    pairString = '_'.join(sorted(uniqueOTUs))
                    if pairString not in uniquePairs:
                        uniquePairs[pairString] = 1
                    else:
                        uniquePairs[pairString] += 1
        pairDicts[s] = uniquePairs