In [5]:
import os
import sys
import subprocess
import epicBarcoder as eb

In [6]:
#Environment passed (env=env) to the usearch subprocess calls in later cells
env = os.environ
#Working directory for this run. Later cells build file names by plain string
#concatenation (dataDir + 'name'), so the trailing '/' is required.
dataDir = '/home/ubuntu/users/sjspence/170214_OM8/03_jupyter/'
#Executable used by the paired-end joining cell
pearPath = '/usr/local/bin/pear'
#Alternative usearch binary, currently disabled:
#usearchPath = '/home/ubuntu/bin/usearch8'
usearchPath = '/home/ubuntu/users/sjspence/tools/usearch9.2.64_i86linux32'
#NOTE(review): sinaPath and fasttreePath are not referenced by any cell visible in this notebook section
sinaPath = '/home/ubuntu/bin/sina-1.2.11/sina'
fasttreePath = '/home/ubuntu/bin/FastTree_dd'

In [None]:
#Join paired-end reads
#Forward (-f) and reverse (-r) FASTQ inputs; '-o' gives the output prefix
pearCommand = [
    pearPath,
    '-f', dataDir + '170214Alm_D17-2046_1_sequence.fastq',
    '-r', dataDir + '170214Alm_D17-2046_2_sequence.fastq',
    '-o', dataDir + '01_pear',
]
subprocess.call(pearCommand)

In [16]:
#Break up file into pieces that usearch can use (5 million lines each)
#Run this to completion before running next section
#5,000,000 is a multiple of 4, so as long as every FASTQ record spans 4 lines
#no record is cut in half at a piece boundary
linesPerPart = 5000000
splitDir = dataDir + '02_pearSplits/'
#The with-block closes the input even if a read or write fails part-way through
with open(dataDir + '01_pear.assembled.fastq', 'r') as inFile:
    if not os.path.exists(splitDir):
        os.makedirs(splitDir)
    i = 0    #number of lines copied so far
    j = 1    #1-based index of the piece currently being written
    partFile = open(splitDir + 'pear_' + str(j) + '.fastq', 'w')
    try:
        for line in inFile:
            if i >= j*linesPerPart:
                #Current piece is full: roll over to the next numbered file
                partFile.close()
                j += 1
                partFile = open(splitDir + 'pear_' + str(j) + '.fastq', 'w')
            partFile.write(line)
            i += 1
    finally:
        #Always release whichever piece is open, including on error
        partFile.close()
#Total number of lines copied
print(i)

71812424


In [23]:
#Quality filter with usearch 9 max-error rate
def qualFilter(inFile, outFile):
    """Run usearch '-fastq_filter' on one FASTQ file.

    The command is built with '-fastq_minlen 100' and '-fastq_maxee_rate 0.01',
    and '-fastqout' pointing at outFile. The usearch binary (usearchPath) and the
    environment (env) are taken from the notebook's configuration cell.

    Parameters:
        inFile  -- path of the FASTQ file to filter
        outFile -- path the filtered FASTQ is written to

    Returns:
        The exit status reported by subprocess.call, so callers can detect a
        failed run instead of silently continuing with a missing output file.
    """
    command = [usearchPath, "-fastq_filter", inFile, "-fastq_minlen", '100', '-fastq_maxee_rate', '0.01',
               "-fastqout", outFile]
    return subprocess.call(command, env=env)
#Filter every raw split exactly once.
#Skipped on purpose:
#  - names containing 'filt': outputs of an earlier run of this cell, which would
#    otherwise be filtered again and later joined a second time
#  - names not ending in '.fastq': the replace() below would leave the name unchanged,
#    so usearch would be given the same path as input and output
splitDir = dataDir + '02_pearSplits/'
for filename in sorted(os.listdir(splitDir)):
    if 'filt' in filename or not filename.endswith('.fastq'):
        continue
    qualFilter(splitDir + filename, splitDir + filename.replace('.fastq','filt.fastq'))

In [25]:
#Join quality-filtered files back into a single file for processing
#Pieces are appended in sorted file-name order so the combined file is the same on
#every run (os.listdir returns names in arbitrary order)
with open(dataDir + '02_pear_filt.fastq', 'w') as outfile:
    for fname in sorted(os.listdir(dataDir + '02_pearSplits/')):
        #Only the filtered outputs carry 'filt' in their name
        if 'filt' in fname:
            with open(dataDir + '02_pearSplits/' + fname, 'r') as infile:
                for line in infile:
                    outfile.write(line)
#No explicit close() calls needed: each with-block closes its file on exit

In [5]:
#Import and edit mapping file
#Sample IDs have '_' replaced by 's' because later cells join the sample ID to a read
#index with '_' and recover the sample by splitting on the first '_'
sampIDs = []       #edited sample IDs (first column), in file order
mapping = {}       #barcode (second column) -> edited sample ID
readCounts = {}    #barcode (second column) -> read tally, starts at 0
with open(dataDir + 'OM8_map.txt', 'r') as inFile:
    for line in inFile:
        #Any line containing '#' is treated as a header/comment line
        if '#' in line:
            continue
        fields = line.strip().split('\t')
        #Blank or single-column lines have no barcode column; skip them instead of
        #failing with IndexError
        if len(fields) < 2:
            continue
        sampID = fields[0].replace('_','s')
        mapping[fields[1]] = sampID
        readCounts[fields[1]] = 0
        sampIDs.append(sampID)

In [38]:
#Demultiplex: check for barcodes and relabel sequences
#Use mapping file to keep barcoded sequences, prepare fasta file
#Zero the tallies first so re-running this cell does not add to counts left by an earlier run
for bc in mapping:
    readCounts[bc] = 0
with open(dataDir + '02_pear_filt.fastq', 'r') as inFile:
    with open(dataDir + '03_pear_filt.fasta', 'w') as outFile:
        i = 0              #0-based line number within the FASTQ file
        j = 0              #number of reads kept so far; also embedded in each new read name
        nextSeq = False    #True when the previous line was a header that was kept
        for line in inFile:
            if nextSeq:
                #Line directly after a kept header: written out as that read's sequence
                outFile.write(line)
                nextSeq = False
            if i%4 == 0:
                #Every 4th line is treated as a read header
                for bc in mapping:
                    if bc in line:
                        readCounts[bc] += 1
                        #Swap only the first '@' for the '>sample_index ' prefix; any later
                        #'@' in the header is left untouched
                        newLine = line.strip().replace('@','>' + mapping[bc] + '_' + str(j) + ' ', 1)
                        newLine = newLine + ' orig_bc=' + bc + ' new_bc=' + bc + ' bc_diffs=0\n'
                        outFile.write(newLine)
                        nextSeq = True
                        j += 1
                        #Stop at the first barcode found so one read never yields two
                        #header lines followed by a single sequence
                        break
            i += 1
#No explicit close() calls needed: the with-blocks close both files on exit
#Summarize read mapping after quality filtering and zero-error barcode matching
#One 'sample<TAB>count' line per sample, in mapping-file order, then a 'Total' line
total = 0
#The with-block closes the summary file even if a write fails
with open(dataDir + '03_quality_summary.txt', 'w') as summaryFile:
    for s in sampIDs:
        #Find the barcode(s) assigned to this sample to look up its read tally
        for bc in mapping:
            if mapping[bc] == s:
                summaryFile.write(s + '\t' + str(readCounts[bc]) + '\n')
                total += readCounts[bc]
    summaryFile.write('Total\t' + str(total))

In [3]:
#Primer check and removal, placing droplet barcode into header
#NOTE: this takes a while
qualReads = eb.importFasta(dataDir + '03_pear_filt.fasta')
#NOTE(review): 20 is presumably the droplet-barcode length and the two strings the
#primer sequences -- confirm against eb.filtBarcodePrimers
noPrimerReads = eb.filtBarcodePrimers(qualReads, 20, 'GATCATGACCCATTTGGAGAAGATG', 'GGACTACHVGGGTWTCTAAT')
eb.exportFasta(noPrimerReads, dataDir + '04_pear_noPrimers.fasta')
#Read counts before and after primer filtering
print(len(qualReads))
print(len(noPrimerReads))
#Peek at the first surviving read; guarded so an empty result does not raise IndexError
if len(noPrimerReads) > 0:
    print(noPrimerReads[0].header)
    print(noPrimerReads[0].seq)

In [5]:
#Collapse identical reads and maintain the provenance to reduce the uclust file size
#uniqueDict maps a unique sequence to a list of read objects which contain it
#NOTE: takes a long time, but rerun after notebook closes out
#(uniqueDict exists only in kernel memory and is read again by the cell that writes
# 07_all_seq_otus.txt, so it has to be rebuilt in every fresh kernel)
#NOTE(review): the second argument is presumably where eb.getUniqueSeqs writes the
#collapsed FASTA -- confirm in epicBarcoder
noPrimerReads = eb.importFasta(dataDir + '04_pear_noPrimers.fasta')
uniqueDict = eb.getUniqueSeqs(noPrimerReads, dataDir + '05_unique_seqs.fasta')

In [3]:
#Use the usearch unoise algorithm to create zero radius OTUs (zOTUs), while also discarding chimeras, phiX sequences,
#and low complexity DNA
#Input is the collapsed unique-sequence FASTA; outputs go to 06_denoised.fa and 06_db.fa
unoiseCommand = [
    usearchPath,
    '-unoise2', dataDir + '05_unique_seqs.fasta',
    '-fastaout', dataDir + '06_denoised.fa',
    '-otudbout', dataDir + '06_db.fa',
    '-minampsize', '1',
]
subprocess.call(unoiseCommand, env=env)

0

**Unoise output**  
00:02 332Mb   100.0% Reading 05_unique_seqs.fasta  
01:04 637Mb   100.0% 27163 amplicons, 2096896 bad (size >= 1)  
46:06 650Mb   100.0% 14693 good, 12470 chimeras

245334 corrected amplicon sequences (including chimeras) in 06_db.fa  
14693 output biological sequences in 06_denoised.fa

In [None]:
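#START HERE!!! (resume point when reopening the notebook)
#The cells below still need the import and path cells, plus sampIDs (mapping-file cell)
#and uniqueDict (unique-sequence cell), to have been run in the current kernel.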

In [6]:
#Create a dictionary mapping OTU ids to all the headers from unique sequences that map within the OTU
#NOTE(review): 06_combined_otus.up is not written by any cell visible in this notebook section
otuMapPath = dataDir + '06_combined_otus.up'
otuToHeaders = eb.uniqueSeqsToOTU(otuMapPath)
#Sanity checks: number of OTUs, then one example header listed under OTU2
otuCount = len(otuToHeaders)
print(otuCount)
print(otuToHeaders['OTU2'][10])

4446
OM8s01_171 HWI-M04407:1:2105:25283:4847#ACTTATTG/1 orig_bc=ACTTATTG new_bc=ACTTATTG bc_diffs=0 droplet_bc=ATACTCCGTAATGCGAGATC;size=42513;


In [10]:
#Write out a file that assigns all non-chimeric read headers to an OTU in a tab-delimited file
#Requires otuToHeaders and uniqueDict from earlier cells to be present in the kernel
unique = eb.importFasta(dataDir + '05_unique_seqs.fasta')
#Look-up from unique-sequence header (with every '>' removed) to its sequence
headerDict = {record.header.replace('>', ''): record.seq for record in unique}
with open(dataDir + '07_all_seq_otus.txt', 'w') as f:
    for otu in otuToHeaders:
        for header in otuToHeaders[otu]:
            #header -> unique sequence -> every read that carried that sequence
            readsForSeq = uniqueDict[headerDict[header]]
            #One 'read header<TAB>OTU id' line per read
            f.writelines(read.header + '\t' + otu + '\n' for read in readsForSeq)

**Assign taxonomy with method=mothur on qiime**

* Copy the OTU FASTA file to the QIIME node and run `assign_taxonomy.py` with a k-mer-based classifier (RDP or the mothur naive Bayes implementation).
* RDP needed a taxonomy that was unavailable from SILVA, so the mothur implementation was used instead (command below).

$ assign_taxonomy.py -i 06_combined_otus.fa -o 06_taxonomy -m mothur -r HOMD_16S_rRNA_RefSeq_V14.5.p9_spike.fasta -t HOMD_16S_rRNA_RefSeq_V14.5.mothur_spike.taxonomy

In [3]:
#Within each sample, group by barcode; quantify unique barcode pairings
#Resulting structure: barcodeSamples[sampID][droplet barcode] = [otu1, otu2, ...]
#with one list entry per read, so an OTU can appear more than once
barcodeSamples = {}
with open(dataDir + '07_all_seq_otus.txt', 'r') as f:
    for row in f:
        fields = row.strip().split('\t')
        readHeader = fields[0]
        #Sample ID is the text before the first '_' of the read header, without '>'
        samp = readHeader.split('_')[0].replace('>','')
        #Droplet barcode is whatever follows 'droplet_bc=' in the read header
        bc = readHeader.split('droplet_bc=')[1]
        otu = fields[1]
        #Create the per-sample dict and per-barcode list on first sight, then append
        barcodeSamples.setdefault(samp, {}).setdefault(bc, []).append(otu)

In [9]:
#Quantify unique pairings
#pairDicts[sample] maps an OTU combination (distinct OTU ids, sorted and joined by '_')
#to the number of droplet barcodes in which exactly that combination was seen
pairDicts = {}
for s in sampIDs:
    if s not in barcodeSamples:
        continue
    uniquePairs = {}
    for bc in barcodeSamples[s]:
        uniqueOTUs = set(barcodeSamples[s][bc])
        #Only barcodes shared by two or more different OTUs count as a pairing
        if len(uniqueOTUs) < 2:
            continue
        #Sort before joining: set iteration order is arbitrary, so without sorting the
        #same combination could be tallied under different keys (e.g. 'A_B' and 'B_A')
        pairString = '_'.join(sorted(uniqueOTUs))
        uniquePairs[pairString] = uniquePairs.get(pairString, 0) + 1
    pairDicts[s] = uniquePairs

In [13]:
#Import taxonomic information
#The input path matches the '-o 06_taxonomy' output folder of the assign_taxonomy.py
#command recorded in this notebook's notes; 'mothur' is passed through to eb.importTaxonomy
#NOTE(review): taxDict is presumably keyed by OTU id -- confirm in epicBarcoder
taxDict = eb.importTaxonomy(dataDir + '06_taxonomy/06_combined_otus_tax_assignments.txt', 'mothur')

In [7]:
#OTU1 is s. onei (#OTU2009, OTU2008)
#OTU60 is b. sub...is it anywhere?
#OTU1512 is definitely e. coli
#OTU2009, OTU1865, OTU3394, OTU2732, OTU989 may be e. coli

1