**Plaque Barcoding pipeline**  
Follow along our analysis steps and reproduce our results with the scripts below.  

**Dependencies**
* epicBarcoder (custom library on github)
* pandas
* pear
* usearch v9.2
* sina v1.2.11
* fasttree

In [29]:
import os
import sys
import subprocess
import epicBarcoder as eb
from itertools import combinations
from scipy.stats import poisson
import pandas as pd

In [2]:
#Set up personal computing environment with paths to necessary tools and data directories
env = os.environ
dataDir = '/home/ubuntu/users/sjspence/170214_OM8/04_jupyter/'
pearPath = '/usr/local/bin/pear'
usearchPath = '/home/ubuntu/users/sjspence/tools/usearch9.2.64_i86linux32'
sinaPath = '/home/ubuntu/bin/sina-1.2.11/sina'
fasttreePath = '/home/ubuntu/bin/FastTree_dd'

In [8]:
#Import and edit mapping file
sampIDs = []
mapping = {}
readCounts = {}
with open(dataDir + 'OM8_map.txt', 'r') as inFile:
    for line in inFile:
        if '#' not in line:
            line = line.strip().split('\t')
            mapping[line[1]] = line[0].replace('_','s')
            readCounts[line[1]] = 0
            sampIDs.append(line[0].replace('_','s'))
inFile.close()

In [None]:
#Join paired-end reads
subprocess.call([pearPath, '-f', dataDir + '170214Alm_D17-2046_1_sequence.fastq', 
                 '-r', dataDir + '170214Alm_D17-2046_2_sequence.fastq', '-o', dataDir + '01_pear'])

In [16]:
#Break up file into pieces that usearch can use (5 million lines each)
#Run this to completion before running next section
inFile = open(dataDir + '01_pear.assembled.fastq', 'r')
if not os.path.exists(dataDir + '02_pearSplits/'):
    os.makedirs(dataDir + '02_pearSplits/')
i = 0
j = 1
partFile = open(dataDir + '02_pearSplits/pear_' + str(j) + '.fastq', 'w')
for line in inFile:
    if i >= j*5000000:
        partFile.close()
        j += 1
        partFile = open(dataDir + '02_pearSplits/pear_' + str(j) + '.fastq', 'w')
    partFile.write(line)
    i += 1
partFile.close()
inFile.close()
print(i)

71812424


In [23]:
#Quality filter with usearch 9 max-error rate
def qualFilter(inFile, outFile):
    subprocess.call([usearchPath, "-fastq_filter", inFile, "-fastq_minlen", '100', '-fastq_maxee_rate', '0.01',
                     "-fastqout", outFile], env=env)
for filename in os.listdir(dataDir + '02_pearSplits/'):
    qualFilter(dataDir + '02_pearSplits/' + filename, dataDir + '02_pearSplits/' + filename.replace('.fastq','filt.fastq'))

In [25]:
#Join quality-filtered files back into a single file for processing
with open(dataDir + '02_pear_filt.fastq', 'w') as outfile:
    for fname in os.listdir(dataDir + '02_pearSplits/'):
        if 'filt' in fname:
            with open(dataDir + '02_pearSplits/' + fname, 'r') as infile:
                for line in infile:
                    outfile.write(line)
            infile.close()
outfile.close()

In [38]:
#Demultiplex: check for barcodes and relabel sequences
#Use mapping file to keep barcoded sequences, prepare fasta file
with open(dataDir + '02_pear_filt.fastq', 'r') as inFile:
    with open(dataDir + '03_pear_filt.fasta', 'w') as outFile:
        i = 0
        j = 0
        nextSeq = False
        for line in inFile:
            if nextSeq:
                outFile.write(line)
                nextSeq = False
            if i%4 == 0:
                for bc in mapping:
                    if bc in line:
                        readCounts[bc] += 1
                        newLine = line.strip().replace('@','>' + mapping[bc] + '_' + str(j) + ' ')
                        newLine = newLine + ' orig_bc=' + bc + ' new_bc=' + bc + ' bc_diffs=0\n'
                        outFile.write(newLine)
                        nextSeq = True
                        j += 1
            i += 1
inFile.close()
outFile.close()
#Summarize read mapping after quality filtering and zero-error barcode matching
total = 0
summaryFile = open(dataDir + '03_quality_summary.txt', 'w')
for s in sampIDs:
    for bc in mapping:
        if mapping[bc] == s:
            summaryFile.write(s + '\t' + str(readCounts[bc]) + '\n')
            total += readCounts[bc]
summaryFile.write('Total\t' + str(total))
summaryFile.close()

In [3]:
#Primer check and removal, placing droplet barcode into header
#NOTE: this takes a while
qualReads = eb.importFasta(dataDir + '03_pear_filt.fasta')
noPrimerReads = eb.filtBarcodePrimers(qualReads, 20, 'GATCATGACCCATTTGGAGAAGATG', 'GGACTACHVGGGTWTCTAAT')
eb.exportFasta(noPrimerReads, dataDir + '04_pear_noPrimers.fasta')
print(len(qualReads))
print(len(noPrimerReads))
print(noPrimerReads[0].header)
print(noPrimerReads[0].seq)

In [3]:
#Collapse identical reads and maintain the provenance to reduce the uclust file size
#uniqueDict maps a unique sequence to a list of read objects which contain it
#NOTE: takes a long time, but rerun after notebook closes out
noPrimerReads = eb.importFasta(dataDir + '04_pear_noPrimers.fasta')
uniqueDict = eb.getUniqueSeqs(noPrimerReads, dataDir + '05_unique_seqs.fasta')

In [3]:
#Use the usearch unoise algorithm to create zero radius OTUs (zOTUs), while also discarding chimeras, phiX sequences,
#and low complexity DNA
#Input: unique sequences collapsed from quality- and primer- filtered data
#Output: Denoised file with true biological reads
#        Database file with true amplicon reads including chimeras
subprocess.call([usearchPath, '-unoise2', dataDir + '05_unique_seqs.fasta', '-fastaout', dataDir + '06_denoised.fa',
                 '-otudbout', dataDir + '06_db.fa', '-minampsize', '3'], env=env)

0

**Unoise output**  
00:02 332Mb   100.0% Reading 05_unique_seqs.fasta  
00:04 352Mb   100.0% 3955 amplicons, 1791728 bad (size >= 3)
01:55 359Mb   100.0% 354 good, 3601 chimeras

46735 corrected amplicon sequences (including chimeras) in 06_db.fa  
354 output biological sequences in 06_denoised.fa

In [7]:
#NOTE: Only need to do this once

#Format fasta database for input to SINTAX
#Maintained HOMD HOT strain ID in header following the taxonomic information
#Example SINTAX header structure below:
#>AB008314;tax=d:Bacteria,p:Firmicutes,c:Bacilli,o:Lactobacillales,f:Streptococcaceae,g:Streptococcus;
outFile = open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.fasta', 'w')
taxDict = {}
with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.qiime_spike.taxonomy', 'r') as t:
    for line in t:
        line = line.strip().split('\t')
        taxID = line[0]
        tax = line[1].strip().replace('__',':')
        tax = tax.replace(';',',')
        taxDict[taxID] = tax
with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_spike.fasta', 'r') as f:
    for line in f:
        if '>' in line:
            line = line.strip().split(' ')
            taxInfo = taxDict[line[0].replace('>','')]
            outLine = line[0] + ';tax=' + taxInfo + ';'
            for i in line:
                if 'HOT' in i:
                    outLine += i + ';'
            outFile.write(outLine + '\n')
        else:
            outFile.write(line)
outFile.close()
subprocess.call([usearchPath, '-makeudb_sintax', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.fasta', 
                 '-output', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.udb'], env=env)

0

**Database formatting output**  
00:00 14Mb   1020 names, tax levels min 7, avg 7.0, max 7  
WARNING: 25 taxonomy nodes have >1 parent  
00:00 14Mb   Buffers (892 seqs)

In [4]:
#Run SINTAX to determine denoised read taxonomic information
#Default is to run one thread per CPU core, or 10 threads if there are > 10 cores
subprocess.call([usearchPath, '-sintax', dataDir + '06_denoised.fa', 
                 '-db', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.udb', 
                 '-tabbedout', dataDir + '07_denoised.sintax', 
                 '-strand', 'plus', '-sintax_cutoff', '0.8', '-threads', '4'], env=env)

0

In [4]:
#Combine taxonomic information to export final file with droplet barcodes and taxonomies
#06_denoised.fa: header matches the first tabbed column of sintax output (minus '>'), sequence follows
#Import list of read objects from unoise2 denoised file
denoised = eb.importFasta(dataDir + '06_denoised.fa')

#Import Otu header:[tax probabilities, taxonomy] dictionary from SINTAX output
taxDict = eb.importSintax(dataDir + '07_denoised.sintax')

#Take denoised zOTUs and taxonomic information, then map back to original reads and rewrite original read file with
#zOTU and taxonomic information in the headers
eb.otuToHeaders(denoised, taxDict, uniqueDict, dataDir + '08_all_seqs_tax.fa')

**Read loss from unoise2**

24490106 04_pear_noPrimers.fasta = 12,245,053 reads prior to unoise2  
18228922 08_all_seqs_tax.fa = 9,114,461 reads after unoise2  

Approximately 25% read loss.

In [5]:
#Within each sample, group by barcode; quantify unique barcode pairings
#Input: Fasta file with droplet barcode, otu, and taxonomic information in the header
#Output: A dictionary where each sampID maps to a dictionary of droplet barcodes:[otu1, otu2]
#        A dictionary where each OTU maps to a >80% taxonomy
def createBarcodeDict(inFileName):
    inFile = open(inFileName, 'r')
    barcodeSamples = {}
    taxDict = {}
    for line in inFile:
        if '>' in line:
            line = line.strip().split(';')
            samp = line[0].split('_')[0].replace('>','')
            bc = line[0].split('droplet_bc=')[1]
            otu = line[1]
            tax = line[2].replace('tax=','')
            if samp not in barcodeSamples:
                barcodeSamples[samp] = {bc:[otu]}
            else:
                if bc not in barcodeSamples[samp]:
                    barcodeSamples[samp][bc] = [otu]
                else:
                    barcodeSamples[samp][bc].append(otu)
            if otu not in taxDict:
                taxDict[otu] = tax
    inFile.close()
    return barcodeSamples, taxDict

In [6]:
barcodeDict, taxDict = createBarcodeDict(dataDir + '08_all_seqs_tax.fa')

In [9]:
#Print number of barcodes per sample
for s in sampIDs:
    if s in barcodeDict:
        sample = barcodeDict[s]
        print(s + '\t' + str(len(sample)))

OM8s01	31521
OM8s02	53460
OM8s03	39500
OM8s04	55974
OM8s05	42590
OM8s06	32403
OM8s07	41010
OM8s08	14979
OM8s09	26614
OM8s10	19314
OM8s11	49619
OM8s12	25415
OM8s13	24261
OM8s14	37860
OM8s15	5618
OM8s16	11338
OM8s17	69714
OM8s18	28819
OM8s19	40
OM8s20	82
OM8s21	221187
OM8s22	70931
OM8s23	85035
OM8s24	58186
OM8s25	145759
OM8s26	82787
OM8s27	7
OM8s28	29627
OM8s29	145
OM8s30	68
OM8s31	9
OM8s32	1
OM8s34	2


In [10]:
#Quantify unique pairings
pairDicts = {}
for s in sampIDs:
    if s in barcodeDict:
        uniquePairs = {}
        for bc in barcodeDict[s]:
            if len(barcodeDict[s][bc]) != 1:
                uniqueOTUs = set(barcodeDict[s][bc])
                if len(uniqueOTUs) != 1:
                    pairString = '_'.join(list(uniqueOTUs))
                    if pairString not in uniquePairs:
                        uniquePairs[pairString] = 1
                    else:
                        uniquePairs[pairString] += 1
        pairDicts[s] = uniquePairs

In [11]:
#Calculate abundances of background OTUs based on singleton barcodes
abundances = {}
totals = {}
for s in sampIDs:
    if s not in barcodeDict:
        continue
    total = 0
    backgroundOTU = {}
    for bc in barcodeDict[s]:
        if len(barcodeDict[s][bc]) == 1:
            otu = barcodeDict[s][bc][0]
            if otu not in backgroundOTU:
                backgroundOTU[otu] = 1
            else:
                backgroundOTU[otu] += 1
            total += 1
    abundances[s] = backgroundOTU
    totals[s] = total

In [12]:
#Singleton counts for different samples
for s in sampIDs:
    if s in totals:
        print(s + ': ' + str(totals[s]))

OM8s01: 11474
OM8s02: 39245
OM8s03: 30574
OM8s04: 42100
OM8s05: 27009
OM8s06: 22064
OM8s07: 36327
OM8s08: 5744
OM8s09: 13476
OM8s10: 17249
OM8s11: 43814
OM8s12: 22287
OM8s13: 19947
OM8s14: 18799
OM8s15: 3269
OM8s16: 6909
OM8s17: 51128
OM8s18: 20161
OM8s19: 32
OM8s20: 72
OM8s21: 127814
OM8s22: 56249
OM8s23: 59647
OM8s24: 40453
OM8s25: 95863
OM8s26: 57738
OM8s27: 6
OM8s28: 14580
OM8s29: 74
OM8s30: 39
OM8s31: 5
OM8s32: 1
OM8s34: 2


In [13]:
#Convert abundance counts to relative abundances
relAbundances = {}
for s in sampIDs:
    if s in abundances:
        relAbund = {}
        for otu in abundances[s]:
            relAbund[otu] = float(abundances[s][otu]) / totals[s]
        relAbundances[s] = relAbund

In [14]:
#Create dictionary mapping unique otu pairs to number of barcodes supporting it
pairDict = {}
for s in sampIDs:
    if s not in barcodeDict:
        continue
    pairs = {}
    for bc in barcodeDict[s]:
        otuList = barcodeDict[s][bc]
        if len(otuList) > 1:
            uniqueOTUs = list(set(otuList))
            if len(uniqueOTUs) > 1:
                pairList = ['_'.join(list(comb)) for comb in combinations(uniqueOTUs, 2)]
                for p in pairList:
                    if p not in pairs:
                        pairs[p] = 1
                    else:
                        pairs[p] += 1
    pairDict[s] = pairs

In [15]:
#Calculate poisson probabilities that two bugs would co-occur and filter results based on that
#Print tab-delimited format showing total #pairs, #significant pairs, #shew to other, #shew to self
cutoff = 0.00001
for s in sampIDs:
    if s not in pairDict:
        continue
    i = 0
    t = 0
    shew = 0
    doubleShew = 0
    for otuPair in pairDict[s]:
        t += 1
        otu1 = otuPair.split('_')[0]
        otu2 = otuPair.split('_')[1]
        if otu1 in relAbundances[s]:
            a1 = relAbundances[s][otu1]
        else:
            a1 = 0.0
        if otu2 in relAbundances[s]:
            a2 = relAbundances[s][otu2]
        else:
            a2 = 0.0
        x = pairDict[s][otuPair]
        mu = a1 * a2 * totals[s]
        p = poisson.pmf(x, mu)
        if p < cutoff:
            i += 1
            if ('oneidensis' in taxDict[otu1]) and ('oneidensis' in taxDict[otu2]):
                doubleShew += 1
            elif ('oneidensis' in taxDict[otu1]) or ('oneidensis' in taxDict[otu2]):
                shew += 1
    print(s + '\t' + str(t) + '\t' + str(i) + '\t' + str(shew) + '\t' + str(doubleShew))

OM8s01	1311	311	68	4
OM8s02	1109	168	59	3
OM8s03	1132	160	47	3
OM8s04	1506	227	70	4
OM8s05	2466	494	102	6
OM8s06	2445	514	84	7
OM8s07	682	291	92	1
OM8s08	1614	403	66	1
OM8s09	462	59	2	0
OM8s10	205	67	29	1
OM8s11	1481	311	77	1
OM8s12	940	161	36	1
OM8s13	46	18	9	8
OM8s14	15	10	4	6
OM8s15	11	2	0	2
OM8s16	13	10	4	6
OM8s17	21	14	7	7
OM8s18	13	9	3	6
OM8s19	0	0	0	0
OM8s20	0	0	0	0
OM8s21	3462	1682	272	11
OM8s22	734	120	30	2
OM8s23	1420	220	67	10
OM8s24	1737	280	67	3
OM8s25	678	135	12	1
OM8s26	1084	242	11	1
OM8s27	0	0	0	0
OM8s28	9	6	3	1
OM8s29	3	0	0	0
OM8s30	0	0	0	0
OM8s31	0	0	0	0
OM8s32	0	0	0	0
OM8s34	0	0	0	0


In [16]:
#What are the s. oneidensis relative abundances?
for s in sampIDs:
    print(s)
    for otu in relAbundances[s]:
        if 'oneidensis' in taxDict[otu]:
            print(otu + '\t' + str(relAbundances[s][otu]))
    print('\n')

OM8s01
Otu1	0.0637092557086
Otu55	0.000435767822904
Otu3	0.046365696357
Otu73	0.000348614258323
Otu183	8.71535645808e-05


OM8s02
Otu73	0.00119760479042
Otu241	7.64428589629e-05
Otu55	0.00211491909797
Otu3	0.173372404128
Otu1	0.243037329596
Otu208	5.09619059753e-05
Otu217	2.54809529876e-05
Otu183	0.000203847623901


OM8s03
Otu329	3.27075292732e-05
Otu73	0.000425197880552
Otu241	6.54150585465e-05
Otu55	0.000981225878197
Otu3	0.0846797932884
Otu1	0.146562438673
Otu217	0.000130830117093
Otu183	0.000425197880552


OM8s04
Otu73	0.000926365795724
Otu241	2.37529691211e-05
Otu55	0.00121140142518
Otu3	0.120831353919
Otu1	0.21945368171
Otu183	0.000475059382423


OM8s05
Otu73	0.00159206190529
Otu317	3.70246954719e-05
Otu241	3.70246954719e-05
Otu55	0.00196230886001
Otu3	0.216039098078
Otu1	0.360842682069
Otu276	3.70246954719e-05
Otu217	0.000296197563775
Otu183	0.000333222259247


OM8s06
Otu338	4.5322697607e-05
Otu73	0.000407904278463
Otu55	0.000634517766497
Otu3	0.194479695431
Otu1	0.340328136331


KeyError: 'OM8s33'

NOTES: OK shewanella was divided into a bunch of OTUs, so probably makes sense to combine them for analysis.  This will likely increase the rel. abundance of S. oneidensis and reduce the significant connections in our poisson model.

In [17]:
#Try cluster_fast clustering
subprocess.call([usearchPath, '-cluster_fast', dataDir + '06_denoised.fa', '-id', '0.97', '-centroids', 
                dataDir + '09_otu_clusters.fa', '-uc', dataDir + '09_otu_clusters.uc'], env=env)

0

      Seqs  354  
  Clusters  154  
  Max size  17  
  Avg size  2.3  
  Min size  1  
Singletons  74, 20.9% of seqs, 48.1% of clusters  
   Max mem  83Mb  
      Time  1.00s  
Throughput  354.0 seqs/sec.

In [18]:
#Import list of read objects from unoise2 denoised file
denoised = eb.importFasta(dataDir + '06_denoised.fa')

#Import Otu header:[tax probabilities, taxonomy] dictionary from SINTAX output
taxDict = eb.importSintax(dataDir + '07_denoised.sintax')

#Import hits from 97% fast clustering
hits = eb.importClusterFast(dataDir + '09_otu_clusters.uc')

In [19]:
#FIRST TRY TO SEE HOW MUCH SHEWANELLA COLLAPSES BY TAXONOMY
i = 0
for t in taxDict:
    if i == 0:
        print(t)
        print(taxDict[t])
    i += 1

Otu6;uniq=OM8s21_7
['k:Bacteria(1.0000),p:Fusobacteria(1.0000),c:Fusobacteriia(1.0000),o:Fusobacteriales(1.0000),f:Fusobacteriaceae(1.0000),g:Fusobacterium(1.0000),s:nucleatum_subsp._polymorphum(0.7700)', 'k:Bacteria,p:Fusobacteria,c:Fusobacteriia,o:Fusobacteriales,f:Fusobacteriaceae,g:Fusobacterium']


In [20]:
#Check number of shewanella seqs in 97% otu clusters
i = 0
j = 0
shewSeqs = []
for h in hits:
    if j == 0:
        print(h)
    j += 1
    seqID = h.split(' ')[0]
    if 'oneidensis' in taxDict[seqID][1]:
        i += 1
print(i)

Otu111;uniq=OM8s08_7689 HWI-M04407:1:2105:24185:5342#TCTGTATG/1 orig_bc=TCTGTATG new_bc=TCTGTATG bc_diffs=0 droplet_bc=TTTGCCTTGAGCAGAGAACA;size=1956;
6


In [22]:
shewReads = []
for read in denoised:
    if (read.header.replace('>','') in hits) and ('Shew' in taxDict[read.seq_id][1]):
        shewReads.append(read)
print(len(shewReads))
eb.exportFasta(shewReads, dataDir + 'multiple_shew_97_otus.fa')

6


In [28]:
#How many taxonomic indications map all the way to species?  Can I make a taxonomy OTU table?
total = 0
phylum = 0
clas = 0
order = 0
family = 0
genus = 0
species = 0
for t in taxDict:
    total += 1
    if 's:' in taxDict[t][1]:
        species +=1
    elif 'g:' in taxDict[t][1]:
        genus += 1
    elif 'f:' in taxDict[t][1]:
        family += 1
    elif 'o:' in taxDict[t][1]:
        order += 1
    elif 'c:' in taxDict[t][1]:
        clas += 1
    elif 'p:' in taxDict[t][1]:
        phylum += 1
print(total)
print(phylum)
print(clas)
print(order)
print(family)
print(genus)
print(species)

354
10
5
4
7
88
233


In [None]:
#For every taxonomic assignment, choose the most abundant representative sequence
#Build a pandas OTU table
taxOTUs = {}
for read in denoised:
    if 
    
    
    if (read.header.replace('>','') in hits) and ('Shew' in taxDict[read.seq_id][1]):
        shewReads.append(read)