In [1]:
import os
import sys
import subprocess
import epicBarcoder as eb
from itertools import combinations
from scipy.stats import poisson

In [2]:
env = os.environ
dataDir = '/home/ubuntu/users/sjspence/170214_OM8/03_jupyter/'
pearPath = '/usr/local/bin/pear'
#usearchPath = '/home/ubuntu/bin/usearch8'
usearchPath = '/home/ubuntu/users/sjspence/tools/usearch9.2.64_i86linux32'
sinaPath = '/home/ubuntu/bin/sina-1.2.11/sina'
fasttreePath = '/home/ubuntu/bin/FastTree_dd'

In [None]:
#Join paired-end reads
subprocess.call([pearPath, '-f', dataDir + '170214Alm_D17-2046_1_sequence.fastq', 
                 '-r', dataDir + '170214Alm_D17-2046_2_sequence.fastq', '-o', dataDir + '01_pear'])

In [16]:
#Break up file into pieces that usearch can use (5 million lines each)
#Run this to completion before running next section
inFile = open(dataDir + '01_pear.assembled.fastq', 'r')
if not os.path.exists(dataDir + '02_pearSplits/'):
    os.makedirs(dataDir + '02_pearSplits/')
i = 0
j = 1
partFile = open(dataDir + '02_pearSplits/pear_' + str(j) + '.fastq', 'w')
for line in inFile:
    if i >= j*5000000:
        partFile.close()
        j += 1
        partFile = open(dataDir + '02_pearSplits/pear_' + str(j) + '.fastq', 'w')
    partFile.write(line)
    i += 1
partFile.close()
inFile.close()
print(i)

71812424


In [23]:
#Quality filter with usearch 9 max-error rate
def qualFilter(inFile, outFile):
    subprocess.call([usearchPath, "-fastq_filter", inFile, "-fastq_minlen", '100', '-fastq_maxee_rate', '0.01',
                     "-fastqout", outFile], env=env)
for filename in os.listdir(dataDir + '02_pearSplits/'):
    qualFilter(dataDir + '02_pearSplits/' + filename, dataDir + '02_pearSplits/' + filename.replace('.fastq','filt.fastq'))

In [25]:
#Join quality-filtered files back into a single file for processing
with open(dataDir + '02_pear_filt.fastq', 'w') as outfile:
    for fname in os.listdir(dataDir + '02_pearSplits/'):
        if 'filt' in fname:
            with open(dataDir + '02_pearSplits/' + fname, 'r') as infile:
                for line in infile:
                    outfile.write(line)
            infile.close()
outfile.close()

In [6]:
#Import and edit mapping file
sampIDs = []
mapping = {}
readCounts = {}
with open(dataDir + 'OM8_map.txt', 'r') as inFile:
    for line in inFile:
        if '#' not in line:
            line = line.strip().split('\t')
            mapping[line[1]] = line[0].replace('_','s')
            readCounts[line[1]] = 0
            sampIDs.append(line[0].replace('_','s'))
inFile.close()

In [38]:
#Demultiplex: check for barcodes and relabel sequences
#Use mapping file to keep barcoded sequences, prepare fasta file
with open(dataDir + '02_pear_filt.fastq', 'r') as inFile:
    with open(dataDir + '03_pear_filt.fasta', 'w') as outFile:
        i = 0
        j = 0
        nextSeq = False
        for line in inFile:
            if nextSeq:
                outFile.write(line)
                nextSeq = False
            if i%4 == 0:
                for bc in mapping:
                    if bc in line:
                        readCounts[bc] += 1
                        newLine = line.strip().replace('@','>' + mapping[bc] + '_' + str(j) + ' ')
                        newLine = newLine + ' orig_bc=' + bc + ' new_bc=' + bc + ' bc_diffs=0\n'
                        outFile.write(newLine)
                        nextSeq = True
                        j += 1
            i += 1
inFile.close()
outFile.close()
#Summarize read mapping after quality filtering and zero-error barcode matching
total = 0
summaryFile = open(dataDir + '03_quality_summary.txt', 'w')
for s in sampIDs:
    for bc in mapping:
        if mapping[bc] == s:
            summaryFile.write(s + '\t' + str(readCounts[bc]) + '\n')
            total += readCounts[bc]
summaryFile.write('Total\t' + str(total))
summaryFile.close()

In [3]:
#Primer check and removal, placing droplet barcode into header
#NOTE: this takes a while
qualReads = eb.importFasta(dataDir + '03_pear_filt.fasta')
noPrimerReads = eb.filtBarcodePrimers(qualReads, 20, 'GATCATGACCCATTTGGAGAAGATG', 'GGACTACHVGGGTWTCTAAT')
eb.exportFasta(noPrimerReads, dataDir + '04_pear_noPrimers.fasta')
print(len(qualReads))
print(len(noPrimerReads))
print(noPrimerReads[0].header)
print(noPrimerReads[0].seq)

In [3]:
#Collapse identical reads and maintain the provenance to reduce the uclust file size
#uniqueDict maps a unique sequence to a list of read objects which contain it
#NOTE: takes a long time, but rerun after notebook closes out
noPrimerReads = eb.importFasta(dataDir + '04_pear_noPrimers.fasta')
uniqueDict = eb.getUniqueSeqs(noPrimerReads, dataDir + '05_unique_seqs.fasta')

In [3]:
#Use the usearch unoise algorithm to create zero radius OTUs (zOTUs), while also discarding chimeras, phiX sequences,
#and low complexity DNA
#Input: unique sequences collapsed from quality- and primer- filtered data
#Output: Denoised file with true biological reads
#        Database file with true amplicon reads including chimeras
subprocess.call([usearchPath, '-unoise2', dataDir + '05_unique_seqs.fasta', '-fastaout', dataDir + '06_denoised.fa',
                 '-otudbout', dataDir + '06_db.fa', '-minampsize', '1'], env=env)

0

**Unoise output**  
00:02 332Mb   100.0% Reading 05_unique_seqs.fasta  
01:04 637Mb   100.0% 27163 amplicons, 2096896 bad (size >= 1)  
46:06 650Mb   100.0% 14693 good, 12470 chimeras

245334 corrected amplicon sequences (including chimeras) in 06_db.fa  
14693 output biological sequences in 06_denoised.fa

In [7]:
#Format fasta database for input to SINTAX
#>AB008314;tax=d:Bacteria,p:Firmicutes,c:Bacilli,o:Lactobacillales,f:Streptococcaceae,g:Streptococcus;
#Maintained HOMD HOT strain ID in header following the taxonomic information
outFile = open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.fasta', 'w')
taxDict = {}
with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.qiime_spike.taxonomy', 'r') as t:
    for line in t:
        line = line.strip().split('\t')
        taxID = line[0]
        tax = line[1].strip().replace('__',':')
        tax = tax.replace(';',',')
        taxDict[taxID] = tax
with open(dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_spike.fasta', 'r') as f:
    for line in f:
        if '>' in line:
            line = line.strip().split(' ')
            taxInfo = taxDict[line[0].replace('>','')]
            outLine = line[0] + ';tax=' + taxInfo + ';'
            for i in line:
                if 'HOT' in i:
                    outLine += i + ';'
            outFile.write(outLine + '\n')
        else:
            outFile.write(line)
outFile.close()
subprocess.call([usearchPath, '-makeudb_sintax', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.fasta', 
                 '-output', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.udb'], env=env)

0

**Database formatting output**  
00:00 14Mb   1020 names, tax levels min 7, avg 7.0, max 7  
WARNING: 25 taxonomy nodes have >1 parent  
00:00 14Mb   Buffers (892 seqs)

In [8]:
#Run SINTAX to determine denoised read taxonomic information
#Default is to run one thread per CPU core, or 10 threads if there are > 10 cores
subprocess.call([usearchPath, '-sintax', dataDir + '06_denoised.fa', 
                 '-db', dataDir + 'HOMD_16S_rRNA_RefSeq_V14.5.p9_sintax_spike.udb', 
                 '-tabbedout', dataDir + '07_denoised.sintax', 
                 '-strand', 'plus', '-sintax_cutoff', '0.8', '-threads', '4'], env=env)

0

In [4]:
#Combine taxonomic information to export final file with droplet barcodes and taxonomies
#06_denoised.fa: header matches the first tabbed column of sintax output (minus '>'), sequence follows
#Import list of read objects from unoise2 denoised file
denoised = eb.importFasta(dataDir + '06_denoised.fa')

#Import Otu header:[tax probabilities, taxonomy] dictionary from SINTAX output
taxDict = eb.importSintax(dataDir + '07_denoised.sintax')

#Take denoised zOTUs and taxonomic information, then map back to original reads and rewrite original read file with
#zOTU and taxonomic information in the headers
eb.otuToHeaders(denoised, taxDict, uniqueDict, dataDir + '08_all_seqs_tax.fa')

**Read loss from unoise2**

24490106 04_pear_noPrimers.fasta = 12,245,053 reads prior to unoise2  
18257786 08_allSeqsTax.fa = 9,128,893 reads after unoise2  

Approximately 25% read loss.

In [3]:
#Within each sample, group by barcode; quantify unique barcode pairings
#Input: Fasta file with droplet barcode, otu, and taxonomic information in the header
#Output: A dictionary where each sampID maps to a dictionary of droplet barcodes:[otu1, otu2]
#        A dictionary where each OTU maps to a >80% taxonomy
def createBarcodeDict(inFileName):
    inFile = open(inFileName, 'r')
    barcodeSamples = {}
    taxDict = {}
    for line in inFile:
        if '>' in line:
            line = line.strip().split(';')
            samp = line[0].split('_')[0].replace('>','')
            bc = line[0].split('droplet_bc=')[1]
            otu = line[1]
            tax = line[2].replace('tax=','')
            if samp not in barcodeSamples:
                barcodeSamples[samp] = {bc:[otu]}
            else:
                if bc not in barcodeSamples[samp]:
                    barcodeSamples[samp][bc] = [otu]
                else:
                    barcodeSamples[samp][bc].append(otu)
            if otu not in taxDict:
                taxDict[otu] = tax
    inFile.close()
    return barcodeSamples, taxDict

In [10]:
barcodeDict, taxDict = createBarcodeDict(dataDir + '08_all_seqs_tax.fa')

In [11]:
#Print number of barcodes per sample
for s in sampIDs:
    if s in barcodeDict:
        sample = barcodeDict[s]
        print(s + '\t' + str(len(sample)))

OM8s01	31613
OM8s02	53727
OM8s03	39763
OM8s04	56228
OM8s05	42798
OM8s06	32556
OM8s07	41151
OM8s08	15048
OM8s09	26675
OM8s10	19429
OM8s11	49765
OM8s12	25560
OM8s13	24283
OM8s14	37869
OM8s15	5625
OM8s16	11355
OM8s17	69740
OM8s18	28834
OM8s19	40
OM8s20	83
OM8s21	221776
OM8s22	71263
OM8s23	85362
OM8s24	58431
OM8s25	145900
OM8s26	82931
OM8s27	7
OM8s28	29651
OM8s29	146
OM8s30	68
OM8s31	9
OM8s32	1
OM8s34	2


In [12]:
#Quantify unique pairings
pairDicts = {}
for s in sampIDs:
    if s in barcodeDict:
        uniquePairs = {}
        for bc in barcodeDict[s]:
            if len(barcodeDict[s][bc]) != 1:
                uniqueOTUs = set(barcodeDict[s][bc])
                if len(uniqueOTUs) != 1:
                    pairString = '_'.join(list(uniqueOTUs))
                    if pairString not in uniquePairs:
                        uniquePairs[pairString] = 1
                    else:
                        uniquePairs[pairString] += 1
        pairDicts[s] = uniquePairs

In [17]:
#Calculate abundances of background OTUs based on singleton barcodes
abundances = {}
totals = {}
for s in sampIDs:
    if s not in barcodeDict:
        continue
    total = 0
    backgroundOTU = {}
    for bc in barcodeDict[s]:
        if len(barcodeDict[s][bc]) == 1:
            otu = barcodeDict[s][bc][0]
            if otu not in backgroundOTU:
                backgroundOTU[otu] = 1
            else:
                backgroundOTU[otu] += 1
            total += 1
    abundances[s] = backgroundOTU
    totals[s] = total

In [20]:
#Singleton counts for different samples
for s in sampIDs:
    if s in totals:
        print(s + ': ' + str(totals[s]))

OM8s01: 11533
OM8s02: 39422
OM8s03: 30791
OM8s04: 42288
OM8s05: 27164
OM8s06: 22181
OM8s07: 36436
OM8s08: 5793
OM8s09: 13520
OM8s10: 17338
OM8s11: 43941
OM8s12: 22391
OM8s13: 19966
OM8s14: 18805
OM8s15: 3276
OM8s16: 6924
OM8s17: 51150
OM8s18: 20174
OM8s19: 32
OM8s20: 73
OM8s21: 128199
OM8s22: 56484
OM8s23: 59881
OM8s24: 40608
OM8s25: 95972
OM8s26: 57840
OM8s27: 6
OM8s28: 14600
OM8s29: 75
OM8s30: 39
OM8s31: 5
OM8s32: 1
OM8s34: 2


In [22]:
#Convert abundance counts to relative abundances
relAbundances = {}
for s in sampIDs:
    if s in abundances:
        relAbund = {}
        for otu in abundances[s]:
            relAbund[otu] = float(abundances[s][otu]) / totals[s]
        relAbundances[s] = relAbund

In [36]:
#Create dictionary mapping unique otu pairs to number of barcodes supporting it
pairDict = {}
for s in sampIDs:
    if s not in barcodeDict:
        continue
    pairs = {}
    for bc in barcodeDict[s]:
        otuList = barcodeDict[s][bc]
        if len(otuList) > 1:
            uniqueOTUs = list(set(otuList))
            if len(uniqueOTUs) > 1:
                pairList = ['_'.join(list(comb)) for comb in combinations(uniqueOTUs, 2)]
                for p in pairList:
                    if p not in pairs:
                        pairs[p] = 1
                    else:
                        pairs[p] += 1
    pairDict[s] = pairs
#    for otu in relAbundances[s]:
#        relAbundances[s][otu]

In [54]:
#Calculate poisson probabilities that two bugs would co-occur and filter results based on that
for s in sampIDs:
    if s not in pairDict:
        continue
    i = 0
    t = 0
    shew = 0
    doubleShew = 0
    for otuPair in pairDict[s]:
        t += 1
        otu1 = otuPair.split('_')[0]
        otu2 = otuPair.split('_')[1]
        if otu1 in relAbundances[s]:
            a1 = relAbundances[s][otu1]
        else:
            a1 = 0.0
        if otu2 in relAbundances[s]:
            a2 = relAbundances[s][otu2]
        else:
            a2 = 0.0
        x = pairDict[s][otuPair]
        mu = a1 * a2 * totals[s]
        p = poisson.pmf(x, mu)
        if p < 0.000000000000001:
            i += 1
            if ('oneidensis' in taxDict[otu1]) or ('oneidensis' in taxDict[otu2]):
                shew += 1
                if ('oneidensis' in taxDict[otu1]) and ('oneidensis' in taxDict[otu2]):
                    doubleShew += 1
    print(s + '\t' + str(t) + '\t' + str(i) + '\t' + str(shew) + '\t' + str(doubleShew))

OM8s01	2478	150	39	2
OM8s02	1867	72	39	1
OM8s03	1777	51	24	2
OM8s04	3208	93	30	2
OM8s05	5745	194	48	2
OM8s06	6674	204	48	4
OM8s07	31818	119	71	1
OM8s08	5296	139	31	1
OM8s09	948	26	0	0
OM8s10	1000	34	20	1
OM8s11	2948	123	33	1
OM8s12	3210	46	14	0
OM8s13	341	6	6	4
OM8s14	117	6	6	6
OM8s15	84	0	0	0
OM8s16	192	6	6	6
OM8s17	292	11	11	9
OM8s18	211	4	4	4
OM8s19	0	0	0	0
OM8s20	1	0	0	0
OM8s21	26904	1111	204	10
OM8s22	1304	53	15	2
OM8s23	3428	80	27	4
OM8s24	4865	97	34	2
OM8s25	2588	68	6	1
OM8s26	1959	136	9	1
OM8s27	0	0	0	0
OM8s28	146	5	3	1
OM8s29	5	0	0	0
OM8s30	0	0	0	0
OM8s31	0	0	0	0
OM8s32	0	0	0	0
OM8s34	0	0	0	0


In [56]:
#What are the s. oneidensis relative abundances?
for s in sampIDs:
    print(s)
    for otu in relAbundances[s]:
        if 'oneidensis' in taxDict[otu]:
            print(otu + '\t' + str(relAbundances[s][otu]))
    print('\n')

OM8s01
Otu3734	8.67077083153e-05
Otu3	0.0461285008237
Otu73	0.000346830833261
Otu3519	8.67077083153e-05
Otu10925	8.67077083153e-05
Otu55	0.000433538541576
Otu8972	8.67077083153e-05
Otu1	0.0632966270701
Otu183	8.67077083153e-05


OM8s02
Otu13966	2.53665465983e-05
Otu6146	2.53665465983e-05
Otu55	0.00210542336766
Otu12420	2.53665465983e-05
Otu4467	2.53665465983e-05
Otu10397	2.53665465983e-05
Otu12588	2.53665465983e-05
Otu11514	2.53665465983e-05
Otu11425	2.53665465983e-05
Otu73	0.00119222769012
Otu12669	2.53665465983e-05
Otu3	0.172238851403
Otu1	0.241514890163
Otu11056	2.53665465983e-05
Otu4984	2.53665465983e-05
Otu208	5.07330931967e-05
Otu1842	2.53665465983e-05
Otu10479	2.53665465983e-05
Otu2384	2.53665465983e-05
Otu5102	2.53665465983e-05
Otu8132	2.53665465983e-05
Otu3952	2.53665465983e-05
Otu241	7.6099639795e-05
Otu8761	2.53665465983e-05
Otu217	2.53665465983e-05
Otu12485	2.53665465983e-05
Otu183	0.000202932372787


OM8s03
Otu3434	3.24770225066e-05
Otu3506	3.24770225066e-05
Otu55	0.000974

KeyError: 'OM8s33'

NOTES: OK shewanella was divided into a bunch of OTUs, so probably makes sense to combine them for analysis.  This will likely increase the rel. abundance of S. oneidensis and reduce the significant connections in our poisson model.

In [7]:
#Try cluster_fast clustering
subprocess.call([usearchPath, '-cluster_fast', dataDir + '06_denoised.fa', '-id', '0.97', '-centroids', 
                dataDir + '09_otu_clusters.fa', '-uc', dataDir + '09_otu_clusters.uc'], env=env)

0

      Seqs  14693 (14.7k)  
  Clusters  8467  
  Max size  445  
  Avg size  1.7  
  Min size  1  
Singletons  6834, 46.5% of seqs, 80.7% of clusters  
   Max mem  108Mb  
      Time  1.00s  
Throughput  14.7k seqs/sec.  

In [3]:
#Import list of read objects from unoise2 denoised file
denoised = eb.importFasta(dataDir + '06_denoised.fa')

#Import Otu header:[tax probabilities, taxonomy] dictionary from SINTAX output
taxDict = eb.importSintax(dataDir + '07_denoised.sintax')

#Import hits from 97% fast clustering
hits = eb.importClusterFast(dataDir + '09_otu_clusters.uc')

In [6]:
#FIRST TRY TO SEE HOW MUCH SHEWANELLA COLLAPSES BY TAXONOMY
i = 0
for t in taxDict:
    if i == 0:
        print(t)
        print(taxDict[t])
    i += 1

Otu1981;uniq=OM8s04_11466040
['k:Bacteria(1.0000),p:Proteobacteria(0.9100),c:Epsilonproteobacteria(0.9100),o:Campylobacterales(0.9100),f:Campylobacteraceae(0.9100),g:Campylobacter(0.9100),s:gracilis(0.7700)', 'k:Bacteria,p:Proteobacteria,c:Epsilonproteobacteria,o:Campylobacterales,f:Campylobacteraceae,g:Campylobacter']


In [12]:
#Check number of shewanella seqs in 97% otu clusters
i = 0
j = 0
shewSeqs = []
for h in hits:
    if j == 0:
        print(h)
    j += 1
    seqID = h.split(' ')[0]
    if 'Shew' in taxDict[seqID][1]:
        i += 1
print(i)

Otu5546;uniq=OM8s21_4041465 HWI-M04407:1:1103:3790:9494#GATTGAAG/1 orig_bc=GATTGAAG new_bc=GATTGAAG bc_diffs=0 droplet_bc=TAGTCACCTTATGTATTCTA;size=1;
731


In [20]:
shewReads = []
for read in denoised:
    if (read.header.replace('>','') in hits) and ('Shew' in taxDict[read.seq_id][1]):
        shewReads.append(read)
print(len(shewReads))
eb.exportFasta(shewReads, 'wtf.fa')

731
