In [1]:
import os
import sys
import subprocess
import dropletBarcoder as db

In [2]:
env = os.environ
#usearchPath = '/home/ubuntu/bin/usearch8'
pearPath = '/usr/local/bin/pear'
usearchPath = '/home/ubuntu/users/sjspence/tools/usearch9.2.64_i86linux32'
sinaPath = '/home/ubuntu/bin/sina-1.2.11/sina'
fasttreePath = '/home/ubuntu/bin/FastTree_dd'

In [None]:
#Join paired-end reads
subprocess.call([pearPath, '-f', '170214Alm_D17-2046_1_sequence.fastq', 
                 '-r', '170214Alm_D17-2046_2_sequence.fastq', '-o', '01_pear'])

In [16]:
#Break up file into pieces that usearch can use (5 million lines each)
#Run this to completion before running next section
inFile = open('01_pear.assembled.fastq', 'r')
if not os.path.exists('02_pearSplits/'):
    os.makedirs('02_pearSplits/')
i = 0
j = 1
partFile = open('02_pearSplits/pear_' + str(j) + '.fastq', 'w')
for line in inFile:
    if i >= j*5000000:
        partFile.close()
        j += 1
        partFile = open('02_pearSplits/pear_' + str(j) + '.fastq', 'w')
    partFile.write(line)
    i += 1
partFile.close()
inFile.close()
print(i)

71812424


In [23]:
#Quality filter with usearch 9 max-error rate
def qualFilter(inFile, outFile):
    subprocess.call([usearchPath, "-fastq_filter", inFile, "-fastq_minlen", '100', '-fastq_maxee_rate', '0.01',
                     "-fastqout", outFile], env=env)
for filename in os.listdir('02_pearSplits/'):
    qualFilter('02_pearSplits/' + filename, '02_pearSplits/' + filename.replace('.fastq','filt.fastq'))

In [25]:
#Join quality-filtered files back into a single file for processing
with open('02_pear_filt.fastq', 'w') as outfile:
    for fname in os.listdir('02_pearSplits/'):
        if 'filt' in fname:
            with open('02_pearSplits/' + fname, 'r') as infile:
                for line in infile:
                    outfile.write(line)
            infile.close()
outfile.close()

In [5]:
#Import and edit mapping file
sampIDs = []
mapping = {}
readCounts = {}
with open('OM8_map.txt', 'r') as inFile:
    for line in inFile:
        if '#' not in line:
            line = line.strip().split('\t')
            mapping[line[1]] = line[0].replace('_','s')
            readCounts[line[1]] = 0
            sampIDs.append(line[0].replace('_','s'))
inFile.close()

In [38]:
#Demultiplex: check for barcodes and relabel sequences
#Use mapping file to keep barcoded sequences, prepare fasta file
with open('02_pear_filt.fastq', 'r') as inFile:
    with open('03_pear_filt.fasta', 'w') as outFile:
        i = 0
        j = 0
        nextSeq = False
        for line in inFile:
            if nextSeq:
                outFile.write(line)
                nextSeq = False
            if i%4 == 0:
                for bc in mapping:
                    if bc in line:
                        readCounts[bc] += 1
                        newLine = line.strip().replace('@','>' + mapping[bc] + '_' + str(j) + ' ')
                        newLine = newLine + ' orig_bc=' + bc + ' new_bc=' + bc + ' bc_diffs=0\n'
                        outFile.write(newLine)
                        nextSeq = True
                        j += 1
            i += 1
inFile.close()
outFile.close()
#Summarize read mapping after quality filtering and zero-error barcode matching
total = 0
summaryFile = open('03_quality_summary.txt', 'w')
for s in sampIDs:
    for bc in mapping:
        if mapping[bc] == s:
            summaryFile.write(s + '\t' + str(readCounts[bc]) + '\n')
            total += readCounts[bc]
summaryFile.write('Total\t' + str(total))
summaryFile.close()

In [3]:
#Primer check and removal, placing droplet barcode into header
#NOTE: this takes a while
qualReads = db.importFasta('03_pear_filt.fasta')
noPrimerReads = db.filtBarcodePrimers(qualReads, 20, 'GATCATGACCCATTTGGAGAAGATG', 'GGACTACHVGGGTWTCTAAT')
db.exportFasta(noPrimerReads, '04_pear_noPrimers.fasta')
print(len(qualReads))
print(len(noPrimerReads))
print(noPrimerReads[0].header)
print(noPrimerReads[0].seq)

In [5]:
#Collapse identical reads and maintain the provenance to reduce the uclust file size
#uniqueDict maps a unique sequence to a list of read objects which contain it
#NOTE: takes a long time, but rerun after notebook closes out
noPrimerReads = db.importFasta('04_pear_noPrimers.fasta')
uniqueDict = db.getUniqueSeqs(noPrimerReads, '05_unique_seqs.fasta')

In [3]:
#Use the usearch unoise algorithm to create zero radius OTUs (zOTUs), while also discarding chimeras, phiX sequences,
#and low complexity DNA
subprocess.call([usearchPath, '-unoise2', '05_unique_seqs.fasta', '-fastaout', '06_denoised.fa',
                 '-otudbout', '06_db.fa', '-minampsize', '1'], env=env)

0

**Unoise output**  


In [6]:
#Create a dictionary mapping OTU ids to all the headers from unique sequences that map within the OTU
otuToHeaders = db.uniqueSeqsToOTU('06_combined_otus.up')
print(len(otuToHeaders.keys()))
print(otuToHeaders['OTU2'][10])

4446
OM8s01_171 HWI-M04407:1:2105:25283:4847#ACTTATTG/1 orig_bc=ACTTATTG new_bc=ACTTATTG bc_diffs=0 droplet_bc=ATACTCCGTAATGCGAGATC;size=42513;


In [10]:
#Write out a file that assigns all non-chimeric read headers to an OTU in a tab-delimited file
unique = db.importFasta('05_unique_seqs.fasta')
headerDict = {}
for u in unique:
    headerDict[u.header.replace('>', '')] = u.seq
with open('07_all_seq_otus.txt', 'w') as f:
    for otu in otuToHeaders:                  #otus mapped to unique seq headers
        for header in otuToHeaders[otu]:      #list of unique seq headers for each otu
            seq = headerDict[header]          #get sequence associated with unique seq header
            for read in uniqueDict[seq]:      #list of reads associated with a unique sequence
                f.write(read.header + '\t' + otu + '\n')
f.close()

**Assign taxonomy with method=mothur on qiime**

* Copy to qiime node, run assign taxonomy with a kmer-based algorithm (RDP or mothur naive Bayes)
* RDP needed a taxonomy unavailable from silva, so tried the mothur implementation.

$ assign_taxonomy.py -i 06_combined_otus.fa -o 06_taxonomy -m mothur -r HOMD_16S_rRNA_RefSeq_V14.5.p9_spike.fasta -t HOMD_16S_rRNA_RefSeq_V14.5.mothur_spike.taxonomy

In [3]:
#Within each sample, group by barcode; quantify unique barcode pairings
#Need a structure like [sampID1, sampID2, etc.]
#Then each sampID maps to a dictionary of droplet barcodes:[otu1, otu2]
barcodeSamples = {}
with open('07_all_seq_otus.txt', 'r') as f:
    for line in f:
        line = line.strip().split('\t')
        samp = line[0].split('_')[0].replace('>','')
        bc = line[0].split('droplet_bc=')[1]
        otu = line[1]
        if samp not in barcodeSamples:
            barcodeSamples[samp] = {bc:[otu]}
        else:
            if bc not in barcodeSamples[samp]:
                barcodeSamples[samp][bc] = [otu]
            else:
                barcodeSamples[samp][bc].append(otu)

In [6]:
#Quantify richness of droplet barcoding
for s in sampIDs:
    if s in barcodeSamples:
        singletons = 0
        amplicons = 0
        multiples = 0
        for bc in barcodeSamples[s]:
            if len(barcodeSamples[s][bc]) == 1:
                singletons += 1
            else:
                uniqueOTUs = set(barcodeSamples[s][bc])
                if len(uniqueOTUs) == 1:
                    amplicons += 1
                else:
                    multiples += 1
        print(s + ':\t' + str(singletons) + '\t' + str(amplicons) + '\t' + str(multiples))

OM8s01:	14225	14089	11449
OM8s02:	61269	18427	9144
OM8s03:	55942	14543	6040
OM8s04:	65835	14171	10344
OM8s05:	36774	10579	9786
OM8s06:	32450	6674	6888
OM8s07:	48448	5530	636
OM8s08:	7369	7106	4108
OM8s09:	17597	13735	2518
OM8s10:	23643	2126	525
OM8s11:	57654	5574	1741
OM8s12:	33001	3152	1063
OM8s13:	23704	4304	509
OM8s14:	20613	19942	656
OM8s15:	3941	2118	374
OM8s16:	8152	4049	587
OM8s17:	58941	19035	1355
OM8s18:	23029	8694	523
OM8s19:	38	7	2
OM8s20:	167	13	5
OM8s21:	160342	68810	42769
OM8s22:	71633	10378	10808
OM8s23:	73156	19359	16699
OM8s24:	50394	11590	11639
OM8s25:	127815	57944	7346
OM8s26:	76834	27011	9098
OM8s27:	6	3	0
OM8s28:	16956	13859	2313
OM8s29:	92	78	2
OM8s30:	44	34	0
OM8s31:	5	5	0
OM8s32:	2	0	0
OM8s34:	2	0	0


In [9]:
#Quantify unique pairings
pairDicts = {}
for s in sampIDs:
    if s in barcodeSamples:
        uniquePairs = {}
        for bc in barcodeSamples[s]:
            if len(barcodeSamples[s][bc]) != 1:
                uniqueOTUs = set(barcodeSamples[s][bc])
                if len(uniqueOTUs) != 1:
                    pairString = '_'.join(list(uniqueOTUs))
                    if pairString not in uniquePairs:
                        uniquePairs[pairString] = 1
                    else:
                        uniquePairs[pairString] += 1
        pairDicts[s] = uniquePairs

In [13]:
#Import taxonomic information
taxDict = db.importTaxonomy('06_taxonomy/06_combined_otus_tax_assignments.txt', 'mothur')

In [21]:
for p in pairDicts['OM8s14']:
    controlPair = False
    allOTUs = p.split('_')
    taxP = []
    for o in allOTUs:
        if ('coli' in taxDict[o][0]) or ('oneidensis' in taxDict[o][0]) or ('subtilis' in taxDict[o][0]):
            taxonomy = taxDict[o][0]
            genus = taxonomy.split(';')[5]
            species = taxonomy.split(';')[6]
            taxP.append(genus + ';' + species)
            controlPair = True
    if controlPair:
        taxP = '_'.join(taxP)
        print(taxP + ': ' + str(pairDicts['OM8s14'][p]))

Escherichia;coli_Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 2
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 2
Escherichia;coli_Shewanella;oneidensis: 83
Shewanella;oneidensis_Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis: 70
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Shewanella;oneidensis: 1
Shewanella;oneidensis_Escherichia;coli_Shewanella;on

In [7]:
#OTU1 is s. onei (#OTU2009, OTU2008)
#OTU60 is b. sub...is it anywhere?
#OTU1512 is definitely e. coli
#OTU2009, OTU1865, OTU3394, OTU2732, OTU989 may be e. coli

1

In [25]:
for p in pairDicts['OM8s01']:
    print(p + ': ' + str(pairDicts['OM8s01'][p]))

OTU1300_OTU5_OTU2_OTU671_OTU1_OTU10_OTU45: 1
OTU14_OTU2_OTU8: 2
OTU5_OTU2_OTU1_OTU73_OTU70_OTU10_OTU11: 1
OTU70_OTU5_OTU8_OTU381: 1
OTU14_OTU2_OTU1: 14
OTU24_OTU5_OTU2_OTU1_OTU70_OTU381: 1
OTU70_OTU5_OTU1940_OTU381_OTU1: 1
OTU5_OTU2_OTU11_OTU1: 2
OTU2_OTU984_OTU5_OTU10_OTU1: 1
OTU14_OTU2_OTU10: 21
OTU38_OTU126_OTU5_OTU2_OTU1_OTU73_OTU10: 1
OTU5_OTU38_OTU1: 1
OTU2_OTU16_OTU10_OTU1: 8
OTU14_OTU2_OTU17_OTU10_OTU671: 1
OTU126_OTU5_OTU2_OTU33: 1
OTU24_OTU5_OTU29_OTU1: 1
OTU392_OTU2_OTU11: 1
OTU126_OTU10_OTU1: 1
OTU38_OTU56_OTU36_OTU126_OTU5_OTU123_OTU70_OTU1_OTU14_OTU73_OTU16_OTU17_OTU10_OTU11_OTU2_OTU45_OTU19_OTU6: 1
OTU58_OTU70_OTU5_OTU2_OTU1: 1
OTU14_OTU73_OTU16: 2
OTU14_OTU73_OTU10: 22
OTU2_OTU2245: 1
OTU5_OTU70_OTU17_OTU2_OTU381: 1
OTU5_OTU2_OTU33_OTU1_OTU17_OTU10: 1
OTU34_OTU2_OTU16: 1
OTU2_OTU11: 13
OTU73_OTU10: 7
OTU984_OTU70_OTU5_OTU1940_OTU381: 1
OTU5_OTU123_OTU28: 1
OTU126_OTU4_OTU5_OTU2: 1
OTU2_OTU63_OTU5_OTU10_OTU1: 2
OTU5_OTU36_OTU722_OTU2_OTU1_OTU70_OTU10_OTU56: 1
OTU45_OTU10