In [2]:
import os
import sys
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio import Phylo
import numpy as np
import pandas as pd
import seaborn as sns
import pysam
import math
from Bio import Entrez
import pickle
mpl.rcParams['pdf.fonttype'] = 42
plt.rcParams['svg.fonttype'] = 'none'
%matplotlib inline

In [3]:
####
#USER INPUT
unmappedDir = '12_bwa_unmapped/'
blastDir = '13_bwa_blast/'
prokkaDir = '09_prokka_biopython/'
annoDir = '14_bwa_annotations'
subDir = 'subA/'
mappedID = 'D17-102065'

In [4]:
def checkSlash(directory):
    if directory[-1] != '/':
        directory = directory + '/'
    return directory

unmappedDir = checkSlash(unmappedDir)
blastDir = checkSlash(blastDir)
prokkaDir = checkSlash(prokkaDir)
annoDir = checkSlash(annoDir)
subDir = checkSlash(subDir)
gbDir = blastDir + 'gbRefs/'
if not os.path.exists(blastDir):
    os.makedirs(blastDir)
if not os.path.exists(annoDir):
    os.makedirs(annoDir)
if not os.path.exists(gbDir):
    os.makedirs(gbDir)

In [4]:
#Returns pandas dataframe of variable read mappings for a single sample of source reads
#Input = mappedID: source reads for mapping are from this sample
#        unmapped: directory of unmapped results
#Columns = samples in subgroup, plus 'sequence_1' and 'sequence_2' for the fwd & rev sequences of the read
#Rows = Read IDs
#Data = one if even one read mapped, zero if both paired ends are unmapped
def binaryMapping(mappedID, unmapped):
    unmappedDict = {}
    unmappedSeqs = {}
    for f in os.listdir(unmapped):
        if 'mapped_' + mappedID in f:
            contig = f.split('_')[1]
            unmappedDict[contig] = {}
            inFile = open(unmapped + f, 'r')
            for line in inFile:
                line = line.strip().split('\t')
                readID = line[0]
                flag = int(line[1])
                seq = line[9]
                if (flag & 64) and (flag & 4):
                    unmappedSeqs[readID + '__1'] = seq
                    unmappedDict[contig][readID + '__1'] = 10
                elif (flag & 128) and (flag & 4):
                    unmappedSeqs[readID + '__2'] = seq
                    unmappedDict[contig][readID + '__2'] = 10
            inFile.close()
    df = pd.DataFrame.from_dict(unmappedDict)
    df = df.fillna(value=1)             #1 if mapped
    df = df.replace(10, 0)              #0 if unmapped
    ####
    #Remove reads that are unmapped (e.g. zeros) in all samples
    df = df.loc[(df != 0).any(axis = 1)]
    #Remove reads that are mapped (e.g. ones) in all samples
    df = df.loc[(df.sum(axis=1) != len(df.columns))]
    df['sequence'] = ''
    for i in list(df.index):
        df['sequence'][i] = unmappedSeqs[i]
    return df

In [39]:
####
#For each mapped sample in a subgroup,
#import dataframe with reads as indexes and samples as columns.
#1's indicate that at least one PE read mapped, 0's mean both paired ends were unmapped
outSeqDict = {}
for f in os.listdir(unmappedDir + subDir):
    if 'mapped_' + mappedID in f:
        mappedSamp = f.split('_')[1]
    else:
        continue
    binaryFile = blastDir + subDir + 'mapped_' + mappedSamp + '_binary.txt'
    if os.path.exists(binaryFile):
        df = pd.DataFrame.from_csv(binaryFile, sep = '\t')
    else:
        df = binaryMapping(mappedSamp, unmappedDir + subDir)
        df.to_csv(binaryFile, sep = '\t')
    for i in list(df.index):
        if i not in outSeqDict:
            outSeqDict[i] = df['sequence'][i]

####
#Export reads of interest for BLAST analysis
outFile = open(blastDir + subDir + 'differentially_mapped_reads.fa', 'w')
for i in outSeqDict:
    outFile.write('>' + i + '\n')
    outFile.write(outSeqDict[i] + '\n')
outFile.close()

# Read matches to nt database

$ update_blastdb.pl --passive --timeout 300 --force --verbose nr &> nr.updatedb.log

In [34]:
def runBlast(inFile, threads):
    if not os.path.exists(inFile.replace('.fa', '.out')):
        os.system('blastn -task blastn -db nt_blast/nt -query ' + inFile + ' -out ' + \
                  inFile.replace('.fa', '.out') + ' -num_threads ' + str(threads) + \
                 ' -outfmt 6 -max_target_seqs 1')

def bestBlast(blastFile, outFile):
    f = open(blastFile, 'r')
    o = open(outFile, 'w')
    reads = set()
    data = {}
    for line in f:
        lineList = line.strip().split('\t')
        readID = lineList[0]
        if readID in reads: #Keep only top BLAST hit
            continue
        reads.add(readID)
        o.write(line)
    f.close()
    o.close()

#Write file to download genbank references
#Check for existing genbank files in a shared directory and skip downloading those
def getAccessions(inFile, gbDir, outFile):
    gbDatabase = set()
    for i in os.listdir(gbDir):
        gbDatabase.add(i.replace('.gb', ''))
    f = open(inFile, 'r')
    giSet = set()
    for line in f:
        line = line.strip().split('\t')
        gi = line[1].split('|')[1]
        gb = line[1].split('|')[3].split('.')[0]
        if gb not in gbDatabase:
            giSet.add(gi)
    f.close()
    o = open(outFile, 'w')
    for gi in giSet:
        o.write(gi + '\n')
    o.close()
    
def divideBlastFile(inFile, chunkDir, chunks):
    inHandle = inFile.replace('.out', '')
    f = open(inFile, 'r')
    chunks = int(chunks)
    i = 0
    j = 0
    outF = open(chunkDir + str(j) + '.out', 'w')
    for line in f:
        if (i != 0) and (i % chunks == 0):
            outF.close()
            j +=1
            outF = open(chunkDir + str(j) + '.out', 'w')
        outF.write(line)
        i += 1
    f.close()
    outF.close()

In [35]:
####
#BLAST sequences of interest against the nt database
diffFile = blastDir + subDir + 'differentially_mapped_reads.fa'
runBlast(diffFile, 30)

In [23]:
####
#Write the best BLAST matches to a file
blastFile = blastDir + subDir + 'differentially_mapped_reads.out'
outFile = blastDir + subDir + 'differentially_mapped_best.out'
bestBlast(blastFile, outFile)

In [35]:
####
#Export BLAST GI numbers to a separate file for genbank download
blastFile = blastDir + subDir + 'differentially_mapped_best.out'
accessionFile = blastDir + subDir + 'differentially_mapped_best_accessions.txt'
getAccessions(blastFile, gbDir, accessionFile)

In [26]:
####
#Download genbank files for all gi numbers recovered
#Consider running this in the background
commandString = 'python scripts/getGenbank.py -i ' + accessionFile + ' -d nuccore -e sjspen@gmail.com -o ' + gbDir
print('Run this in the background:\n')
print(commandString)
#os.system(commandString)

#NOTE: may need to check resulting files for any that have a locus different than accession/version,
#didn't know how to fix other guys code.
#Could probably fix by editing the getGenbank script right when the full string is written - grab accession from there

#$ mv HUMRSALPC.gb M28031.gb
#$ mv HUAC002038.gb AC002038.gb

Run this in the background:

python scripts/getGenbank.py -i 13_bwa_blast/subA/differentially_mapped_best_accessions.txt -d nuccore -e sjspen@gmail.com -o 13_bwa_blast/gbRefs/


0

In [29]:
####
#Divide BLAST file into parallel chunks
inFile = blastDir + subDir + 'differentially_mapped_best.out'
chunkDir = blastDir + subDir + 'blast_chunks/'
if not os.path.exists(chunkDir):
    os.makedirs(chunkDir)
chunks = 1000
divideBlastFile(inFile, chunkDir, chunks)

In [33]:
####
#Run BLAST annotations in parallel on chunked BLAST files
chunkDir = blastDir + subDir + 'blast_chunks/'
chunkDir = checkSlash(chunkDir)
batchFile = blastDir + subDir + 'annotationBatch.sh'
parallel = 16
b = open(batchFile, 'w')
b.write('#!/bin/bash\n#Annotation commands\n\n')
for f in os.listdir(chunkDir):
    f = chunkDir + f
    o = f.replace('.out', '_annotation.txt')
    command = './scripts/gbFeatures.py -i ' + f + ' -g ' + gbDir + ' -o ' + o + '\n'
    b.write(command)
b.close()
os.system('chmod +x ' + batchFile)
print('Run this command in the background:')
print('cat ' + batchFile + ' | parallel -j ' + str(parallel))
#os.system('cat ' + batchFile + ' | parallel -j ' + str(parallel))

Run this command in the background:
cat 13_bwa_blast/subA/annotationBatch.sh | parallel -j 16


In [None]:
#These are the other features that could be recovered in different files!
set(['misc_feature', 'tRNA', 'repeat_region', 'rRNA', 'exon', 'misc_RNA', 'gene'])
set(['misc_feature', 'tRNA', 'sig_peptide', 'rRNA', 'exon', 'mRNA', 'gene', 'ncRNA'])
set(['misc_feature', 'tRNA', 'repeat_region', 'rRNA', 'exon', 'mRNA', 'gene', 'ncRNA'])
set(['misc_feature', 'misc_recomb', 'tRNA', 'rRNA', 'exon', 'gene'])
set(['misc_feature', 'unsure', 'tRNA', 'mRNA', 'misc_RNA', 'gene'])
set(['misc_feature', 'tRNA', 'repeat_region', 'sig_peptide', 'rRNA', 'mRNA', 'misc_RNA', 'gene', 'ncRNA'])
set(['misc_feature', 'tRNA', 'repeat_region', 'sig_peptide', 'rRNA', 'gene', 'ncRNA'])
set(['misc_feature', 'unsure', 'tRNA', 'repeat_region', 'sig_peptide', 'rRNA', 'gene'])
set(['misc_feature', 'tRNA', 'repeat_region', 'rRNA', 'misc_RNA', 'gene', 'ncRNA'])
set(['misc_feature', 'tRNA', 'repeat_region', 'rRNA', 'exon', 'mRNA', 'STS', 'gene'])
set(['misc_feature', 'tRNA', 'repeat_region', 'sig_peptide', 'rRNA', 'assembly_gap', 'misc_RNA', 'gene', 'ncRNA'])
set(['misc_feature', 'tRNA', 'repeat_region', 'rRNA', 'mRNA', 'gene', 'ncRNA'])

In [36]:
####
#Concatenate annotation results
#Includes 'CDS' annotations and 'source' annotations, in case the source is useful (many from FRC)
annoFile = blastDir + subDir + 'differentially_mapped_best_coding.txt'
chunkDir = blastDir + subDir + 'blast_chunks/'
chunkDir = checkSlash(chunkDir)
catString = 'cat '
for f in os.listdir(chunkDir):
    if '_annotation.txt' in f:
        f = chunkDir + f
        catString += f + ' '
catString += ' > ' + annoFile
os.system(catString)

0

In [49]:
def unpackQuals(qualString):
    qualList = qualString.split('__')
    qualDict = {}
    for i in qualList:
        i = i.split('||')
        qualDict[i[0]] = i[1]
    return qualDict

def summarizeHits(binaryFile, annoFile, outFile):
    #Import annotations
    df = pd.DataFrame.from_csv(binaryFile, sep='\t')
    annoDict = {}
    inFile = open(annoFile,'r')
    for line in inFile:
        line = line.strip().split('\t')
        readID = line[0]
        if readID in list(df.index):
            if line[3] == 'CDS':
                if readID in annoDict:
                    print(readID)
                    print('multiple cds')
                annoDict[readID] = unpackQuals(line[7])
    inFile.close()
    annoDf = pd.DataFrame.from_dict(annoDict)
    annoDf = annoDf.T
    #Export sums of annotations on different contigs
    sumDict = {}
    for s in df:
        if s == 'sequence':
            continue
        for i in list(df.index):
            if df[s][i] == 1:
                if s not in sumDict:
                    sumDict[s] = {}
                if i in list(annoDf.index):
                    product = annoDf['product'][i]
                else:
                    product = 'N/A'
                if product not in sumDict[s]:
                    sumDict[s][product] = 0
                sumDict[s][product] += 1
    sumDf = pd.DataFrame.from_dict(sumDict)
    sumDf.to_csv(outFile, sep = '\t')
    return sumDf

In [50]:
####
#Output a summary of CDS hits for each binary mapping in a pandas-compatible text file
binaryFile = blastDir + subDir + 'mapped_D17-102065_binary.txt'
annoFile = blastDir + subDir + 'differentially_mapped_best_coding.txt'
outFile = blastDir + subDir + 'mapped_D17-102065_summary.txt'

sumDf = summarizeHits(binaryFile, annoFile, outFile)
sumDf

NB501288_148_H2MVCBGX2:2:22203:14909:16420#GGACTCCTGGGCTTAG__1
multiple cds


Unnamed: 0,D17-102036,D17-102037,D17-102038,D17-102039,D17-102040,D17-102041,D17-102042,D17-102043,D17-102044,D17-102045,D17-102046,D17-102047,D17-102048,D17-102049,D17-102050,D17-102051,D17-102065
1-pyrroline-5-carboxylate dehydrogenase,,,,,,,,,,,,,,,,,4.0
"2,5-didehydrogluconate reductase",,,,,,,,,,,,,,,,,2.0
2-oxoacid:ferredoxin oxidoreductase subunit beta,,,,,,,,,,,,,,,,,2.0
"4Fe-4S iron sulfur cluster binding s, NifH/frxC family protein",,,,,,,,,,,,,,,,,1.0
6-phosphogluconolactonase,,,,,,,,,,,,,,,,,6.0
ABC transporter ATP-binding protein,,,,,,,,,,,,,,,,,2.0
ACP S-malonyltransferase,,,,,,,,,,,,,,,,,1.0
ATP-dependent Clp protease ATP-binding subunit ClpA,,,,,,,,,,,,,,,,,1.0
ATP-dependent helicase HrpB,,,,,,,,,,,,,,,,,1.0
Acyl-CoA dehydrogenase,,,,,,,,,,,,,,,,,3.0


# Read placements within de novo assemblies

In [51]:
#reads: list of reads to annotate
#alignmentFile: alignment to contigs
#prokkaFile: prokka-annotated contigs
def importAlignment(reads, alignmentFile):
    annoDf = pd.DataFrame(index = reads, columns = ['ref_id', 'ref_pos'])
    sam = pysam.AlignmentFile(alignmentFile, "r")
    for r in sam:
        if r.flag & 4:
            continue
        refName = sam.get_reference_name(r.tid)
        if (r.qname + '__1' in reads) and (r.flag & 64):
            annoDf['ref_id'][r.qname + '__1'] = refName
            annoDf['ref_pos'][r.qname + '__1'] = r.pos
        elif (r.qname + '__2' in reads) and (r.flag & 128):
            annoDf['ref_id'][r.qname + '__2'] = refName
            annoDf['ref_pos'][r.qname + '__2'] = r.pos
    return annoDf

def getAnnotation(prokkaFile, node, pos):
    records = list(SeqIO.parse(prokkaFile, 'genbank'))
    passedMapping = False
    for r in records:
        if r.id == node:
            prevFeature = np.nan
            for f in r.features:
                if f.type == 'source':
                    continue
                start = str(f.location).split(':')[0].replace('[', '')
                end = str(f.location).split(':')[1].split(']')[0]
                strand = str(f.location).split('(')[1].replace(')', '')
                #Look for a coding feature
                if (pos > int(start)) and (pos < int(end)):
                    return {'coding': f}
                #Look for intergenic
                if (pos < int(start)) and (pos < int(end)):
                    if isinstance(prevFeature, float):
                        prevStrand = '+'
                    else:
                        prevStrand = str(prevFeature.location).split('(')[1].replace(')', '')
                    if (prevStrand == '+') and (strand == '+'):
                        return {'downstream1': prevFeature, 'upstream1': f}
                    elif (prevStrand == '-') and (strand == '-'):
                        return {'downstream1': f, 'upstream1': prevFeature}
                    elif (prevStrand == '+') and (strand == '-'):
                        return {'upstream1': prevFeature, 'upstream2': f}
                    else:
                        return {'downstream1': prevFeature, 'downstream2': f}
                #store prev
                prevFeature = f

def summarizeAnnotation(featureDf, i, featureDict, flag):
    if flag in featureDict:
        if isinstance(featureDict[flag], float):
            return featureDf
        quals = featureDict[flag].qualifiers
        if 'gene' in quals:
            featureDf[flag][i] = quals['gene'][0]
        elif 'product' in quals:
            featureDf[flag][i] = quals['product'][0]
        else:
            print('No gene/product:')
            print(quals)
    return featureDf

In [None]:
for f in os.listdir(unmappedDir + subDir):
    if 'mapped_' + mappedID in f:
        contigID = f.split('_')[1]
    else:
        continue
    if os.path.exists(annoDir + mappedID + '_' + contigID + '_annotations.txt'):
        continue
    prokkaFile = prokkaDir + contigID + '.gb'
    annoDf = importAlignment(list(df.index), '11_bwa/subA/contig_' + contigID + '_mapped_' + mappedID + '.sam')
    annoDf['coding'] = np.nan
    annoDf['downstream1'] = np.nan
    annoDf['downstream2'] = np.nan
    annoDf['upstream1'] = np.nan
    annoDf['upstream2'] = np.nan
    for i in list(annoDf.index):
        if isinstance(annoDf['ref_id'][i], str):
            featureDict = getAnnotation(prokkaFile, annoDf['ref_id'][i], annoDf['ref_pos'][i])
        else:
            continue
        if not isinstance(featureDict, dict): #protect from null values, no annotation
            continue
        annoDf = summarizeAnnotation(annoDf, i, featureDict, 'coding')
        annoDf = summarizeAnnotation(annoDf, i, featureDict, 'downstream1')
        annoDf = summarizeAnnotation(annoDf, i, featureDict, 'downstream2')
        annoDf = summarizeAnnotation(annoDf, i, featureDict, 'upstream1')
        annoDf = summarizeAnnotation(annoDf, i, featureDict, 'upstream2')
    annoDf.to_csv(annoDir + mappedID + '_' + contigID + '_annotations.txt', sep='\t')

:'LOCUS       NODE_1_length_318532_cov_30.8598 318532 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_294520_cov_33.7766 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_274993_cov_27.222 274993 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_246239_cov_33.1436 246239 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_234615_cov_30.4856 234615 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_227198_cov_27.0193 227198 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_217956_cov_29.2119 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_206622_cov_33.0935 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_187500_cov_29.2806 187500 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_178877_cov_30.0855 178877 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_174263_cov_26.5403 174263 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_159844_cov_31

:'LOCUS       NODE_41_length_49499_cov_30.8857 49499 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_42_length_49295_cov_31.5507 49295 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_43_length_49237_cov_33.501 49237 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_44_length_47907_cov_29.2796 47907 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_45_length_47324_cov_31.1392 47324 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_46_length_43566_cov_31.9217 43566 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_47_length_39367_cov_30.2928 39367 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_48_length_37854_cov_33.8035 37854 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_49_length_37314_cov_30.9769 37314 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_50_length_35024_cov_32.3461 35024 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_51_length_34808_cov_34.0723 34808 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_52_length_34469_cov_29.7561 34469 bp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
:'LOCUS       NODE_1_length_453946_cov_48.2576 453946 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_415049_cov_49.2747 415049 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_318077_cov_39.4191 318077 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_294520_cov_48.7704 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_291544_cov_42.0339 291544 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_232762_cov_46.1498 232762 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_217956_cov_43.1068 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_206622_cov_48.2224 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_203517_cov_45.7756 203517 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length

:'LOCUS       NODE_28_length_78503_cov_43.6146 78503 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_29_length_76161_cov_46.3782 76161 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_30_length_73199_cov_42.6729 73199 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_31_length_72266_cov_50.5558 72266 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_32_length_65575_cov_44.9022 65575 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_33_length_65481_cov_41.5774 65481 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_34_length_64750_cov_38.8131 64750 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_35_length_63862_cov_49.6423 63862 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_36_length_58575_cov_42.4132 58575 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_37_length_56522_cov_47.0696 56522 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_56417_cov_45.1772 56417 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_54359_cov_47.1963 54359 b

:'LOCUS       NODE_1_length_315936_cov_36.4331 315936 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_294264_cov_42.3093 294264 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_259367_cov_39.0721 259367 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_246826_cov_41.6668 246826 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_237362_cov_40.0505 237362 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_227556_cov_34.2819 227556 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_217956_cov_38.1512 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_206622_cov_41.8376 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_200610_cov_41.3445 200610 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_179825_cov_41.5838 179825 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_167818_cov_37.6233 167818 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_158519_cov_4

:'LOCUS       NODE_29_length_78503_cov_37.8074 78503 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_30_length_76165_cov_40.0258 76165 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_31_length_73220_cov_37.6822 73220 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_32_length_71583_cov_39.5249 71583 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_33_length_69407_cov_39.222 69407 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_34_length_65561_cov_38.4049 65561 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_35_length_65468_cov_36.4276 65468 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_36_length_63862_cov_41.8518 63862 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_37_length_59547_cov_42.0827 59547 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_58188_cov_32.8152 58188 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_56349_cov_40.2253 56349 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_40_length_56296_cov_40.3579 56296 bp

:'LOCUS       NODE_1_length_337341_cov_29.002 337341 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_323556_cov_35.7701 323556 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_315936_cov_30.5997 315936 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_294520_cov_35.2898 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_271483_cov_33.5907 271483 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_209079_cov_33.8721 209079 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_208885_cov_34.1636 208885 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_206622_cov_35.2254 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_199472_cov_32.4883 199472 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_182210_cov_33.3624 182210 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_179714_cov_35.6734 179714 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_174281_cov_31

:'LOCUS       NODE_1_length_294520_cov_34.7801 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_291543_cov_28.8454 291543 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_256364_cov_34.529 256364 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_218474_cov_27.145 218474 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_217957_cov_31.0371 217957 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_208989_cov_33.0808 208989 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_206622_cov_33.7214 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_206070_cov_31.2024 206070 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_203524_cov_32.1565 203524 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_199475_cov_30.8695 199475 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_196308_cov_32.5123 196308 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_182096_cov_32.

:'LOCUS       NODE_37_length_56337_cov_33.125 56337 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_55539_cov_30.7293 55539 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_54267_cov_34.1654 54267 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_40_length_53619_cov_34.9301 53619 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_41_length_53011_cov_23.0675 53011 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_42_length_49699_cov_32.0195 49699 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_43_length_49587_cov_32.1312 49587 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_44_length_48973_cov_35.9684 48973 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_45_length_47120_cov_28.2586 47120 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_46_length_43647_cov_32.8461 43647 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_47_length_39732_cov_33.3885 39732 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_48_length_39365_cov_31.7158 39365 bp

:'LOCUS       NODE_1_length_415054_cov_37.1278 415054 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_339455_cov_36.5328 339455 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_291539_cov_31.7338 291539 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_286123_cov_37.461 286123 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_218511_cov_30.6757 218511 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_217956_cov_33.6832 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_209074_cov_34.8486 209074 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_203029_cov_35.5282 203029 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_199466_cov_34.178 199466 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_180028_cov_36.7664 180028 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_159371_cov_36.5199 159371 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_153749_cov_34.

:'LOCUS       NODE_37_length_64894_cov_29.2382 64894 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_64588_cov_32.5661 64588 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_63862_cov_37.6145 63862 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_40_length_55616_cov_33.1206 55616 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_41_length_55519_cov_34.0865 55519 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_42_length_49550_cov_34.8016 49550 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_43_length_47912_cov_33.7528 47912 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_44_length_47237_cov_36.1021 47237 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_45_length_47018_cov_32.5891 47018 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_46_length_39365_cov_34.3663 39365 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_47_length_38251_cov_37.3259 38251 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_48_length_37312_cov_36.2633 37312 b

:'LOCUS       NODE_1_length_339484_cov_38.6841 339484 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_337275_cov_32.9813 337275 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_316047_cov_34.8034 316047 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_294520_cov_40.8697 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_256394_cov_39.8831 256394 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_252038_cov_36.8852 252038 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_219155_cov_40.285 219155 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_217956_cov_36.5432 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_217607_cov_37.0381 217607 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_209081_cov_37.7848 209081 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_207142_cov_40.127 207142 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_206733_cov_39.

:'LOCUS       NODE_33_length_59519_cov_40.147 59519 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_34_length_56337_cov_40.2365 56337 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_35_length_55616_cov_36.1192 55616 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_36_length_55508_cov_37.7379 55508 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_37_length_54274_cov_39.5412 54274 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_53619_cov_39.5288 53619 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_49922_cov_36.3538 49922 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_40_length_49781_cov_37.8543 49781 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_41_length_49286_cov_40.1545 49286 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_42_length_47143_cov_33.464 47143 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_43_length_46621_cov_39.6365 46621 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_44_length_43592_cov_39.5811 43592 bp 

:'LOCUS       NODE_1_length_415050_cov_38.4749 415050 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_318539_cov_36.6346 318539 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_315936_cov_33.1247 315936 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_294520_cov_38.0828 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_280747_cov_30.8101 280747 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_244104_cov_38.2804 244104 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_217956_cov_35.1242 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_206622_cov_38.1119 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_203524_cov_36.8432 203524 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_199475_cov_35.1062 199475 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_180041_cov_38.5857 180041 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_178867_cov_3

:'LOCUS       NODE_1_length_282643_cov_34.6171 282643 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_244214_cov_29.2389 244214 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_232738_cov_32.3255 232738 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_218420_cov_26.8809 218420 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_217956_cov_29.8922 217956 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_209087_cov_31.8185 209087 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_206622_cov_33.0021 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_187500_cov_30.6653 187500 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_178912_cov_31.0679 178912 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_164849_cov_34.4774 164849 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_159941_cov_32.5824 159941 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_159004_cov_2

:'LOCUS       NODE_33_length_77446_cov_34.54 77446 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_34_length_76771_cov_30.5378 76771 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_35_length_73222_cov_30.4016 73222 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_36_length_71587_cov_30.9641 71587 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_37_length_65556_cov_30.597 65556 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_65498_cov_28.7915 65498 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_64885_cov_25.8652 64885 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_40_length_63862_cov_34.1784 63862 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_41_length_62291_cov_23.6822 62291 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_42_length_59524_cov_33.0123 59524 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_43_length_58572_cov_29.8153 58572 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_44_length_56422_cov_32.0764 56422 bp D

:'LOCUS       NODE_1_length_410436_cov_31.8567 410436 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_2_length_339451_cov_40.2707 339451 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_3_length_315936_cov_33.7714 315936 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_4_length_294520_cov_41.2544 294520 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_5_length_256413_cov_40.8435 256413 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_6_length_244288_cov_40.7438 244288 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_7_length_241607_cov_37.4969 241607 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_8_length_217958_cov_35.0108 217958 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_9_length_209081_cov_37.112 209081 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_10_length_208878_cov_38.6661 208878 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_11_length_206622_cov_39.9794 206622 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_12_length_199460_cov_36

:'LOCUS       NODE_32_length_63862_cov_41.81 63862 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_33_length_59523_cov_37.4874 59523 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_34_length_58571_cov_33.6449 58571 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_35_length_55529_cov_34.9291 55529 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_36_length_54674_cov_34.622 54674 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_37_length_54335_cov_39.3631 54335 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_38_length_49931_cov_35.3923 49931 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_39_length_49593_cov_37.3117 49593 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_40_length_47914_cov_35.2885 47914 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_41_length_39711_cov_39.4545 39711 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_42_length_39367_cov_35.8439 39367 bp DNA BCT linear 07-APR-2017\n'
:'LOCUS       NODE_43_length_37319_cov_36.1255 37319 bp D

In [14]:
def quantifyAnnotations(inFile):
    print(inFile)
    annoDf = pd.DataFrame.from_csv(inFile, sep = '\t')
    print(annoDf['downstream1'].value_counts())
    print('')
    
#Summarize annotations
for f in os.listdir(unmappedDir + subDir):
    if 'mapped_' + mappedID in f:
        contigID = f.split('_')[1]
    else:
        continue
    inFile = annoDir + mappedID + '_' + contigID + '_annotations.txt'
    quantifyAnnotations(inFile)

14_bwa_annotations/D17-102065_D17-102037_annotations.txt
hypothetical protein    91
tRNA-Lys                 5
tRNA-Leu                 4
tRNA-Ala                 4
betB_2                   3
tRNA-Gly                 3
23S ribosomal RNA        2
dmlR_22                  2
rhsC_5                   1
pobB                     1
hcpA_3                   1
Name: downstream1, dtype: int64

14_bwa_annotations/D17-102065_D17-102047_annotations.txt
phoB_2                  18
tRNA-Lys                 7
tRNA-Leu                 4
tRNA-Gly                 4
tRNA-Ala                 4
betB_1                   3
hypothetical protein     3
23S ribosomal RNA        2
dmlR_23                  2
rhsC_7                   1
pobB                     1
hcpA_2                   1
Name: downstream1, dtype: int64

14_bwa_annotations/D17-102065_D17-102044_annotations.txt
hypothetical protein    97
phoB_2                  18
tRNA-Lys                 5
tRNA-Leu                 4
tRNA-Gly                 3
dmlR_11

14_bwa_annotations/D17-102065_D17-102037_annotations.txt
hypothetical protein    91
tRNA-Lys                 5
tRNA-Leu                 4
tRNA-Ala                 4
betB_2                   3
tRNA-Gly                 3
23S ribosomal RNA        2
dmlR_22                  2
rhsC_5                   1
pobB                     1
hcpA_3                   1
Name: downstream1, dtype: int64

14_bwa_annotations/D17-102065_D17-102047_annotations.txt
phoB_2                  18
tRNA-Lys                 7
tRNA-Leu                 4
tRNA-Gly                 4
tRNA-Ala                 4
betB_1                   3
hypothetical protein     3
23S ribosomal RNA        2
dmlR_23                  2
rhsC_7                   1
pobB                     1
hcpA_2                   1
Name: downstream1, dtype: int64

14_bwa_annotations/D17-102065_D17-102044_annotations.txt
hypothetical protein    97
phoB_2                  18
tRNA-Lys                 5
tRNA-Leu                 4
tRNA-Gly                 3
dmlR_11

# Discarded code

In [None]:
split = 1000
diffFile = blastDir + subDir + 'differentially_mapped_reads.fa'
j = 0
outFileName = diffFile.replace('.fa', '_' + j + '.fa')
with open(diffFile, 'r') as f:
    i = 0
    for line in f:
        if (i != 0) and (i % 1000 == 0):
            outFile.close()
            outFileName = outFileName.replace('_' + str(j) + '.fa', '_' + str(j+1) + '.fa')
            outFile = open(outFileName, 'w')
            j += 1
        
        i += 1

####
#Efficiently store features by location
for inFile in os.listdir(gbDir):
    df = pd.DataFrame(columns = ['end', 'strand', 'type', 'qualifiers'])
    records = list(SeqIO.parse(inFile, 'genbank'))
    for r in records:
        if f.type == 'source':
            continue
        if f.type == 'CDS':
            
        else:
            print(f.type)
        
        prevFeature = np.nan
        for f in r.features:
            if f.type == 'source':
                continue
            start = str(f.location).split(':')[0].replace('[', '')
            end = str(f.location).split(':')[1].split(']')[0]
            strand = str(f.location).split('(')[1].replace(')', '')

####
#Summarize blast hits by extracting features from downloaded genbank files
#Export to tab-delimited file matching readIDs to coding/upstream/downstream gene names
blastFile = blastDir + subDir + 'differentially_mapped_reads.out'
outFile = blastDir + subDir + 'differentially_mapped_reads.txt'
commandString = './scripts/gbFeatures.py -i ' + blastFile + ' -g ' + gbDir + ' -o ' + outFile
print('Run this command in the background:\n')
print(commandString)