In [1]:
import pandas as pd
import fetch
import map_ids
from itertools import takewhile
import sys

In [18]:
def update_taxids(merged, accessions):
    """Replace outdated taxonomy IDs in ``accessions`` by their successors.

    Parameters
    ----------
    merged : mapping
        Looked up with the ``int()`` form of each stored taxonomy ID; the
        value found is the replacement ID.
    accessions : dict
        Maps an accession to a taxonomy ID.  Every value must be accepted by
        ``int()``.  The dict is modified in place.

    Returns
    -------
    dict
        The very same ``accessions`` object.  Replaced values are stored as
        ``str``; values without an entry in ``merged`` are left untouched.
    """
    for accession, current in accessions.items():
        old_taxid = int(current)
        if old_taxid in merged:
            # only values are overwritten, so iterating over items() is safe
            accessions[accession] = str(merged[old_taxid])
    return accessions


def map_gg_otumap_ids2taxids(gg_otumap_orig, gg_ids2taxids, gg_ids_accessions):
    """Translate an OTU map of GreenGenes IDs into sets of taxonomy IDs.

    Parameters
    ----------
    gg_otumap_orig : mapping
        Maps an OTU ID to an iterable of further GreenGenes IDs.
    gg_ids2taxids, gg_ids_accessions
        Passed through unchanged to ``map_gg_id2taxid``.

    Returns
    -------
    dict
        For every key of ``gg_otumap_orig``, the set of taxonomy IDs obtained
        for the key itself and for all IDs listed under it.
    """
    otu2taxids = {}
    for otu_id in gg_otumap_orig:
        # resolve the OTU's own ID first, then every ID listed for it
        taxids = {map_gg_id2taxid(otu_id, gg_ids2taxids, gg_ids_accessions)}
        taxids.update(
            map_gg_id2taxid(member_id, gg_ids2taxids, gg_ids_accessions)
            for member_id in gg_otumap_orig[otu_id])
        otu2taxids[otu_id] = taxids
    return otu2taxids


def map_gg_id2taxid(gg_id, gg_ids2taxids, gg_ids_accessions):
    """Map a single GreenGenes ID to an NCBI taxonomy ID.

    Parameters
    ----------
    gg_id : int or str
        GreenGenes ID; it is converted with ``str()`` before the lookup.
    gg_ids2taxids : mapping
        Maps an accession to a taxonomy ID (anything accepted by ``int()``).
    gg_ids_accessions : mapping
        Maps a database type to a mapping from GreenGenes ID (``str``) to
        accession.  Database types are searched in iteration order and the
        first hit wins.

    Returns
    -------
    int
        The taxonomy ID of the accession found for ``gg_id``.

    Raises
    ------
    KeyError
        If ``gg_id`` is not listed for any database type, or if its accession
        is missing from ``gg_ids2taxids``.
    """
    gg_key = str(gg_id)

    # step 1: map GreenGenes ID to a Genbank or IMG accession
    for db_type in gg_ids_accessions:
        if gg_key in gg_ids_accessions[db_type]:
            accession = gg_ids_accessions[db_type][gg_key]
            break
    else:
        # Without this guard the lookup below would fail with the
        # uninformative "KeyError: None".
        raise KeyError('GreenGenes ID %s has no accession in any database'
                       % gg_key)

    # step 2: map the accession to a NCBI taxonomy ID
    return int(gg_ids2taxids[accession])



def mp_mp_clades2lca(mp_markers, mp_ids2taxids, nodes):
    """Summarise the taxonomy IDs behind each clade's markers.

    Parameters
    ----------
    mp_markers : mapping
        Maps a clade to a mapping from database type to an iterable of
        marker IDs.
    mp_ids2taxids : mapping
        Passed to ``map_mp_id2taxid`` to translate a marker ID.
    nodes : mapping
        Taxonomy handed to ``get_lineage``.

    Returns
    -------
    dict
        Maps every clade to ``{'lca': ..., 'suffixes': ...}`` where ``'lca'``
        is the result of ``get_lca`` over all non-empty lineages and
        ``'suffixes'`` is the set of taxonomy IDs that follow the first
        ``len(lca)`` entries in any of those lineages.
    """
    summary = {}

    for clade in mp_markers:
        # step 1: translate all marker IDs of the clade to taxonomy IDs
        taxids = {map_mp_id2taxid(mp_id, mp_ids2taxids)
                  for db_type in mp_markers[clade]
                  for mp_id in mp_markers[clade][db_type]}

        # step 2: full lineage per taxonomy ID; unresolved ones come back as []
        candidates = (get_lineage(int(taxid), nodes) for taxid in taxids)
        lineages = [lineage for lineage in candidates if lineage != []]

        # step 3: shared path plus everything found below it
        lca = get_lca(lineages)
        depth = len(lca)
        suffixes = {taxid for lineage in lineages for taxid in lineage[depth:]}

        summary[clade] = {'lca': lca, 'suffixes': suffixes}

    return summary


def map_mp_id2taxid(mp_id, mp_ids2taxids):
    """Look up ``mp_id`` in ``mp_ids2taxids`` and return the stored value.

    The value is returned as stored (no type conversion); a missing ID raises
    whatever ``mp_ids2taxids`` raises for an unknown key.
    """
    taxid = mp_ids2taxids[mp_id]
    return taxid

def read_mp_output(filename):
    """Read a tab-separated abundance profile and keep only its deepest entries.

    The first line is skipped as a header.  Every following non-blank line
    must consist of a lineage string and an abundance separated by one tab.
    When a new lineage string contains an already stored one as a substring,
    that stored entry (at most one per line) is dropped, so that only the
    most specific lineage strings remain.

    Parameters
    ----------
    filename : str
        Path of the profile file.

    Returns
    -------
    dict or None
        Maps lineage string to abundance (``float``).  ``None`` is returned,
        after printing ``'Cannot read file'``, when the file cannot be read.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields or the
        abundance is not a number.
    """
    tree = {}
    try:
        # `with` closes the file even when a malformed line raises
        with open(filename, 'r') as handle:
            handle.readline()  # header
            for line in handle:
                line = line.rstrip()
                if not line:
                    # tolerate blank lines, e.g. at the end of the file
                    continue
                linStr, abundance = line.split('\t')
                # locate the entry to drop first, then delete it outside of
                # the iteration over the dict
                parent = next((t for t in tree if t in linStr), None)
                if parent is not None:
                    del tree[parent]
                tree[linStr] = float(abundance)
    except IOError:
        print('Cannot read file')
        return None
    return tree


def parse_ncbi_merged(filename):
    """Parse a file of ``old_id<TAB>|<TAB>new_id<TAB>|`` lines into a dict.

    Lines are split on ``'\\t|'``; the first two fields are converted with
    ``int()``.  Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path of the file (used in this notebook for NCBI's ``merged.dmp``).

    Returns
    -------
    dict or None
        Maps old taxonomy ID (``int``) to new taxonomy ID (``int``).
        ``None`` is returned, after printing ``'Cannot read file'``, when the
        file cannot be read.
    """
    merged = {}
    try:
        # `with` closes the file even when a malformed line raises
        with open(filename, 'r') as handle:
            for line in handle:
                line = line.rstrip()
                if not line:
                    continue
                fields = line.split('\t|')
                # int() ignores the whitespace left around the numbers
                merged[int(fields[0])] = int(fields[1])
    except IOError:
        print('Cannot read file')
        return None
    return merged

    
def parse_ncbi_nodes(filename, abort_after_lines=20):
    """Parse a ``'\\t|\\t'``-separated nodes file into a child -> parent dict.

    Only the first two fields of each line are used; both are converted with
    ``int()``.  Blank lines are ignored but still count as lines read.

    Parameters
    ----------
    filename : str
        Path of the file (used in this notebook for NCBI's ``nodes.dmp``).
    abort_after_lines : int or None
        Stop after this many lines have been read; at least one line is
        always read.  Pass ``None`` to read the whole file.  Note that the
        default of 20 yields only a small excerpt.

    Returns
    -------
    dict or None
        Maps the first field (``int``) to the second field (``int``).
        ``None`` is returned, after printing ``'Cannot read file'``, when the
        file cannot be read.
    """
    nodes = {}
    try:
        # `with` closes the file even when a malformed line raises
        with open(filename, 'r') as handle:
            for readlines, line in enumerate(handle, start=1):
                if line.strip():
                    fields = line.split('\t|\t')
                    nodes[int(fields[0])] = int(fields[1])

                if (abort_after_lines is not None) and \
                   (readlines >= abort_after_lines):
                    break
    except IOError:
        print('Cannot read file')
        return None
    return nodes

def get_lineage(taxid, taxonomy):
    """Return the path from the root of ``taxonomy`` down to ``taxid``.

    Parameters
    ----------
    taxid : hashable
        Node to start from.
    taxonomy : mapping
        Maps each node to its parent; the root maps to itself.

    Returns
    -------
    list
        Nodes ordered from the root to ``taxid`` (both included).  An empty
        list is returned, and a message is written to stderr, when ``taxid``
        or any of its ancestors is missing from ``taxonomy`` (for example
        because the taxonomy was only partially loaded).
    """
    if taxid not in taxonomy:
        print('%s not in taxonomy' % taxid, file=sys.stderr)
        return []

    lineage = [taxid]
    while True:
        parent = taxonomy[lineage[-1]]
        if parent == lineage[-1]:
            # the root is its own parent
            break
        if parent not in taxonomy:
            # treat a dangling ancestor like an unknown taxid instead of
            # failing with a bare KeyError in the middle of the walk
            print('%s not in taxonomy' % parent, file=sys.stderr)
            return []
        lineage.append(parent)
    return list(reversed(lineage))

def get_lca(lineages):
    """Return the longest common prefix of the given lineages.

    The lineages are compared position by position; the result contains the
    entries up to (not including) the first position at which they differ or
    at which the shortest lineage ends.  Its last element is therefore the
    lowest common ancestor.  An empty input yields an empty list.
    """
    shared = []
    for column in zip(*lineages):
        if len(set(column)) != 1:
            break
        shared.append(column[0])
    return shared


In [15]:
# Base directory of all reference files used below; adjust for the local
# machine instead of editing every single path.
GG_DIR = '/home/sjanssen/GreenGenes'

# NCBI Taxonomy
# abort_after_lines=None: read the complete file instead of the default excerpt
nodes = parse_ncbi_nodes(GG_DIR + '/NCBItaxonomy/nodes.dmp', abort_after_lines=None)
merged = parse_ncbi_merged(GG_DIR + '/NCBItaxonomy/merged.dmp')


# Green Genes
# accession -> taxonomy ID, accumulated over the Genbank and IMG files
gg_ids2taxids = {}
for taxid_file in ('final_taxid_gg_Genbank.txt',
                   'final_taxid_gg_IMG.txt'):
    gg_ids2taxids = fetch.read_accesion_taxids(GG_DIR + '/' + taxid_file, gg_ids2taxids)
# replace taxonomy IDs that have been merged into other IDs
gg_ids2taxids = update_taxids(merged, gg_ids2taxids)

gg_ids_accessions = fetch.parse_gg_accessions(GG_DIR + '/gg_13_5_accessions.txt', abort_after_lines=None)

gg_otumap_97_orig = map_ids.parse_gg_otu_map(GG_DIR + '/gg_13_5_otus/otus/97_otu_map.txt', abort_after_lines=None)


# Metaphlan
# NOTE(review): `o2` aliases the initial empty dict and is not used anywhere
# else in the visible notebook -- presumably a debugging leftover; kept so
# that other cells relying on it keep working.
o2 = mp_ids2taxids = {}
for taxid_file in ('final_taxid_metaphlan_gene.txt',
                   'final_taxid_metaphlan_gi.txt',
                   'final_taxid_metaphlan_nc.txt'):
    mp_ids2taxids = fetch.read_accesion_taxids(GG_DIR + '/' + taxid_file, mp_ids2taxids)
# replace taxonomy IDs that have been merged into other IDs
mp_ids2taxids = update_taxids(merged, mp_ids2taxids)

mp_markers = map_ids.parse_metaphlan_markers_info(GG_DIR + '/Metaphlan/markers_info.txt', abort_after_lines=None)





In [19]:
# OTU ID -> set of NCBI taxonomy IDs for the 97% GreenGenes OTU map
gg97 = map_gg_otumap_ids2taxids(
    gg_otumap_orig=gg_otumap_97_orig,
    gg_ids2taxids=gg_ids2taxids,
    gg_ids_accessions=gg_ids_accessions,
)

# clade -> {'lca': ..., 'suffixes': ...} for every clade with marker info
mp_clades = mp_mp_clades2lca(
    mp_markers=mp_markers,
    mp_ids2taxids=mp_ids2taxids,
    nodes=nodes,
)

1354495 not in taxonomy
-1 not in taxonomy


In [5]:
# For (at most the first 10001) clades, write one line listing the GreenGenes
# OTUs that share at least one taxonomy ID with the clade:
#   <number of OTUs> TAB <OTU IDs> TAB <clade> TAB <clade entry>
c = 0
# `with` closes the file even if the loop fails half-way
with open('/home/sjanssen/GreenGenes/res.txt', 'w') as f:
    for clade in mp_clades:
        c += 1
        lineage = mp_clades[clade]

        # Each entry of mp_clades is a dict with an 'lca' list and a
        # 'suffixes' set (see mp_mp_clades2lca).  Testing `taxid in lineage`
        # would only look at the dict keys and never match a taxonomy ID, so
        # collect the actual taxonomy IDs of the clade first.
        clade_taxids = set(lineage['lca']) | set(lineage['suffixes'])

        # an OTU matches as soon as one of its taxonomy IDs belongs to the clade
        l = [otu_id for otu_id in gg97
             if not clade_taxids.isdisjoint(gg97[otu_id])]
        f.write("%i\t%s\t%s\t%s\n" % (len(l), l, clade, lineage))

        if c > 10000:
            break

In [47]:
import pandas as pd

# Table with one row per OTU id (index) and its lineage string,
# read from a headerless two-column TSV file.
tax = pd.read_csv(
    '/home/sjanssen/GreenGenes/gg_13_5_otus/taxonomy/97_otu_taxonomy.txt',
    sep="\t",
    header=None,
    names=['otuid', 'lineage'],
    index_col=0,
)

In [50]:
# Show the lineage strings of the OTU ids that the MetaPhlAn matching cell
# reports for f__Brevibacteriaceae.
list(tax.loc[[580377, 86772, 1028133, 89039, 12520, 776873, 1134896, 566396],'lineage'])

['k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__casei',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__casei',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__casei',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__aureum',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__aureum',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__',
 'k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Brevibacteriaceae; g__Brevibacterium; s__']

In [45]:
# Match every entry of a MetaPhlAn profile to GreenGenes OTUs.
# Requires `match_taxid_to_otuid`, which is defined in a later cell of this
# notebook -- run that cell first.
filename = '/home/sjanssen/10370_Skin-Amina/MetaTranscriptomics/stephan/armpit_v6_t3.bam.f13_r1.fq.bacprofile'
tree = read_mp_output(filename)

for t in sorted(tree):
    # Walk the lineage string from its last to its first element and take
    # the first clade for which marker information exists.
    marker = None
    for c in reversed(t.split("|")):
        # only strip a trailing '_unclassified'; a plain `in` test would also
        # cut 13 characters off names that merely contain the word
        if c.endswith('_unclassified'):
            c = c[:-len('_unclassified')]
        if c in mp_clades:
            marker = c
            break
    print("%s\t%f\t%s" % (marker, tree[t], ""))

    if marker is None:
        # no element of this lineage has marker information, hence no OTUs
        print("\t", [])
        continue

    # look for suffixes first and fall back to the LCA path if none matches
    found_otus = []
    for taxid in mp_clades[marker]['suffixes']:
        # one scan of gg97 per taxonomy ID; extending by [] is a no-op
        found_otus.extend(match_taxid_to_otuid(taxid, gg97))
    if found_otus == []:
        for taxid in mp_clades[marker]['lca']:
            found_otus.extend(match_taxid_to_otuid(taxid, gg97))
    print("\t", found_otus)



f__Brevibacteriaceae	0.098970	
	 [580377, 86772, 1028133, 89039, 12520, 776873, 1134896, 566396]
s__Corynebacterium_pseudogenitalium	9.658200	
	 [1016547]
s__Corynebacterium_tuberculostearicum	19.351940	
	 [13493, 1040713]
f__Propionibacteriaceae	0.229420	
	 []
s__Propionibacterium_acnes	0.583130	
	 [586355, 567406, 4382956, 4121939, 4447394]
s__Propionibacterium_granulosum	0.384200	
	 [1074945, 386088]
s__Bacillus_licheniformis	0.025270	
	 [4317009, 2579228, 4466538, 264981, 4477551, 807830, 36125, 36519, 38338, 564208, 574051, 576641, 1113935, 346842, 4473249, 585279, 110248, 4310119, 4317009, 132112, 4328259, 332283, 689888, 174351, 180719, 187838, 191576, 210559, 3365024, 222397, 236385, 247284, 511562, 4443986, 4447476, 1040601, 4449418, 261270]
s__Bacillus_megaterium	1.248610	
	 [1140833]
s__Bacillus_subtilis	0.565010	
	 [270613, 275153, 276934, 542842, 24734, 286880, 820837, 821594, 301350, 40827, 576466, 96288, 927604, 1113935, 1117992, 4266982, 339020, 14757, 1145081, 890547, 

In [51]:
# Inspect the set of taxonomy IDs collected for OTU 524292.
gg97[524292]

{77133, 189668}

In [20]:
# Compare the full lineage of taxonomy ID 38303 with the LCA path and the
# suffix taxonomy IDs stored for the clade.
get_lineage(38303, nodes), mp_clades['s__Corynebacterium_pseudogenitalium']

([1, 131567, 2, 1783272, 201174, 1760, 85007, 1653, 1716, 38303],
 {'lca': [1, 131567, 2, 1783272, 201174, 1760, 85007, 1653, 1716],
  'suffixes': {38303,
   525264,
   1203557,
   1203558,
   1203559,
   1203560,
   1203563,
   1203564,
   1203566,
   1203567}})

In [10]:
# Scratch check of the slicing used in mp_mp_clades2lca:
# b[len(a):] yields the part of b beyond the length of a.
a = [1,2,3,4]
b = [1,2,3,4,5,6,7]

b[len(a):]

[5, 6, 7]

In [26]:
def match_taxid_to_otuid(query, gg_map):
    """Return the keys of ``gg_map`` whose value contains ``query``.

    Parameters
    ----------
    query
        Item to search for (a taxonomy ID in this notebook).
    gg_map : mapping
        Maps an OTU ID to a container supporting the ``in`` operator.

    Returns
    -------
    list
        Matching OTU IDs in the iteration order of ``gg_map``; empty when
        nothing matches.
    """
    return [otuid for otuid in gg_map if query in gg_map[otuid]]

# Sanity check with a taxonomy ID that presumably occurs in no OTU:
# the result is an empty list.
match_taxid_to_otuid(38300003, gg97)

[]