In [1]:
from ete2 import NCBITaxa
from Bio import Entrez
from collections import OrderedDict
import pandas as pd
import json
import re

In [2]:
# NCBI taxonomy handle (ete2) used by every lookup in this notebook.
ncbi = NCBITaxa()

Update the taxonomy database (this might take a few minutes)...

In [3]:
#ncbi.update_taxonomy_database()

#### Insert the root of the taxonomy to start retrieving information from

In [4]:
# Scientific name of the taxon used as the root of the extracted sub-taxonomy.
organism = "Teuthida"

In [5]:
# Despite its name, this maps the scientific name to the list of matching taxids.
taxid2name = ncbi.get_name_translator([organism])
taxid2name

{'Teuthida': [551290]}

In [6]:
# Keep the first taxid returned for the organism name.
organism_taxid = taxid2name[organism][0]
organism_taxid

551290

Available methods
- NCBITaxa.get_rank()
- NCBITaxa.get_lineage()
- NCBITaxa.get_taxid_translator()
- NCBITaxa.get_name_translator()
- NCBITaxa.translate_to_names()

In [7]:
# Every taxid below the chosen organism, intermediate (non-leaf) nodes included.
descendants = ncbi.get_descendant_taxa(organism, intermediate_nodes=True)
# Preview the names of the first ten descendants.
first_names = ncbi.translate_to_names(descendants[:10])
print(first_names)

[u'Loligo sp.', u'environmental samples', u'Dosidicus gigas environmental sample', u'Loliolus', u'Teuthida sp. BOLD:AAB0336', u'Teuthida sp. BOLD:AAD8239', u'Loliolus uyii', u'Sthenoteuthis sp. 1 SS-2016', u'Sthenoteuthis sp. 2 SS-2016', u'Sthenoteuthis sp. 3 SS-2016']


In [8]:
# The message is Italian for: "There are {} nodes in the taxonomy of {}".
print("Ci sono {} nodi nella tassonomia dei {}".format(len(descendants), organism))

Ci sono 614 nodi nella tassonomia dei Teuthida


In [9]:
# NOTE: despite the name, this holds the lineage *taxids* (root first, the
# organism itself last), not ranks.
ancestor_ranks = ncbi.get_lineage(organism_taxid)
ancestor_ranks

[1,
 131567,
 2759,
 33154,
 33208,
 6072,
 33213,
 33317,
 1206795,
 6447,
 6605,
 6606,
 215449,
 215450,
 551290]

In [10]:
# Ranks of the organism's descendants plus the organism itself.
descendants_and_self = descendants + [organism_taxid]
ranks = ncbi.get_rank(descendants_and_self)

# Ranks of every node that can occur in a lineage: ancestors and descendants.
full_ranks = ncbi.get_rank(ancestor_ranks + descendants)
full_ranks[1] = u'root'  # taxid 1 would otherwise be reported as 'no rank'

full_ranks

{1: u'root',
 2759: u'superkingdom',
 6072: u'no rank',
 6447: u'phylum',
 6605: u'class',
 6606: u'subclass',
 6615: u'family',
 6616: u'genus',
 6618: u'species',
 6622: u'species',
 6623: u'family',
 6624: u'genus',
 6625: u'species',
 6626: u'family',
 6627: u'genus',
 6628: u'species',
 6629: u'species',
 6630: u'genus',
 6631: u'species',
 6632: u'genus',
 6636: u'genus',
 6637: u'species',
 33154: u'no rank',
 33208: u'kingdom',
 33213: u'no rank',
 33317: u'no rank',
 34542: u'suborder',
 34543: u'genus',
 34544: u'species',
 34545: u'genus',
 34546: u'species',
 34547: u'genus',
 34548: u'species',
 34549: u'genus',
 34550: u'species',
 34551: u'species',
 34552: u'genus',
 34553: u'species',
 34554: u'family',
 34555: u'genus',
 34556: u'species',
 34557: u'family',
 34558: u'genus',
 34559: u'species',
 34560: u'family',
 34561: u'genus',
 34562: u'species',
 34563: u'genus',
 34564: u'species',
 34565: u'family',
 34566: u'genus',
 34567: u'species',
 34569: u'genus',
 3457

#### Build a dictionary mapping each taxid to its name
e.g. {551290: u'Teuthida'}

In [11]:
# Resolve every taxid to its scientific name with one bulk lookup instead of
# issuing one database query per taxid.
taxid_translator = ncbi.get_taxid_translator(list(full_ranks))

# The per-taxid lookup used to fail loudly on an unknown id; keep that guarantee
# so that later cells never hit a silent gap in the mapping.
missing_taxids = sorted(set(full_ranks) - set(taxid_translator))
if missing_taxids:
    raise KeyError("No name found for taxid(s): {}".format(missing_taxids))
taxid_translator

{1: u'root',
 2759: u'Eukaryota',
 6072: u'Eumetazoa',
 6447: u'Mollusca',
 6605: u'Cephalopoda',
 6606: u'Coleoidea',
 6615: u'Loliginidae',
 6616: u'Loligo',
 6618: u'Loligo forbesii',
 6622: u'Loligo vulgaris',
 6623: u'Enoploteuthidae',
 6624: u'Watasenia',
 6625: u'Watasenia scintillans',
 6626: u'Ommastrephidae',
 6627: u'Illex',
 6628: u'Illex argentinus',
 6629: u'Illex illecebrosus',
 6630: u'Nototodarus',
 6631: u'Nototodarus gouldi',
 6632: u'Ommastrephes',
 6636: u'Todarodes',
 6637: u'Todarodes pacificus',
 33154: u'Opisthokonta',
 33208: u'Metazoa',
 33213: u'Bilateria',
 33317: u'Protostomia',
 34542: u'Oegopsina',
 34543: u'Pterygioteuthis',
 34544: u'Pterygioteuthis microlampas',
 34545: u'Enoploteuthis',
 34546: u'Enoploteuthis reticulata',
 34547: u'Abraliopsis',
 34548: u'Abraliopsis sp.',
 34549: u'Abralia',
 34550: u'Abralia trigonura',
 34551: u'Nototodarus hawaiiensis',
 34552: u'Sthenoteuthis',
 34553: u'Sthenoteuthis oualaniensis',
 34554: u'Architeuthidae',
 

#### Create a dictionary of lineage for each taxid we fetched
It is structured in the following way: <br>
TAXID: {TAXONOMIC_RANK: TAXON_NAME} for each rank<br>
e.g. 
  * {551290: 
    * {..., 
        * 'infraclass': 'Neocoleoidea'},
            * {'superorder', 'Decapodiformes'},
                * {order: Teuthida}}

In [12]:
# NOTE(review): dead code kept for reference only; the next cell replaces it.
# As written it would fail: `key` (passed to get_lineage) is never defined --
# presumably `taxid` was meant.
# taxid_lineage = {}
# for taxid, rank in ranks.items():
#     taxid_lineage[taxid] = OrderedDict()
#     count_noranks = 0
#     for ancestor_id in ncbi.get_lineage(key):
#         lineage_level_name = taxid_translator[ancestor_id]
#         lineage_instance = full_ranks[ancestor_id] # u'Teuthida, u'Cephalopoda, etc...
#         # do not override no rank keys !
#         if lineage_instance == u'no rank':
#             if count_noranks > 0:
#                 lineage_instance += '_{}'.format(str(count_noranks)) #no rank_1
#             count_noranks += 1
#         taxid_lineage[taxid][lineage_instance] = lineage_level_name # set e.g. {u'superkingdom: u'Eukaryota'}
# taxid_lineage

In [13]:
def _label_lineage(lineage_ids, rank_of, name_of):
    """Map every node of one lineage to a unique rank label.

    Ranked nodes keep their rank as label. Unranked ('no rank') nodes are
    labelled after the closest ranked ancestor above them, so that they do not
    overwrite each other:

        for every ancestor, root first
            if it is unranked
                if the previous ancestor was ranked
                    label it  sub_<rank>
                else
                    label it  sub_<rank>_<n>   (n = 1, 2, ... within the run)
            else
                remember its rank and restart the run

    The closest ranked ancestor is tracked explicitly instead of being looked
    up by position in the partially built dictionary, so the labelling does
    not depend on list-style ``dict.items()`` and cannot drift when a label
    occurs twice in one lineage.

    Parameters
    ----------
    lineage_ids : iterable of int
        Taxids of the lineage, root first.
    rank_of : mapping
        taxid -> rank (e.g. u'order', u'no rank').
    name_of : mapping
        taxid -> scientific name.

    Returns
    -------
    OrderedDict
        label -> scientific name, in lineage order. If a label repeats, the
        later node replaces the earlier value.

    Raises
    ------
    ValueError
        If the lineage starts with an unranked node, because there is no
        ranked ancestor to name it after.
    """
    labelled = OrderedDict()
    parent_rank = None  # label of the closest ranked ancestor seen so far
    unranked_run = 0    # consecutive 'no rank' nodes seen below parent_rank
    for ancestor_id in lineage_ids:
        label = rank_of[ancestor_id]  # order, suborder, ...
        if label == u'no rank':
            if parent_rank is None:
                raise ValueError(
                    "Lineage starts with an unranked node (taxid {})".format(ancestor_id))
            if unranked_run == 0:
                label = u'sub_' + parent_rank
            else:
                label = u'sub_{}_{}'.format(parent_rank, unranked_run)
            unranked_run += 1
        else:
            parent_rank = label
            unranked_run = 0
        labelled[label] = name_of[ancestor_id]  # e.g. {u'superkingdom': u'Eukaryota'}
    return labelled


# One labelled lineage per taxid (the organism and all of its descendants).
taxid_lineage = {}
for taxid in ranks:
    taxid_lineage[taxid] = _label_lineage(ncbi.get_lineage(taxid), full_ranks, taxid_translator)
taxid_lineage

{6615: OrderedDict([(u'root', u'root'),
              (u'sub_root', u'cellular organisms'),
              (u'superkingdom', u'Eukaryota'),
              (u'sub_superkingdom', u'Opisthokonta'),
              (u'kingdom', u'Metazoa'),
              (u'sub_kingdom', u'Eumetazoa'),
              (u'sub_kingdom_1', u'Bilateria'),
              (u'sub_kingdom_2', u'Protostomia'),
              (u'sub_kingdom_3', u'Lophotrochozoa'),
              (u'phylum', u'Mollusca'),
              (u'class', u'Cephalopoda'),
              (u'subclass', u'Coleoidea'),
              (u'infraclass', u'Neocoleoidea'),
              (u'superorder', u'Decapodiformes'),
              (u'order', u'Teuthida'),
              (u'suborder', u'Myopsina'),
              (u'family', u'Loliginidae')]),
 6616: OrderedDict([(u'root', u'root'),
              (u'sub_root', u'cellular organisms'),
              (u'superkingdom', u'Eukaryota'),
              (u'sub_superkingdom', u'Opisthokonta'),
              (u'kingdom', u

#### Manual part: reorder columns (the last columns are the ones that were not present in the first few taxids, because labels are collected greedily, depth-first)

In [14]:
# Collect every rank label once, in order of first appearance across lineages.
seen_labels = []
for lineage_labels in taxid_lineage.values():
    for label in lineage_labels:
        if label not in seen_labels:
            seen_labels.append(label)
columns_set = tuple(seen_labels)
# The collected order is already correct up to and including 'order'.
idx_untilcorrect = columns_set.index("order")
columns_set

(u'root',
 u'sub_root',
 u'superkingdom',
 u'sub_superkingdom',
 u'kingdom',
 u'sub_kingdom',
 u'sub_kingdom_1',
 u'sub_kingdom_2',
 u'sub_kingdom_3',
 u'phylum',
 u'class',
 u'subclass',
 u'infraclass',
 u'superorder',
 u'order',
 u'suborder',
 u'family',
 u'genus',
 u'sub_genus',
 u'species',
 u'sub_family',
 u'sub_suborder',
 u'subspecies',
 u'sub_order')

In [15]:
# Labels up to and including 'order' are already sorted; the remaining ones are
# appended by hand in taxonomic order.
columns = list(columns_set[:idx_untilcorrect + 1])
columns.extend([u'sub_order',
                u'suborder',
                u'sub_suborder',
                u'family',
                u'sub_family',
                u'genus',
                u'sub_genus',
                u'species',
                u'subspecies'])

# Compare the labels themselves, not just how many there are: equal lengths
# alone would hide a hand-written label standing in for a missing one.
print("Did I miss any column? {}".format(sorted(columns) != sorted(columns_set)))

Did I miss any column? False


In [16]:
# One row per taxid, one column per rank label, in the curated column order.
dd = pd.DataFrame.from_dict(taxid_lineage, orient = "index", columns=columns)
# Show only the ten right-most columns.
dd.iloc[:,-10:]

Unnamed: 0,order,sub_order,suborder,sub_suborder,family,sub_family,genus,sub_genus,species,subspecies
6615,Teuthida,,Myopsina,,Loliginidae,,,,,
6616,Teuthida,,Myopsina,,Loliginidae,,Loligo,,,
6618,Teuthida,,Myopsina,,Loliginidae,,Loligo,,Loligo forbesii,
6622,Teuthida,,Myopsina,,Loliginidae,,Loligo,,Loligo vulgaris,
6623,Teuthida,,Oegopsina,,Enoploteuthidae,,,,,
6624,Teuthida,,Oegopsina,,Enoploteuthidae,,Watasenia,,,
6625,Teuthida,,Oegopsina,,Enoploteuthidae,,Watasenia,,Watasenia scintillans,
6626,Teuthida,,Oegopsina,,Ommastrephidae,,,,,
6627,Teuthida,,Oegopsina,,Ommastrephidae,,Illex,,,
6628,Teuthida,,Oegopsina,,Ommastrephidae,,Illex,,Illex argentinus,


In [17]:
# Columns generated for unranked ('no rank') nodes all carry the 'sub_' prefix.
filter_col = [col for col in dd if col.startswith('sub_')]
for f in filter_col:
    distinct_values = dd[f].unique()
    print("Column '{}' has {} unique value(s): {}".format(f, len(distinct_values), distinct_values))

Column 'sub_root' has 1 unique value(s): [u'cellular organisms']
Column 'sub_superkingdom' has 1 unique value(s): [u'Opisthokonta']
Column 'sub_kingdom' has 1 unique value(s): [u'Eumetazoa']
Column 'sub_kingdom_1' has 1 unique value(s): [u'Bilateria']
Column 'sub_kingdom_2' has 1 unique value(s): [u'Protostomia']
Column 'sub_kingdom_3' has 1 unique value(s): [u'Lophotrochozoa']
Column 'sub_order' has 3 unique value(s): [nan u'unclassified Teuthida' u'environmental samples']
Column 'sub_suborder' has 3 unique value(s): [nan u'unclassified Myopsina' u'environmental samples']
Column 'sub_family' has 3 unique value(s): [nan u'environmental samples' u'unclassified Cranchiidae']
Column 'sub_genus' has 4 unique value(s): [nan u'unclassified Loligo' u'unclassified Illex' u'environmental samples']


In [18]:
# select no_rank columns rooting (starting from) at the chosen organism i.e. avoid ancestors' no ranks
# Select the 'sub_*' columns starting at the chosen organism's own rank, i.e.
# skip the unranked levels that belong to its ancestors.
organism_rank = ncbi.get_rank([organism_taxid])[organism_taxid]
try:
    idx_filter = filter_col.index("sub_" + organism_rank)
except ValueError:
    # list.index found no 'sub_<rank>' column for the organism's rank.
    # NOTE(review): with this fallback the ancestors' 'sub_*' columns are kept too.
    idx_filter = 0
norank_col = filter_col[idx_filter:]
norank_col

[u'sub_order', u'sub_suborder', u'sub_family', u'sub_genus']

In [19]:
# dataframe with only those organism that have at least one no rank in the lineage
# Rows (taxids) that have a value in at least one of the selected 'sub_*' columns.
has_norank_level = dd[norank_col].notnull().any(axis = 1)
norank_df = dd[has_norank_level]

### Create dataset of taxid and related information

In [20]:
# First build a dictionary...
# First build a dictionary: taxid -> name, rank and '//'-joined lineage
# (once as taxids, once as names) ...
df = {}
for taxid in descendants + [organism_taxid]:
    lineage_id = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage_id)

    # Generator expressions with their own variable name are used on purpose:
    # in Python 2 a list comprehension leaks its loop variable, and reusing
    # `taxid` there rebinds the outer loop variable before it is used again.
    df[taxid] = {
        'name': ncbi.translate_to_names([taxid])[0],
        'rank': ncbi.get_rank([taxid])[taxid],
        'lineage_id': '//'.join(str(ancestor_id) for ancestor_id in lineage_id),
        'lineage_name': '//'.join(names[ancestor_id] for ancestor_id in lineage_id),
    }

In [21]:
# Pretty-print the whole per-taxid dictionary as a visual check.
print(json.dumps(df, indent = 2))

{
  "1091609": {
    "lineage_id": "1//131567//2759//33154//33208//6072//33213//33317//1206795//6447//6605//6606//215449//215450//551290//34542//6626//346248//1091609", 
    "name": "environmental samples", 
    "rank": "no rank", 
    "lineage_name": "root//cellular organisms//Eukaryota//Opisthokonta//Metazoa//Eumetazoa//Bilateria//Protostomia//Lophotrochozoa//Mollusca//Cephalopoda//Coleoidea//Neocoleoidea//Decapodiformes//Teuthida//Oegopsina//Ommastrephidae//Dosidicus//environmental samples"
  }, 
  "1091610": {
    "lineage_id": "1//131567//2759//33154//33208//6072//33213//33317//1206795//6447//6605//6606//215449//215450//551290//34542//6626//346248//1091609//1091610", 
    "name": "Dosidicus gigas environmental sample", 
    "rank": "species", 
    "lineage_name": "root//cellular organisms//Eukaryota//Opisthokonta//Metazoa//Eumetazoa//Bilateria//Protostomia//Lophotrochozoa//Mollusca//Cephalopoda//Coleoidea//Neocoleoidea//Decapodiformes//Teuthida//Oegopsina//Ommastrephidae//Dosidicu

In [22]:
# ... then convert the dictionary to dataframe
data = pd.DataFrame.from_dict(data=df, orient="index")
print(data.head())

                                             lineage_id             name  \
6615  1//131567//2759//33154//33208//6072//33213//33...      Loliginidae   
6616  1//131567//2759//33154//33208//6072//33213//33...           Loligo   
6618  1//131567//2759//33154//33208//6072//33213//33...  Loligo forbesii   
6622  1//131567//2759//33154//33208//6072//33213//33...  Loligo vulgaris   
6623  1//131567//2759//33154//33208//6072//33213//33...  Enoploteuthidae   

         rank                                       lineage_name  
6615   family  root//cellular organisms//Eukaryota//Opisthoko...  
6616    genus  root//cellular organisms//Eukaryota//Opisthoko...  
6618  species  root//cellular organisms//Eukaryota//Opisthoko...  
6622  species  root//cellular organisms//Eukaryota//Opisthoko...  
6623   family  root//cellular organisms//Eukaryota//Opisthoko...  


#### Add ancestor relationship

In [23]:
# The parent of a node is the second-to-last entry of its lineage.
# Computed column-wise: writing into the rows yielded by iterrows() is not
# guaranteed to modify the DataFrame, because those rows may be copies.
data['sonof_id'] = data['lineage_id'].str.split('//').str[-2]      # parent taxid (as string)
data['sonof_name'] = data['lineage_name'].str.split('//').str[-2]  # parent name

# Reorder columns, then order rows by lineage id. sort_values returns a new
# frame, so no column selection is mutated in place.
data = data[['name', 'rank', 'sonof_name', 'sonof_id', 'lineage_name', 'lineage_id']].sort_values(by=['lineage_id'])
data.head()

Unnamed: 0,name,rank,sonof_name,sonof_id,lineage_name,lineage_id
551290,Teuthida,order,Decapodiformes,215450,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...
1986411,environmental samples,no rank,Teuthida,551290,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...
1986412,Teuthida environmental sample,species,environmental samples,1986411,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...
2002786,Doryteuthis environmental sample,species,environmental samples,1986411,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...
2002799,Illex environmental sample,species,environmental samples,1986411,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...


#### Create dataframe for full taxonomy (including everything)

In [24]:
# Index-aligned join of the per-taxid summary with the per-rank lineage columns.
full_taxonomy = data.join(dd)
full_taxonomy

Unnamed: 0,name,rank,sonof_name,sonof_id,lineage_name,lineage_id,root,sub_root,superkingdom,sub_superkingdom,...,order,sub_order,suborder,sub_suborder,family,sub_family,genus,sub_genus,species,subspecies
551290,Teuthida,order,Decapodiformes,215450,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,,,,,,,,
1986411,environmental samples,no rank,Teuthida,551290,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,environmental samples,,,,,,,,
1986412,Teuthida environmental sample,species,environmental samples,1986411,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,environmental samples,,,,,,,Teuthida environmental sample,
2002786,Doryteuthis environmental sample,species,environmental samples,1986411,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,environmental samples,,,,,,,Doryteuthis environmental sample,
2002799,Illex environmental sample,species,environmental samples,1986411,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,environmental samples,,,,,,,Illex environmental sample,
34542,Oegopsina,suborder,Teuthida,551290,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,,,,,,
283043,Batoteuthidae,family,Oegopsina,34542,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Batoteuthidae,,,,,
283044,Batoteuthis,genus,Batoteuthidae,283043,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Batoteuthidae,,Batoteuthis,,,
283045,Batoteuthis skolops,species,Batoteuthis,283044,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Batoteuthidae,,Batoteuthis,,Batoteuthis skolops,
34554,Architeuthidae,family,Oegopsina,34542,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Architeuthidae,,,,,


#### Create taxonomy for organisms that have at least one no_rank level in their lineage

In [25]:
# Right join: keep only the taxids that are present in norank_df.
norank_taxonomy = data.join(norank_df, how='right')
norank_taxonomy

Unnamed: 0,name,rank,sonof_name,sonof_id,lineage_name,lineage_id,root,sub_root,superkingdom,sub_superkingdom,...,order,sub_order,suborder,sub_suborder,family,sub_family,genus,sub_genus,species,subspecies
55716,Loligo sp.,species,unclassified Loligo,184220,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Myopsina,,Loliginidae,,Loligo,unclassified Loligo,Loligo sp.,
184220,unclassified Loligo,no rank,Loligo,6616,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Myopsina,,Loliginidae,,Loligo,unclassified Loligo,,
184221,Loligo sp. AL9407-Loligo-1K,species,unclassified Loligo,184220,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Myopsina,,Loliginidae,,Loligo,unclassified Loligo,Loligo sp. AL9407-Loligo-1K,
184222,unclassified Illex,no rank,Illex,6627,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Ommastrephidae,,Illex,unclassified Illex,,
184223,Illex sp. AL9407-Illex-2K,species,unclassified Illex,184222,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Ommastrephidae,,Illex,unclassified Illex,Illex sp. AL9407-Illex-2K,
517129,environmental samples,no rank,Berryteuthis,294703,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Gonatidae,,Berryteuthis,environmental samples,,
517130,Berryteuthis environmental sample,species,environmental samples,517129,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Oegopsina,,Gonatidae,,Berryteuthis,environmental samples,Berryteuthis environmental sample,
665457,environmental samples,no rank,Sepioteuthis,34569,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Myopsina,,Loliginidae,,Sepioteuthis,environmental samples,,
665458,Sepioteuthis environmental sample,species,environmental samples,665457,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,,Myopsina,,Loliginidae,,Sepioteuthis,environmental samples,Sepioteuthis environmental sample,
725814,unclassified Teuthida,no rank,Teuthida,551290,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Teuthida,unclassified Teuthida,,,,,,,,


#### Create complete taxonomy (that is the difference between full and no_ranks)

In [32]:
# Taxids of the full taxonomy that are not part of the no-rank taxonomy.
ranked_only_index = full_taxonomy.index.difference(norank_taxonomy.index)
# Drop the columns left entirely empty. dropna returns a new frame; calling it
# with inplace=True on the result of a .loc selection can trigger
# SettingWithCopyWarning.
complete_taxonomy = full_taxonomy.loc[ranked_only_index].dropna(axis=1, how='all')
complete_taxonomy

Unnamed: 0,name,rank,sonof_name,sonof_id,lineage_name,lineage_id,root,sub_root,superkingdom,sub_superkingdom,...,class,subclass,infraclass,superorder,order,suborder,family,genus,species,subspecies
6615,Loliginidae,family,Myopsina,551347,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Myopsina,Loliginidae,,,
6616,Loligo,genus,Loliginidae,6615,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Myopsina,Loliginidae,Loligo,,
6618,Loligo forbesii,species,Loligo,6616,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Myopsina,Loliginidae,Loligo,Loligo forbesii,
6622,Loligo vulgaris,species,Loligo,6616,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Myopsina,Loliginidae,Loligo,Loligo vulgaris,
6623,Enoploteuthidae,family,Oegopsina,34542,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Oegopsina,Enoploteuthidae,,,
6624,Watasenia,genus,Enoploteuthidae,6623,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Oegopsina,Enoploteuthidae,Watasenia,,
6625,Watasenia scintillans,species,Watasenia,6624,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Oegopsina,Enoploteuthidae,Watasenia,Watasenia scintillans,
6626,Ommastrephidae,family,Oegopsina,34542,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Oegopsina,Ommastrephidae,,,
6627,Illex,genus,Ommastrephidae,6626,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Oegopsina,Ommastrephidae,Illex,,
6628,Illex argentinus,species,Illex,6627,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,root,cellular organisms,Eukaryota,Opisthokonta,...,Cephalopoda,Coleoidea,Neocoleoidea,Decapodiformes,Teuthida,Oegopsina,Ommastrephidae,Illex,Illex argentinus,


### Save all dataframes

In [28]:
# Write each taxonomy table to its own CSV file; the index column is labelled 'taxid'.
csv_options = {'index_label': 'taxid'}
full_taxonomy.to_csv("taxonomy_full.csv", **csv_options)
norank_taxonomy.to_csv("taxonomy_norank.csv", **csv_options)
complete_taxonomy.to_csv("taxonomy_complete.csv", **csv_options)

#### Remove lineage common to all entries (i.e. until Teuthida included)

In [None]:
#common_lineage_to_remove = r"root//.*//" + organism
#data.replace(to_replace = common_lineage_to_remove,
#             value = "", inplace = True, regex = True)
#data.head()

In [None]:
# Manually add the Teuthida order - NOT needed anymore, it is already present
# organism_taxid = 551290
# data.loc[organism_taxid] = ["Teuthida", "order", 
#                             "Teuthida", # lineage_name
#                             "1//131567//2759//33154//33208//6072//33213//33317//1206795//6447//6605//6606//215449//215450//551290",
#                            None, None]
# data.tail()

In [None]:
# NOTE(review): `cols` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. It also rebinds `df`, which earlier
# held the per-taxid dictionary. Looks like leftover scratch work -- confirm.
list_of_series = [data.index, data.index]
df = pd.DataFrame(list_of_series, columns=cols)

Create dataframe of lineage of taxonomy ranks for each taxid<br>That is (taxid:"279107", rank_lineage: "order//suborder//family//genus//species")

In [None]:
# Series of '//'-joined lineage taxids, one entry per row of `data`.
id_taxidLineage = data.lineage_id
id_taxidLineage.head()

In [None]:
# Root the lineage starting from the organism of interest
id_taxidLineage = str(organism_taxid) + id_taxidLineage.str.split(str(organism_taxid), expand=True)[1].astype(str)
id_taxidLineage.head()

In [None]:
# Use data['rank'] rather than data.rank: the attribute form resolves to the
# DataFrame.rank method, not to the 'rank' column.
id_rankorder = data['rank']
id_rankorder.head()

In [None]:
# Translate each rooted lineage of taxids into the matching list of ranks.
# All lists are collected first and the Series is built in one go, instead of
# growing an empty Series one list-valued element at a time.
rank_lists = [
    [id_rankorder[int(lin_id)] for lin_id in rooted_lineage.split("//")]
    for rooted_lineage in id_taxidLineage
]
# Labels are the stringified taxids, as before.
id_rankLineage = pd.Series(rank_lists, index=[str(idx) for idx in id_taxidLineage.index])
id_rankLineage.head()

In [None]:
# Wrap the Series in a one-column DataFrame whose column is named 'rank_lineage'.
rank_lin_df = id_rankLineage.to_frame(name = "rank_lineage")
rank_lin_df

In [None]:
# Collapse each list of ranks into a single '//'-separated string.
joined_ranks = rank_lin_df.rank_lineage.str.join("//")
rank_lin_df = rank_lin_df.assign(rank_lineage=joined_ranks)
rank_lin_df.head()

In [None]:
# Attach the rank lineage to the original dataframe.
# The labels are converted to int first so that they align with `data` on join.
rank_lin_df.index = rank_lin_df.index.map(int)
column_order = ['name', 'rank', 'lineage_name', 'lineage_rank', 'lineage_id', 'sonof_id', 'sonof_name']
df = data.join(rank_lin_df).rename(columns={"rank_lineage": "lineage_rank"})[column_order]
df.head()

In [None]:
#df.to_csv(path_or_buf = 'taxonomy_teuthida.csv', index_label = 'taxid')