In [1]:
from ete2 import NCBITaxa
from Bio import Entrez
import pandas as pd
import json
import re

In [2]:
ncbi = NCBITaxa()

Update taxonomy database, might takes a few minutes...

In [3]:
#ncbi.update_taxonomy_database()

#### Insert the root of the taxonomy to start retrieving information from

In [4]:
organism = "Teuthida"

In [5]:
taxid2name = ncbi.get_name_translator([organism])
taxid2name

{'Teuthida': [551290]}

Available methods
- NCBITaxa.get_rank()
- NCBITaxa.get_lineage()
- NCBITaxa.get_taxid_translator()
- NCBITaxa.get_name_translator()
- NCBITaxa.translate_to_names()

In [6]:
descendants = ncbi.get_descendant_taxa(organism, intermediate_nodes=True)
print ncbi.translate_to_names(descendants)

[u'Loligo sp.', u'environmental samples', u'Dosidicus gigas environmental sample', u'Loliolus', u'Teuthida sp. BOLD:AAB0336', u'Teuthida sp. BOLD:AAD8239', u'Loliolus uyii', u'Sthenoteuthis sp. 1 SS-2016', u'Sthenoteuthis sp. 2 SS-2016', u'Sthenoteuthis sp. 3 SS-2016', u'Sthenoteuthis sp. 4 SS-2016', u'Uroteuthis', u'Filippovia', u'Uroteuthis edulis', u'Architeuthis dux', u'Dosidicus gigas', u'Moroteuthis ingens', u'Todarodes filippovae', u'Sepioteuthis cf. lessoniana ACEH12MAY04', u'Bathyteuthis sp. RJ-2009', u'Pyroteuthis sp. RJ-2009', u'Helicocranchia sp. RJ-2009', u'Histioteuthis sp. RJ-2009', u'Leachia sp. RJ-2009', u'Helicocranchia', u'Todarodes sagittatus', u'Histioteuthis hoylei', u'Galiteuthis sp. Gali6', u'Ommastrephes bartramii', u'Sepioteuthis australis', u'Ancistrocheiridae', u'Ancistrocheirus', u'Helicocranchia pfefferi', u'Cycloteuthidae', u'Discoteuthis', u'Discoteuthis laciniosa', u'Bathyteuthidae', u'Bathyteuthis', u'Bathyteuthis abyssicola', u'Chiroteuthidae', u'Chir

In [7]:
len(descendants)

614

In [8]:
ranks = ncbi.get_rank(descendants)
ranks

{6615: u'family',
 6616: u'genus',
 6618: u'species',
 6622: u'species',
 6623: u'family',
 6624: u'genus',
 6625: u'species',
 6626: u'family',
 6627: u'genus',
 6628: u'species',
 6629: u'species',
 6630: u'genus',
 6631: u'species',
 6632: u'genus',
 6636: u'genus',
 6637: u'species',
 34542: u'suborder',
 34543: u'genus',
 34544: u'species',
 34545: u'genus',
 34546: u'species',
 34547: u'genus',
 34548: u'species',
 34549: u'genus',
 34550: u'species',
 34551: u'species',
 34552: u'genus',
 34553: u'species',
 34554: u'family',
 34555: u'genus',
 34556: u'species',
 34557: u'family',
 34558: u'genus',
 34559: u'species',
 34560: u'family',
 34561: u'genus',
 34562: u'species',
 34563: u'genus',
 34564: u'species',
 34565: u'family',
 34566: u'genus',
 34567: u'species',
 34569: u'genus',
 34570: u'species',
 51370: u'species',
 51371: u'species',
 54069: u'species',
 55283: u'species',
 55284: u'genus',
 55285: u'species',
 55716: u'species',
 55717: u'genus',
 55718: u'species',


In [9]:
df = {}
for taxid in descendants:
    df[taxid] = {}
    
    specie = ncbi.translate_to_names([taxid])
    rank_dict = ncbi.get_rank([taxid])
    lineage_id = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage_id)
    lineage_name = [names[taxid] for taxid in lineage_id]
    
    df[taxid]['name'] = specie[0]
    df[taxid]['rank'] = rank_dict[taxid]
    df[taxid]['lineage_id'] = '//'.join([str(char) for char in lineage_id])
    df[taxid]['lineage_name'] = '//'.join(lineage_name)

In [10]:
print(json.dumps(df, indent = 2))

{
  "1091609": {
    "lineage_id": "1//131567//2759//33154//33208//6072//33213//33317//1206795//6447//6605//6606//215449//215450//551290//34542//6626//346248//1091609", 
    "name": "environmental samples", 
    "rank": "no rank", 
    "lineage_name": "root//cellular organisms//Eukaryota//Opisthokonta//Metazoa//Eumetazoa//Bilateria//Protostomia//Lophotrochozoa//Mollusca//Cephalopoda//Coleoidea//Neocoleoidea//Decapodiformes//Teuthida//Oegopsina//Ommastrephidae//Dosidicus//environmental samples"
  }, 
  "1091610": {
    "lineage_id": "1//131567//2759//33154//33208//6072//33213//33317//1206795//6447//6605//6606//215449//215450//551290//34542//6626//346248//1091609//1091610", 
    "name": "Dosidicus gigas environmental sample", 
    "rank": "species", 
    "lineage_name": "root//cellular organisms//Eukaryota//Opisthokonta//Metazoa//Eumetazoa//Bilateria//Protostomia//Lophotrochozoa//Mollusca//Cephalopoda//Coleoidea//Neocoleoidea//Decapodiformes//Teuthida//Oegopsina//Ommastrephidae//Dosidicu

In [11]:
data = pd.DataFrame.from_dict(data=df, orient="index")
print(data.head())
data = data[['name', 'rank', 'lineage_name', 'lineage_id']] # reorder columns
print(data.columns)

                                             lineage_id             name  \
6615  1//131567//2759//33154//33208//6072//33213//33...      Loliginidae   
6616  1//131567//2759//33154//33208//6072//33213//33...           Loligo   
6618  1//131567//2759//33154//33208//6072//33213//33...  Loligo forbesii   
6622  1//131567//2759//33154//33208//6072//33213//33...  Loligo vulgaris   
6623  1//131567//2759//33154//33208//6072//33213//33...  Enoploteuthidae   

         rank                                       lineage_name  
6615   family  root//cellular organisms//Eukaryota//Opisthoko...  
6616    genus  root//cellular organisms//Eukaryota//Opisthoko...  
6618  species  root//cellular organisms//Eukaryota//Opisthoko...  
6622  species  root//cellular organisms//Eukaryota//Opisthoko...  
6623   family  root//cellular organisms//Eukaryota//Opisthoko...  
Index([u'name', u'rank', u'lineage_name', u'lineage_id'], dtype='object')


In [12]:
#data.to_csv(path_or_buf = 'taxonomy_teuthide.csv', index_label='taxid')

In [13]:
data['sonof_id'] = None
data['sonof_name'] = None
for index, row in data.iterrows():
    row['sonof_id'] = row['lineage_id'].split('//')[-2] # take father node
    row['sonof_name'] = row['lineage_name'].split('//')[-2] # take father node    
    #row['son_of_(rank_name)'] = data[index, 'son_of']
data.head()

Unnamed: 0,name,rank,lineage_name,lineage_id,sonof_id,sonof_name
6615,Loliginidae,family,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,551347,Myopsina
6616,Loligo,genus,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,6615,Loliginidae
6618,Loligo forbesii,species,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,6616,Loligo
6622,Loligo vulgaris,species,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,6616,Loligo
6623,Enoploteuthidae,family,root//cellular organisms//Eukaryota//Opisthoko...,1//131567//2759//33154//33208//6072//33213//33...,34542,Oegopsina


#### Remove lineage common to all entries (i.e. until Teuthida included)

In [14]:
common_lineage_to_remove = r"root//.*//" + organism
data.replace(to_replace = common_lineage_to_remove,
             value = "", inplace = True, regex = True)
data.head()

Unnamed: 0,name,rank,lineage_name,lineage_id,sonof_id,sonof_name
6615,Loliginidae,family,//Myopsina//Loliginidae,1//131567//2759//33154//33208//6072//33213//33...,551347,Myopsina
6616,Loligo,genus,//Myopsina//Loliginidae//Loligo,1//131567//2759//33154//33208//6072//33213//33...,6615,Loliginidae
6618,Loligo forbesii,species,//Myopsina//Loliginidae//Loligo//Loligo forbesii,1//131567//2759//33154//33208//6072//33213//33...,6616,Loligo
6622,Loligo vulgaris,species,//Myopsina//Loliginidae//Loligo//Loligo vulgaris,1//131567//2759//33154//33208//6072//33213//33...,6616,Loligo
6623,Enoploteuthidae,family,//Oegopsina//Enoploteuthidae,1//131567//2759//33154//33208//6072//33213//33...,34542,Oegopsina


In [15]:
# Manually add Theuthida order
organism_taxid = 551290
data.loc[organism_taxid] = ["Teuthida", "order", 
                            "Teuthida", # lineage_name
                            "1//131567//2759//33154//33208//6072//33213//33317//1206795//6447//6605//6606//215449//215450//551290",
                           None, None]
data.tail()

Unnamed: 0,name,rank,lineage_name,lineage_id,sonof_id,sonof_name
2502176,Onychoteuthis cf. banksii HJ-2019,species,//Oegopsina//Onychoteuthidae//Onychoteuthis//O...,1//131567//2759//33154//33208//6072//33213//33...,61726.0,Onychoteuthis
2502177,Helicocranchia sp. A HJ-2019,species,//Oegopsina//Cranchiidae//Helicocranchia//Heli...,1//131567//2759//33154//33208//6072//33213//33...,78432.0,Helicocranchia
2502178,Liguriella podophthalma,species,//Oegopsina//Cranchiidae//Liguriella//Liguriel...,1//131567//2759//33154//33208//6072//33213//33...,2053952.0,Liguriella
2516895,Chtenopteryx sp. C CI-2019,species,//Oegopsina//Chtenopterygidae//Chtenopteryx//C...,1//131567//2759//33154//33208//6072//33213//33...,61698.0,Chtenopteryx
551290,Teuthida,order,Teuthida,1//131567//2759//33154//33208//6072//33213//33...,,


In [16]:
data.sort_values(by=['lineage_id'], inplace=True)

Create dataframe of lineage of taxonomy ranks for each taxid<br>That is (taxid:"279107", rank_lineage: "order//suborder//family//genus//species")

In [17]:
id_taxidLineage = data.lineage_id
id_taxidLineage.head()

551290     1//131567//2759//33154//33208//6072//33213//33...
1986411    1//131567//2759//33154//33208//6072//33213//33...
1986412    1//131567//2759//33154//33208//6072//33213//33...
2002786    1//131567//2759//33154//33208//6072//33213//33...
2002799    1//131567//2759//33154//33208//6072//33213//33...
Name: lineage_id, dtype: object

In [18]:
# Root the lineage starting from the organism of interest
id_taxidLineage = str(organism_taxid) + id_taxidLineage.str.split(str(organism_taxid), expand=True)[1].astype(str)
id_taxidLineage.head()

551290                       551290
1986411             551290//1986411
1986412    551290//1986411//1986412
2002786    551290//1986411//2002786
2002799    551290//1986411//2002799
Name: 1, dtype: object

In [19]:
#id_rankorder = data.rank # rank is a function of dataframes
id_rankorder = data['rank']
id_rankorder.head()

551290       order
1986411    no rank
1986412    species
2002786    species
2002799    species
Name: rank, dtype: object

In [20]:
id_rankLineage = pd.Series()
for idx, lineage_list in id_taxidLineage.str.split("//").iteritems():
    rank_list = []
    for lin_id in lineage_list:
        lin_rank = id_rankorder[int(lin_id)]
        rank_list.append(lin_rank)
    id_rankLineage[str(idx)] = rank_list
id_rankLineage.head()

551290                       [order]
1986411             [order, no rank]
1986412    [order, no rank, species]
2002786    [order, no rank, species]
2002799    [order, no rank, species]
dtype: object

In [21]:
rank_lin_df = id_rankLineage.to_frame(name = "rank_lineage")
rank_lin_df

Unnamed: 0,rank_lineage
551290,[order]
1986411,"[order, no rank]"
1986412,"[order, no rank, species]"
2002786,"[order, no rank, species]"
2002799,"[order, no rank, species]"
34542,"[order, suborder]"
283043,"[order, suborder, family]"
283044,"[order, suborder, family, genus]"
283045,"[order, suborder, family, genus, species]"
34554,"[order, suborder, family]"


In [22]:
rank_lin_df = rank_lin_df.assign(rank_lineage = lambda x: x.rank_lineage.str.join("//"))
rank_lin_df.head()

Unnamed: 0,rank_lineage
551290,order
1986411,order//no rank
1986412,order//no rank//species
2002786,order//no rank//species
2002799,order//no rank//species


In [23]:
# Merge original dataframe to the new one with lineage rank
rank_lin_df.index = rank_lin_df.index.map(int)
df = data.join(rank_lin_df)
df.rename(columns = {"rank_lineage": "lineage_rank"}, inplace = True)
df = df[['name', 'rank', 'lineage_name', 'lineage_rank', 'lineage_id', 'sonof_id', 'sonof_name']] # reorder columns
df.head()

Unnamed: 0,name,rank,lineage_name,lineage_rank,lineage_id,sonof_id,sonof_name
551290,Teuthida,order,Teuthida,order,1//131567//2759//33154//33208//6072//33213//33...,,
1986411,environmental samples,no rank,//environmental samples,order//no rank,1//131567//2759//33154//33208//6072//33213//33...,551290.0,Teuthida
1986412,Teuthida environmental sample,species,environmental sample,order//no rank//species,1//131567//2759//33154//33208//6072//33213//33...,1986411.0,environmental samples
2002786,Doryteuthis environmental sample,species,//environmental samples//Doryteuthis environme...,order//no rank//species,1//131567//2759//33154//33208//6072//33213//33...,1986411.0,environmental samples
2002799,Illex environmental sample,species,//environmental samples//Illex environmental s...,order//no rank//species,1//131567//2759//33154//33208//6072//33213//33...,1986411.0,environmental samples


In [24]:
df.to_csv(path_or_buf = 'taxonomy_teuthida.csv', index_label = 'taxid')