# Dev notebook

In [1]:
from ncbi_taxonomy import taxonomy

In [2]:
# Point the session at the local NCBI taxdump directory, then load its tables.
taxdump_dir = "./data/taxdump/"
session = taxonomy.NCBI_Taxonomy(taxdump_dir)
session.load_taxonomy_data()

In [5]:
# Preview the first three rows of the loaded names table.
session.names.head(3)

Unnamed: 0,tax_id,name_txt,unique_name,name_class
0,1,all,,synonym
1,1,root,,scientific name
2,2,Bacteria,Bacteria <bacteria>,scientific name


In [6]:
# Preview the first three rows of the loaded nodes table.
session.nodes.head(3)

Unnamed: 0_level_0,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,no rank,,8,0,1,0,0,0,0,0,
2,131567,superkingdom,,0,0,11,0,0,0,0,0,
6,335928,genus,,0,1,11,1,0,1,0,0,


### 1. get_tax_id() --> get tax_id from name

In [5]:
# Resolve each query name to its tax_id and print one ID per line.
for query_name in ("MERS", "SARS"):
    tax_id = session.get_tax_id(query_name)
    print(tax_id)

1335626
694009


In [6]:
# A name that matches several taxa is ambiguous: get_tax_id() lists the
# candidate rows and prompts the user to type the tax_id to use.
# NOTE(review): this cell blocks on interactive input, so it cannot run
# unattended (e.g. under Restart & Run All).
session.get_tax_id("Treponemataceae")

Multiple matches to query found:
     tax_id         name_txt                    unique_name name_class
598     137  Treponemataceae  Treponemataceae <spirochetes>    in-part
807     170  Treponemataceae     Treponemataceae <bacteria>    in-part
Please select tax_id:  137


137

In [7]:
# An unknown name makes get_tax_id() raise IndexError. Catch it here so the
# demonstration does not halt a Restart & Run All, and print it in the same
# "<ExceptionType>: <message>" form a traceback would end with.
try:
    session.get_tax_id("Apple")
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: Invalid name - not found or possible typo

### 2. Create node object with TaxonomyNode class

In [8]:
# Wrap the tax_id resolved for each name in a TaxonomyNode.
nodes = [
    taxonomy.TaxonomyNode(session.get_tax_id(query_name))
    for query_name in ("MERS", "SARS")
]

print(nodes)

[TaxonomyNode(1335626), TaxonomyNode(694009)]


### 3. get_name_txt() --> Get name_txt corresponding to tax_id

In [11]:
# Show the first node's name under two different name classes.
for name_class in ("scientific name", "acronym"):
    print(session.get_name_txt(nodes[0].tax_id, name_class=name_class))

Middle East respiratory syndrome-related coronavirus
MERS


### 4. get_node_info() --> get associated information for node

In [12]:
# Ask the session for each node's details. The return value is discarded;
# presumably get_node_info() fills in the node object in place (the nodes
# print with rank/name/parent afterwards) -- TODO confirm.
for node in nodes:
    session.get_node_info(node)

In [13]:
# print() shows the node's multi-line str() summary; a bare expression would
# show the short TaxonomyNode(<tax_id>) repr instead.
print(nodes[0])

tax_id: 1335626
rank: species
name_txt: Middle East respiratory syndrome-related coronavirus
parent_txt_id: 2509494


In [14]:
# Same summary for the second node (SARS).
print(nodes[1])

tax_id: 694009
rank: species
name_txt: Severe acute respiratory syndrome-related coronavirus
parent_txt_id: 2509511


### 5. create_node_objects() --> accept a list of names (or tax_ids) and generate node objects

In [7]:
# Build the node objects in one call, starting from names.
nodes = session.create_node_objects(["MERS", "SARS"])
for idx in (0, 1):
    print(nodes[idx])

tax_id: 1335626
rank: species
name_txt: Middle East respiratory syndrome-related coronavirus
parent_txt_id: 2509494
tax_id: 694009
rank: species
name_txt: Severe acute respiratory syndrome-related coronavirus
parent_txt_id: 2509511


In [12]:
# Same call, but starting from tax_ids rather than names.
query_tax_ids = [1335626, 694009, 2509511]
nodes = session.create_node_objects(query_tax_ids, from_tax_id=True)
for idx in range(3):
    print(nodes[idx])

tax_id: 1335626
rank: species
name_txt: Middle East respiratory syndrome-related coronavirus
parent_txt_id: 2509494
tax_id: 694009
rank: species
name_txt: Severe acute respiratory syndrome-related coronavirus
parent_txt_id: 2509511
tax_id: 2509511
rank: subgenus
name_txt: Sarbecovirus
parent_txt_id: 694002


### 6. get_LCA() --> get the lowest common ancestor of the selected nodes

In [13]:
# Find the lowest common ancestor of every node currently in `nodes`
# (relies on `nodes` as built by the preceding create_node_objects() cell).
lca = session.get_LCA(node_list=nodes)
print(lca)

tax_id: 694002
rank: genus
name_txt: Betacoronavirus
parent_txt_id: 2501931
