# NCBI Taxonomy
**[Work in progress]**

This notebook downloads the NCBI taxonomy, including the taxonomy id, scientific name, and synonymes.

Bacteria, Invertebrates, Phages, Plants and Fungi, and Synthetic and Chimeric are currently excluded.

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7408187/

Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [4]:
import os
import pandas as pd
from pathlib import Path
from functools import reduce
from ftplib import FTP

In [5]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [6]:
# Path will take care of handling operating system differences.
NODE_DATA = os.getenv('NODE_DATA', default='../data/nodes/')
RELATIONSHIP_DATA = os.getenv('RELATIONSHIP_DATA', default='../data/relationships/')
CACHE = os.getenv('KGBIO_CACHE', default='../cache')

In [7]:
os.makedirs(os.path.join(CACHE), exist_ok=True)

### Division ids

|divisionId|division|
|---------:|--------|
|0|Bacteria|
|1|Invertebrates
|2|Mammals
|3|Phages
|4|Plants and Fungi
|5|Primates
|6|Rodents
|7|Synthetic and Chimeric
|8|Unassigned
|9|Viruses
|10|Vertebrates
|11|Environmental samples

In [8]:
# Divisions from Taxonomy tree to be included in data files
# 8: unassgined division contains root node!
DIVISION_IDS = ['0', '1', '2', '3', '4', '5','6', '7','8','9','10', '11']

## Download NCBI Taxonomy Files

In [14]:
url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.tar.gz'
FTP_DIR = 'pub/taxonomy'
SERVER = 'ftp.ncbi.nih.gov'

In [15]:
ftp = FTP(SERVER)
ftp.login()
ftp.cwd('pub/taxonomy')
# #ftp.retrlines('LIST') 
# #ftp.nlst()

'250 CWD command successful'

In [11]:
filenames = ['taxdump.tar.gz']

for filename in filenames:
    local_filename = os.path.join('../cache/', filename)
    with open(local_filename, 'wb') as f:
         ftp.retrbinary('RETR %s' % filename, f.write)
            
ftp.quit()

'221 Goodbye.'

In [13]:
import tarfile
tar = tarfile.open(os.path.join('../cache/NcbiTaxonomy/2022-04-03-230442/', 'taxdump.tar.gz'))

#tar = tarfile.open('../cache/NcbiTaxonomy/2022-04-03-230442/taxdmp.tar.gz')
tar.extractall('../cache/NcbiTaxonomy/2022-04-03-230442/')
tar.close()

FileNotFoundError: [Errno 2] No such file or directory: '../cache/NcbiTaxonomy/2022-04-03-230442/taxdump.tar.gz'

### Import NCBI Taxonomy Names

In [None]:
columns = ['id', 'name', 'nameCategory']

In [None]:
names = pd.read_csv(os.path.join(CACHE, 'names.dmp'), sep='\t\|\t', engine='python', 
                    usecols=[0,1,3], names=columns, header=None, dtype='str')

In [None]:
names['nameCategory'] = names['nameCategory'].str.replace('\t\|', '', regex=True)
names.fillna('', inplace=True)

In [None]:
print('Number of taxonomyIds:', len(names['id'].unique()))

In [None]:
print('Number of taxonomyIds:', names['nameCategory'].unique())

In [None]:
sci_name = names.query("nameCategory == 'scientific name'").copy()
sci_name.rename(columns={'name': 'scientificName'}, inplace=True)
sci_name = sci_name[['id', 'scientificName']]

In [None]:
print('scientific names:', sci_name.shape[0])

In [None]:
names1 = names.merge(sci_name, on='id', how='left')

In [None]:
names1.head()

In [None]:
names2 = names1.groupby(['id', 'scientificName'])['name'].apply(list).reset_index(name='synonyms')

In [None]:
names2.head()

In [None]:
names2['name'] = names2['scientificName']

In [None]:
names2['id'] = 'taxonomy:' + names2['id']

In [None]:
names2['synonyms'] = names2['synonyms'].apply(lambda x: '|'.join(x))

In [None]:
names2.head(10)

### Import NCBI Taxonomy Nodes

In [None]:
node_columns = ['id', 'parentId', 'rank', 'divisionId']

In [None]:
nodes = pd.read_csv(os.path.join(CACHE, 'nodes.dmp'), sep='\t\|\t', engine='python', 
                    usecols=[0,1,2,4], names=node_columns, header=None, dtype='str')

In [None]:
print('Number of relationships:', nodes.shape[0])

In [None]:
nodes.head()

In [None]:
division_columns = ['divisionId', 'division']

In [None]:
divisions = pd.read_csv(os.path.join(CACHE,'division.dmp'), sep='\t\|\t', engine='python', 
                    usecols=[0,2], names=division_columns, header=None, dtype='str')

In [None]:
divisions.head(20)

In [None]:
nodes = nodes.merge(divisions, on='divisionId', how='left')

In [None]:
nodes.shape

In [None]:
nodes.head()

##### Restrict taxonomies to a selected set of divisions

In [None]:
#nodes = nodes[nodes['divisionId'].isin(DIVISION_IDS)]

In [None]:
nodes['id'] = 'taxonomy:' + nodes['id']
nodes['parentId'] = 'taxonomy:' + nodes['parentId']

In [None]:
nodes.shape

In [None]:
nodes = nodes.merge(names2, on='id')

In [None]:
nodes.shape

## Setup Output Directories

In [None]:
NODE_DATA = os.getenv('NODE_DATA', default='../data/nodes/')
RELATIONSHIP_DATA = os.getenv('RELATIONSHIP_DATA', default='../data/relationships/') 

In [None]:
os.makedirs(os.path.join(NODE_DATA), exist_ok=True)
os.makedirs(os.path.join(RELATIONSHIP_DATA), exist_ok=True)

In [None]:
nodes[['id','name', 'synonyms', 'scientificName', 'division', 'rank']].to_csv(os.path.join(NODE_DATA, 'Organism_ncbi_2022-03-16.csv'), index=False)

In [None]:
print('Number of nodes', nodes.shape[0])

In [None]:
relationships = nodes[['id', 'parentId']].copy()
relationships.rename(columns={'id': 'from', 'parentId': 'to'}, inplace=True)
relationships.drop_duplicates(inplace=True)
relationships = relationships[(relationships['from'] != 'taxonomy:1')]

In [None]:
relationships.query('to == "taxonomy:1"')

In [None]:
print('Number of relationships', relationships.shape[0])

In [None]:
relationships.to_csv(os.path.join(RELATIONSHIP_DATA, 'Organism-IS_A-Organism_ncbi_2022-03-16.csv'), index=False)