# NCBI Genes - Prepare
**[Work in progress]**

https://ftp.ncbi.nlm.nih.gov/gene/README

https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz

https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz


Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path
from functools import reduce

In [2]:
# Lift pandas' display limits so that every row and every column is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Data locations: each one can be overridden through an environment variable,
# otherwise the relative default directory is used.
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')
CACHE = os.environ.get('KGBIO_CACHE', '../cache')

In [4]:
# Ensure the cache directory exists (no error if it is already there).
os.makedirs(CACHE, exist_ok=True)

### Import NCBI Gene Info

In [5]:
# Columns of gene_info.gz that are loaded; all other columns are skipped.
column_names = [
    '#tax_id',
    'GeneID',
    'Symbol',
    'Synonyms',
    'description',
    'type_of_gene',
    'Symbol_from_nomenclature_authority',
]

In [6]:
# Load the selected gene_info columns as strings from the cached, tab-separated file.
# pandas' default NA parsing is switched off: with the defaults, literal field values
# such as 'NA', 'nan' or 'null' (which can be legitimate gene symbols or synonyms)
# would silently be converted to missing values (NaN) and lost.
genes = pd.read_csv(
    os.path.join(CACHE, 'gene_info.gz'),
    usecols=column_names,
    dtype=str,
    sep='\t',
    keep_default_na=False,
)

In [7]:
# Map the NCBI column headers to the property names used in the output files.
property_names = {
    '#tax_id': 'taxonomyId',
    'GeneID': 'id',
    'Symbol': 'name',
    'Synonyms': 'synonyms',
    'type_of_gene': 'geneType',
    'Symbol_from_nomenclature_authority': 'officialSymbol',
}
genes.rename(columns=property_names, inplace=True)

In [8]:
# Cells whose entire value is '-' are replaced by an empty string.
genes.replace(to_replace='-', value='', inplace=True)

In [9]:
# Show the distinct gene type values present in the data.
gene_types = genes['geneType'].unique()
gene_types

array(['other', 'protein-coding', 'pseudo', 'rRNA', 'tRNA', 'miscRNA',
       'ncRNA', 'unknown', 'snRNA', 'snoRNA', 'scRNA',
       'biological-region'], dtype=object)

In [10]:
# Prepend the namespace prefix to each identifier column.
for column, prefix in (('id', 'ncbigene:'), ('taxonomyId', 'taxonomy:')):
    genes[column] = prefix + genes[column]

## Setup Output Directories

In [11]:
# Output directories for node and relationship files (overridable via environment variables).
NODE_DATA = os.getenv('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.getenv('RELATIONSHIP_DATA', '../data/relationships/')

In [12]:
# Create both output directories if they do not exist yet.
for output_dir in (NODE_DATA, RELATIONSHIP_DATA):
    os.makedirs(output_dir, exist_ok=True)

In [13]:
# Report how many gene rows were loaded.
print('Number of nodes', len(genes))

Number of nodes 37373059


In [17]:
# Preview up to 1000 rows whose taxonomyId is 'taxonomy:9606'.
genes.loc[genes['taxonomyId'] == 'taxonomy:9606'].head(1000)

Unnamed: 0,taxonomyId,id,name,synonyms,description,geneType,officialSymbol
9397939,taxonomy:9606,ncbigene:1,A1BG,A1B|ABG|GAB|HYST2477,alpha-1-B glycoprotein,protein-coding,A1BG
9397940,taxonomy:9606,ncbigene:2,A2M,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin,protein-coding,A2M
9397941,taxonomy:9606,ncbigene:3,A2MP1,A2MP,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1
9397942,taxonomy:9606,ncbigene:9,NAT1,AAC1|MNAT|NAT-1|NATI,N-acetyltransferase 1,protein-coding,NAT1
9397943,taxonomy:9606,ncbigene:10,NAT2,AAC2|NAT-2|PNAT,N-acetyltransferase 2,protein-coding,NAT2
9397944,taxonomy:9606,ncbigene:11,NATP,AACP|NATP1,N-acetyltransferase pseudogene,pseudo,NATP
9397945,taxonomy:9606,ncbigene:12,SERPINA3,AACT|ACT|GIG24|GIG25,serpin family A member 3,protein-coding,SERPINA3
9397946,taxonomy:9606,ncbigene:13,AADAC,CES5A1|DAC,arylacetamide deacetylase,protein-coding,AADAC
9397947,taxonomy:9606,ncbigene:14,AAMP,,angio associated migratory cell protein,protein-coding,AAMP
9397948,taxonomy:9606,ncbigene:15,AANAT,DSPS|SNAT,aralkylamine N-acetyltransferase,protein-coding,AANAT


In [None]:
# Write the gene nodes, in this column order, to a brotli-compressed Parquet file.
node_columns = ['id', 'name', 'synonyms', 'description', 'officialSymbol', 'geneType', 'taxonomyId']
node_file = os.path.join(NODE_DATA, 'Gene_ncbi-2022-03-14.parquet')
genes[node_columns].to_parquet(node_file, compression='brotli', index=False)

In [None]:
# Remove the descriptive columns in place; only 'taxonomyId' and 'id' remain afterwards.
descriptive_columns = ['name', 'synonyms', 'description', 'officialSymbol', 'geneType']
genes.drop(columns=descriptive_columns, inplace=True)

In [None]:
# Build the relationship table: one unique (from = taxonomyId, to = gene id) pair per row.
# A non-mutating method chain is used instead of in-place rename/drop_duplicates on a
# column subset of `genes`, which would trigger pandas' SettingWithCopyWarning.
relationships = (
    genes[['taxonomyId', 'id']]
    .rename(columns={'taxonomyId': 'from', 'id': 'to'})
    .drop_duplicates()
)

In [None]:
# Preview the first five relationship rows.
relationships.head(n=5)

In [None]:
# Report how many relationship rows will be written.
print('Number of relationships', len(relationships))

In [None]:
# Write the 'from'/'to' pairs to a brotli-compressed Parquet file.
relationship_file = os.path.join(RELATIONSHIP_DATA, 'Organism-ENCODES-Gene_ncbi-2022-03-14.parquet')
relationships[['from', 'to']].to_parquet(relationship_file, compression='brotli', index=False)