# NCBI Genes - Filter
**[Work in progress]**

https://ftp.ncbi.nlm.nih.gov/gene/README

https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz

https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz


Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
# Lift pandas' display truncation so frames are rendered in full.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Directories used by this notebook. Each one is taken from the environment
# variable named in the lookup, falling back to the literal shown when unset.
# NOTE(review): the NODE_DATA / RELATIONSHIP_DATA fallbacks are absolute paths
# on one particular machine; set the environment variables to run elsewhere.
NODE_DATA = os.environ.get(
    'NODE_DATA',
    '/Users/Peter/GitRepositories/dkg_data_v0.1.2_2022-03-16/v0.1.2/nodes',
)
RELATIONSHIP_DATA = os.environ.get(
    'RELATIONSHIP_DATA',
    '/Users/Peter/GitRepositories/dkg_data_v0.1.2_2022-03-16/v0.1.2/relationships',
)
CACHED_NODE_DATA = os.environ.get('KGBIO_NODE_DATA', '../data/nodes/')
CACHED_RELATIONSHIP_DATA = os.environ.get('KGBIO_RELATIONSHIP_DATA', '../data/relationships/')
CACHE = os.environ.get('KGBIO_CACHE', '../cache')

In [4]:
# Make sure the cache directory exists; an already-present directory is fine.
os.makedirs(CACHE, exist_ok=True)

In [5]:
# Read the DataSet-MENTIONS-Gene relationship CSV, keeping every column as a string.
rels = pd.read_csv(
    os.path.join(RELATIONSHIP_DATA, 'DataSet-MENTIONS-Gene_figshare_2021-11-17.csv'),
    dtype=str,
)

In [6]:
# Preview the first five relationship rows.
rels.head(5)

Unnamed: 0,from,to
0,https://springernature.figshare.com/articles/d...,ncbigene:57192
1,https://springernature.figshare.com/articles/d...,ncbigene:56649
2,https://springernature.figshare.com/articles/d...,ncbigene:59272
3,https://tandf.figshare.com/articles/dataset/Em...,ncbigene:1803
4,https://tandf.figshare.com/articles/dataset/Em...,ncbigene:43740568


In [7]:
# Distinct values of the relationships' 'to' column, exposed as a one-column
# frame named 'id' so it can be joined against the gene table.
ids = (
    rels[['to']]
    .rename(columns={'to': 'id'})
    .drop_duplicates()
)

In [8]:
# Report how many distinct ids were collected, then preview the first five.
print('Number of ids:', len(ids))
ids.head(5)

Number of ids: 759


Unnamed: 0,id
0,ncbigene:57192
1,ncbigene:56649
2,ncbigene:59272
3,ncbigene:1803
4,ncbigene:43740568


In [9]:
# Load the cached gene node table from its parquet file.
genes = pd.read_parquet(
    os.path.join(CACHED_NODE_DATA, 'Gene_ncbi-2022-03-14.parquet')
)

In [10]:
# Show five randomly chosen gene rows as a spot check.
genes.sample(n=5)

Unnamed: 0,id,name,synonyms,description,officialSymbol,geneType,taxonomyId
10198225,ncbigene:111774539,LOC111774539,,uncharacterized LOC111774539,,ncRNA,taxonomy:9796
35918762,ncbigene:115612703,CHIC1,,cysteine rich hydrophobic domain 1,,protein-coding,taxonomy:2489341
22759637,ncbigene:101627707,FHIP2B,FAM160B2,FHF complex subunit HOOK interacting protein 2B,,protein-coding,taxonomy:143302
32978098,ncbigene:113639044,LOC113639044,,tetraspanin-31-B-like,,protein-coding,taxonomy:1234273
26517167,ncbigene:112507666,LOC112507666,,acidic leucine-rich nuclear phosphoprotein 32 ...,,protein-coding,taxonomy:309979


In [11]:
# Keep only the genes whose 'id' appears in `ids`. 'id' is the single column
# the two frames share, so it is the join key; ids absent from `genes` are
# dropped by the inner join.
nodes_filtered = pd.merge(genes, ids, how='inner', on='id')

In [12]:
# Show five randomly chosen rows of the filtered gene table.
nodes_filtered.sample(n=5)

Unnamed: 0,id,name,synonyms,description,officialSymbol,geneType,taxonomyId
271,ncbigene:4070,TACSTD2,EGP-1|EGP1|GA733-1|GA7331|GP50|M1S1|TROP2,tumor associated calcium signal transducer 2,TACSTD2,protein-coding,taxonomy:9606
331,ncbigene:5367,PMCH,MCH|ppMCH,pro-melanin concentrating hormone,PMCH,protein-coding,taxonomy:9606
315,ncbigene:5005,ORM2,AGP-B|AGP-B'|AGP2,orosomucoid 2,ORM2,protein-coding,taxonomy:9606
368,ncbigene:6253,RTN2,NSP2|NSPL1|NSPLI|SPG12,reticulon 2,RTN2,protein-coding,taxonomy:9606
646,ncbigene:94025,MUC16,CA125,"mucin 16, cell surface associated",MUC16,protein-coding,taxonomy:9606


In [13]:
# Row count after filtering; compare with the number of ids printed earlier.
print('Nodes filtered:', len(nodes_filtered))

Nodes filtered: 754


## Set Up Output Directories

In [14]:
# Output directories for the filtered files.
# NOTE(review): this rebinds NODE_DATA / RELATIONSHIP_DATA, which the
# configuration cell near the top also assigns (with different fallbacks), so
# re-running earlier load cells after this one would use these values instead.
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')

In [15]:
# Create both output directories if they are missing; existing ones are left alone.
os.makedirs(NODE_DATA, exist_ok=True)
os.makedirs(RELATIONSHIP_DATA, exist_ok=True)

In [16]:
# Number of gene nodes about to be written out.
print('Number of nodes', len(nodes_filtered))

Number of nodes 754


In [17]:
# Write the filtered gene nodes as CSV, omitting the DataFrame index.
nodes_filtered.to_csv(
    os.path.join(NODE_DATA, 'Gene_ncbi-2022-03-14.csv'),
    index=False,
)

In [None]:
# Build the edge list: one distinct (taxonomyId, id) pair per row, with the
# columns renamed to 'from' and 'to'.
relationships = (
    nodes_filtered[['taxonomyId', 'id']]
    .drop_duplicates()
    .rename(columns={'taxonomyId': 'from', 'id': 'to'})
)

In [None]:
# Number of edges about to be written out.
print('Number of relationships', len(relationships))

In [None]:
# Write the edge list as CSV, omitting the DataFrame index.
relationships.to_csv(
    os.path.join(RELATIONSHIP_DATA, 'Organism-ENCODES-Gene_ncbi-2022-03-14.csv'),
    index=False,
)