# NCBI Genes
**[Work in progress]**

https://ftp.ncbi.nlm.nih.gov/gene/README

https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz

https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz


Data source: [NCBI](https://www.ncbi.nlm.nih.gov)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path
from functools import reduce

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
# Path will take care of handling operating system differences.
NODE_DATA = os.getenv('NODE_DATA', default='../data/nodes/')
RELATIONSHIP_DATA = os.getenv('RELATIONSHIP_DATA', default='../data/relationships/')
CACHE = os.getenv('KGBIO_CACHE', default='../cache')

In [4]:
os.makedirs(os.path.join(CACHE), exist_ok=True)

In [5]:
url = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz'

In [6]:
# url = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz' # ImportError: Missing optional dependency 'fsspec'.  Use pip or conda to install fsspec.

In [7]:
import shutil
import requests

def download_file(url, folder_name):
    local_filename = url.split('/')[-1]
    path = os.path.join("{}/{}".format(folder_name, local_filename))
    with requests.get(url, stream=True) as r:
        with open(path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

In [8]:
download_file(url, CACHE)

'gene_info.gz'

### Import NCBI Taxonomy Names

In [9]:
column_names = ['#tax_id', 'GeneID', 'Symbol', 'Synonyms', 'description', 'type_of_gene', 'Symbol_from_nomenclature_authority']

In [10]:
import dask.dataframe as dd
genes = dd.read_csv(os.path.join(CACHE, 'gene_info.gz'), usecols=column_names, sep='\t', dtype=str)

Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  "Setting ``blocksize=None``" % compression


In [12]:
#genes = genes.repartition(partition_size="100MB") # generates of 300 partitions (< 10 mb) ???
genes = genes.repartition(npartitions=8)

In [13]:
genes.to_parquet(os.path.join(CACHE, 'gene_info.parquet'), write_index=False)

(None,)

In [14]:
# https://mungingdata.com/dask/read-csv-to-parquet/

In [15]:
#genes = pd.read_csv(os.path.join(CACHE, 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz'), dtype=str, sep='\t')

In [16]:
column_names = ['#tax_id', 'GeneID', 'Symbol', 'Synonyms', 'description', 'type_of_gene', 'Symbol_from_nomenclature_authority']

In [17]:
#genes = pd.read_csv(os.path.join(CACHE, 'gene_info.gz'), usecols=column_names, dtype=str, sep='\t')

In [18]:
genes.head()

Unnamed: 0,#tax_id,GeneID,Symbol,Synonyms,description,type_of_gene,Symbol_from_nomenclature_authority
0,7,5692769,NEWENTRY,-,Record to support submission of GeneRIFs for a...,other,-
1,9,2827857,NEWENTRY,-,Record to support submission of GeneRIFs for a...,other,-
2,11,10823747,NEWENTRY,-,Record to support submission of GeneRIFs for a...,other,-
3,14,6951813,NEWENTRY,-,Record to support submission of GeneRIFs for a...,other,-
4,19,3758873,NEWENTRY,-,Record to support submission of GeneRIFs for a...,other,-


In [19]:
genes.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 7 entries, #tax_id to Symbol_from_nomenclature_authority
dtypes: object(7)

In [20]:
genes.query('GeneID == "59272"') #GeneID for ACE2

Unnamed: 0_level_0,#tax_id,GeneID,Symbol,Synonyms,description,type_of_gene,Symbol_from_nomenclature_authority
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,object,object,object,object,object,object,object
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [21]:
genes.compute()


KeyboardInterrupt

