In [1]:
import os
import pandas as pd
import time

If LOCAL_SCRATCH_DIR environment variable is not set, this notebook accesses the ../data directory for temporary files.

In [2]:
DATA_DIR = os.getenv("LOCAL_SCRATCH_DIR", default="../data")
#filename = os.path.join(DATA_DIR, "gene_info.tsv")
filename = os.path.join(DATA_DIR, "gene_info.parquet")
print("Filename:", filename)
file_size = os.path.getsize(filename)
print(f"File Size: {file_size/1E9:.1f} GB")

Filename: ../data/gene_info.parquet
File Size: 0.0 GB


In [3]:
start = time.time()

In [4]:
column_names = ["GeneID", "Symbol", "Synonyms", "description", "type_of_gene", "#tax_id", "chromosome"]

#genes = pd.read_csv(filename, usecols=column_names, dtype={'tax_id': int, 'GeneID': int, 'type_of_gene': 'category'}, sep="\t")
#genes = pd.read_csv(filename, usecols=column_names, dtype=str, sep="\t")
genes = pd.read_parquet(filename, columns=column_names)
genes.rename(columns={"#tax_id": "tax_id"}, inplace=True)

In [5]:
# Pandas memory usage
#genes.memory_usage(deep=True)

In [6]:
print(f"Total memory: {genes.memory_usage(deep=True).sum()/1E9} GB")

Total memory: 18.278759474 GB


In [7]:
genes.query("type_of_gene == 'protein-coding'", inplace=True)

In [8]:
genes.head()

Unnamed: 0_level_0,GeneID,Symbol,Synonyms,description,type_of_gene,tax_id,chromosome
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,72485293,dnaA,MZ182_00005,chromosomal replication initiator protein DnaA,protein-coding,24,-
7,72485294,dnaN,MZ182_00010,DNA polymerase III subunit beta,protein-coding,24,-
8,72485295,recF,MZ182_00015,DNA replication/repair protein RecF,protein-coding,24,-
9,72485296,gyrB,MZ182_00020,DNA topoisomerase (ATP-hydrolyzing) subunit B,protein-coding,24,-
10,72485297,MZ182_RS00025,MZ182_00025,HDOD domain-containing protein,protein-coding,24,-


In [9]:
groups = genes.groupby(["tax_id"]).size().reset_index(name="count")

In [10]:
groups = groups.sort_values("count", ascending=False)

### Number of human protein-coding genes (tax_id = 9606)

In [11]:
groups.query("tax_id == '9606'")

Unnamed: 0,tax_id,count
35945,9606,20597


### Top 5 organisms with the most protein-coding genes

In [12]:
groups.head()

Unnamed: 0,tax_id,count
27772,4565,104033
25343,3708,96995
35274,90675,82686
35766,94328,68154
25056,3635,67632


In [13]:
end = time.time()

In [14]:
print(f"Pandas: {end - start:.1f} sec.")

Pandas: 95.1 sec.
