In [None]:
import os
import cudf
import time

If the LOCAL_SCRATCH_DIR environment variable is not set, this notebook uses the ../data directory for temporary files.

In [None]:
# Directory holding the data files: LOCAL_SCRATCH_DIR if set, otherwise ../data.
DATA_DIR = os.environ.get("LOCAL_SCRATCH_DIR", "../data")
# Full path of the gene_info.tsv file inside that directory.
filename = os.path.join(DATA_DIR, "gene_info.tsv")
print("Filename:", filename)

In [None]:
# Wall-clock start time; the final cell subtracts this from `end` to report elapsed seconds.
start = time.time()

In [None]:
# Only these columns of the TSV file are loaded; all of them are read as strings.
column_names = ["GeneID", "Symbol", "Synonyms", "description", "type_of_gene", "#tax_id"]

genes = cudf.read_csv(filename, usecols=column_names, dtype=str, sep="\t")
# Strip the leading "#" from the header so later cells can refer to "tax_id".
# The result must be bound back to `genes`: it was previously assigned to a
# misspelled name, which left `genes` without a "tax_id" column.
genes = genes.rename(columns={"#tax_id": "tax_id"})

In [None]:
# cuDF does not support querying by string, so select rows with a boolean mask instead.
is_protein_coding = genes["type_of_gene"] == "protein-coding"
genes = genes[is_protein_coding]

In [None]:
# Count the rows for each tax_id, then move the grouping key out of the index.
genes_by_taxon = genes.groupby("tax_id")
groups = genes_by_taxon.size().reset_index()

In [None]:
# Label the two columns: the grouping key and the per-group row count.
groups.columns = ["tax_id", "count"]

In [None]:
# Order by gene count, largest first.
groups = groups.sort_values(by="count", ascending=False)

### Number of human protein-coding genes (tax_id = 9606)

In [None]:
# The gene table was read with dtype=str, so compare tax_id against the string "9606".
is_human = groups["tax_id"] == "9606"
groups[is_human]

### Top 5 organisms with the most protein-coding genes

In [None]:
# Show the first five rows of the sorted counts.
groups.head(5)

In [None]:
# Wall-clock end time, taken after the cells above have run.
end = time.time()

In [None]:
# Report the elapsed wall-clock time between the two timestamps.
elapsed = end - start
print(f"cuDF: {elapsed} sec.")