In [1]:
import os
import dask.dataframe as dd
from dask.distributed import Client, progress
import time

If the `LOCAL_SCRATCH_DIR` environment variable is not set, this notebook uses the `../data` directory for temporary files.

In [2]:
# Resolve the data directory: use LOCAL_SCRATCH_DIR when it is set, otherwise ../data.
DATA_DIR = os.environ.get("LOCAL_SCRATCH_DIR", "../data")

# Full path of the tab-separated gene_info file inside that directory.
filename = os.path.join(DATA_DIR, "gene_info.tsv")
print("Filename:", filename)

Filename: ../data/gene_info.tsv


In [3]:
# Record the wall-clock start time; a later cell subtracts it from `end` to report the elapsed time.
# NOTE(review): time.perf_counter() is generally better suited to measuring durations, but the
# `end` cell would have to be changed together with this one.
start = time.time()

In [4]:
# Only these columns are loaded; "#tax_id" is spelled as the header is requested from the file
# and is renamed to "tax_id" right after reading.
column_names = ["GeneID", "Symbol", "Synonyms", "description", "type_of_gene", "#tax_id"]

# Lazily read the tab-separated file with every selected column typed as str.
genes = dd.read_csv(
    filename,
    sep="\t",
    usecols=column_names,
    dtype=str,
    blocksize="0.25 GB",
).rename(columns={"#tax_id": "tax_id"})

In [5]:
# Keep only the rows whose type_of_gene is exactly 'protein-coding'.
is_protein_coding = genes["type_of_gene"] == "protein-coding"
genes = genes[is_protein_coding]

In [6]:
# Count the remaining rows for each tax_id, then turn the group key back into a regular column.
rows_per_tax_id = genes.groupby("tax_id").size()
groups = rows_per_tax_id.reset_index()

In [7]:
# Name the two columns produced by groupby("tax_id").size().reset_index():
# the group key first, then the per-group row count. The assignment is positional.
groups.columns = ["tax_id", "count"]

In [8]:
# Order the tax_ids from the largest count to the smallest.
groups = groups.sort_values(by="count", ascending=False)

Convert the Dask dataframe to a pandas dataframe (this triggers the computation).

In [9]:
# Materialize the lazy Dask result as a pandas DataFrame using the "processes" scheduler.
# The hasattr guard keeps this cell safe to re-run: once `groups` has been rebound to a
# pandas DataFrame it no longer has .compute(), and calling it again would raise AttributeError.
if hasattr(groups, "compute"):
    groups = groups.compute(scheduler="processes")

### Number of human protein-coding genes (tax_id = 9606)

In [10]:
# tax_id values are strings (the file was read with dtype=str), so compare against '9606' as text.
groups[groups["tax_id"] == "9606"]

Unnamed: 0,tax_id,count
1441,9606,20596


### Top 5 organisms with the most protein-coding genes

In [11]:
# `groups` is sorted by count in descending order, so the first five rows are the top five.
groups.head(n=5)

Unnamed: 0,tax_id,count
675,4565,104033
367,3708,96995
6762,90675,82686
6914,94328,68154
342,3635,67632


In [12]:
# Record the wall-clock end time for the run that began at `start`.
end = time.time()

In [13]:
# Report the elapsed wall-clock time, in seconds, between the `start` and `end` timestamps.
print(f"Dask: {end - start} sec.")

Dask: 41.873300313949585 sec.
