In [1]:
import os
import pandas as pd
import time

If the `LOCAL_SCRATCH_DIR` environment variable is not set, this notebook uses the `../data` directory for temporary files.

In [2]:
# Resolve the data directory: the LOCAL_SCRATCH_DIR environment variable is
# used when present, otherwise the relative ../data directory.
DATA_DIR = os.environ.get("LOCAL_SCRATCH_DIR", "../data")

# Full path of the gene_info file that the load cell reads.
filename = os.path.join(DATA_DIR, "gene_info.tsv")
print("Filename:", filename)

# Size on disk in bytes; printed in GB (bytes / 1E9).
file_size = os.stat(filename).st_size
print(f"File Size: {file_size/1E9:.1f} GB")

Filename: ../data/gene_info.tsv
File Size: 5.4 GB


In [3]:
# Wall-clock timestamp taken before the load; the final cell prints `end - start`.
start = time.time()

In [4]:
# Columns to keep from the tab-separated file. The taxonomy column is selected
# under the name "#tax_id" and renamed right after loading.
column_names = [
    "GeneID",
    "Symbol",
    "Synonyms",
    "description",
    "type_of_gene",
    "#tax_id",
]

# Every selected column is read as a string (dtype=str). The human-gene lookup
# later in the notebook compares tax_id with the string '9606', which relies on this.
genes = pd.read_csv(filename, sep="\t", dtype=str, usecols=column_names)

# Drop the leading "#" so the column can be referenced as plain "tax_id".
genes.rename({"#tax_id": "tax_id"}, axis="columns", inplace=True)

In [5]:
# Per-column pandas memory usage (uncomment the next line to display it)
#genes.memory_usage(deep=True)

In [6]:
# Sum the per-column byte counts reported by memory_usage(deep=True) and print
# the total in GB (bytes / 1E9).
print(f"Total memory: {genes.memory_usage(deep=True).sum()/1E9} GB")

Total memory: 15.613361037 GB


In [7]:
# Keep only the rows whose type_of_gene equals 'protein-coding'. `genes` refers
# to this subset from here on.
genes = genes.loc[genes["type_of_gene"] == "protein-coding"]

In [None]:
# Count the rows for each tax_id. The result is a DataFrame with one row per
# tax_id and the tally in a "count" column.
groups = (
    genes.groupby("tax_id")
    .size()
    .reset_index(name="count")
)

In [None]:
# Order the per-tax_id counts from largest to smallest.
groups = groups.sort_values(by="count", ascending=False)

### Number of human protein-coding genes (tax_id = 9606)

In [None]:
# Show the row for tax_id 9606. tax_id values are strings, so the comparison
# uses a string literal.
groups[groups["tax_id"] == "9606"]

### Top 5 organisms with the most protein-coding genes

In [None]:
# Display the first five rows of `groups`, i.e. the five largest counts once
# the frame has been sorted in descending order.
groups.head(5)

In [None]:
# Wall-clock timestamp taken after the analysis cells; paired with `start`.
end = time.time()

In [None]:
# Print the elapsed seconds between the `start` and `end` timestamps, to one decimal place.
print(f"Pandas: {end - start:.1f} sec.")