In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import col, asc,desc
import time

If the `LOCAL_SCRATCH_DIR` environment variable is not set, this notebook falls back to the `../data` directory to locate its data file (`gene_info.tsv`).

In [2]:
# Directory that holds the input data. LOCAL_SCRATCH_DIR takes precedence when it
# is set; an unset *or empty* variable falls back to ../data, so an exported-but-blank
# variable does not silently resolve the file against the current working directory.
DATA_DIR = os.getenv("LOCAL_SCRATCH_DIR") or "../data"

# Full path of the tab-separated gene table read by the cells below.
filename = os.path.join(DATA_DIR, "gene_info.tsv")
print("Filename:", filename)

Filename: ../data/gene_info.tsv


In [3]:
# Wall-clock timestamp taken before any Spark work; the final cell subtracts it
# from `end` to report the total runtime.
start = time.time()

In [4]:
# Create the Spark session used by this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SparkDataframe")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Columns kept from the raw file. "#tax_id" is spelled as in the file header and
# is renamed to "tax_id" once the data is loaded.
column_names = [
    "GeneID",
    "Symbol",
    "Synonyms",
    "description",
    "type_of_gene",
    "#tax_id",
]

# Read the tab-separated file (first row is the header), keep only the columns
# listed above, and drop the leading "#" from the taxonomy column name.
genes = (
    spark.read
    .option("header", "true")
    .option("sep", "\t")
    .csv(filename)
    .select(column_names)
    .withColumnRenamed("#tax_id", "tax_id")
)

                                                                                

In [6]:
# Keep only the rows whose gene type is protein-coding.
protein_coding_condition = "type_of_gene == 'protein-coding'"
genes = genes.where(protein_coding_condition)

In [7]:
# Count the remaining genes per taxonomy id.
groups = genes.groupBy("tax_id").count()

In [8]:
# Name the two result columns explicitly: the grouping key and the per-group row count.
groups = groups.toDF("tax_id", "count")

In [9]:
# Order the taxonomy ids from the largest gene count to the smallest.
groups = groups.orderBy(desc("count"))

Convert the Spark DataFrame to a pandas DataFrame. The transformations above are evaluated lazily, so this is the step that actually triggers the computation.

In [10]:
# Materialize the Spark result as a pandas DataFrame for the inspection cells below.
# NOTE(review): `groups` is rebound from a Spark DataFrame to a pandas DataFrame here,
# so re-running this cell on its own will presumably fail (the pandas object is not
# expected to offer toPandas) — confirm whether a separate name would be preferable.
groups = groups.toPandas()

                                                                                

### Number of human protein-coding genes (tax_id = 9606)

In [11]:
# tax_id is compared as a string, matching the quoted literal in the query.
human_query = "tax_id == '9606'"
groups.query(human_query)

Unnamed: 0,tax_id,count
417,9606,20596


### Top 5 organisms with the most protein-coding genes

In [12]:
# The frame is already sorted by descending count, so the first five rows are the top five.
groups.head(n=5)

Unnamed: 0,tax_id,count
0,4565,104033
1,3708,96995
2,90675,82686
3,94328,68154
4,3635,67632


In [13]:
# Shut down the Spark session; the result was already collected into pandas above.
spark.stop()

In [14]:
# Wall-clock timestamp taken after the session has been stopped, so the reported
# runtime includes Spark shutdown.
end = time.time()

In [15]:
# Report the total wall-clock time between the `start` and `end` timestamps.
elapsed = end - start
print("Spark:", elapsed, "sec.")

Spark: 51.268269062042236 sec.
