## Evidence Pipeline Test Data Generator

Generates input data for evidence preparation pipeline testing

NOTE: The local machine must have access to a full (or nearly full) copy of the extracted source data for this process to work

Process:
- choose genes to extract data for
- load raw evidence objects (subset of downloads from GS)
- load gene objects dumped from ES (extract/gene.json)
- subset evidence and genes to target list
- save the evidence records and the corresponding gene objects

In [1]:
import $file.^.testgeninit, testgeninit._
import $file.^.sparkinit, sparkinit._
import $file.^.pathinit, pathinit._
import $file.^.cpinit, cpinit._
import ss.implicits._
import org.apache.spark.sql.functions._
import java.nio.file.Paths
import com.relatedsciences.opentargets.etl.pipeline.SparkImplicits._

Loading spark-stubs
Creating SparkSession


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/12/11 19:35:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[32mimport [39m[36m$file.$            , testgeninit._
[39m
[32mimport [39m[36m$file.$          , sparkinit._
[39m
[32mimport [39m[36m$file.$         , pathinit._
[39m
[32mimport [39m[36m$file.$       , cpinit._
[39m
[32mimport [39m[36mss.implicits._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36mjava.nio.file.Paths
[39m
[32mimport [39m[36mcom.relatedsciences.opentargets.etl.pipeline.SparkImplicits._[39m

In [2]:
// Raw target id -> normalized Ensembl gene id, for targets whose raw id is not already
// the normalized Ensembl id
lazy val GENE_MAP_1 = {
  // Non-reference Ensembl gene ids
  val nonReferenceGenes = Map(
    "ENSG00000223532" -> "ENSG00000234745", // HLA-B
    "ENSG00000225845" -> "ENSG00000204290"  // BTNL2
  )
  // UniProt accessions
  val uniProtEntries = Map(
    "P35354" -> "ENSG00000073756",
    "P10275" -> "ENSG00000169083"
  )
  nonReferenceGenes ++ uniProtEntries
}


[36mGENE_MAP_1[39m: [32mMap[39m[[32mString[39m, [32mString[39m] = [32m[lazy][39m

In [3]:
// Parameters used throughout the rest of the notebook
val extractDir = EXTRACT_DIR
val testInputDir = TEST_PIPELINE_DIR.resolve("input")
// `genes` maps a raw target id (key) to its normalized Ensembl id (value)
val genes = {
    // Common genes (GENE_SET_1 comes from testgeninit.sc): already use standard ids,
    // so each one maps to itself
    val standardIdGenes = GENE_SET_1.map(geneId => geneId -> geneId).toMap
    // Genes w/ non-standard ids are layered on top
    standardIdGenes ++ GENE_MAP_1
}

[36mextractDir[39m: [32mjava[39m.[32mnio[39m.[32mfile[39m.[32mPath[39m = /home/eczech/data/ot/extract
[36mtestInputDir[39m: [32mjava[39m.[32mnio[39m.[32mfile[39m.[32mPath[39m = /home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input
[36mgenes[39m: [32mMap[39m[[32mString[39m, [32mString[39m] = [33mMap[39m(
  [32m"P10275"[39m -> [32m"ENSG00000169083"[39m,
  [32m"ENSG00000223532"[39m -> [32m"ENSG00000234745"[39m,
  [32m"ENSG00000105397"[39m -> [32m"ENSG00000105397"[39m,
  [32m"P35354"[39m -> [32m"ENSG00000073756"[39m,
  [32m"ENSG00000141510"[39m -> [32m"ENSG00000141510"[39m,
  [32m"ENSG00000169174"[39m -> [32m"ENSG00000169174"[39m,
  [32m"ENSG00000225845"[39m -> [32m"ENSG00000204290"[39m
)

## Load

### Raw Evidence

In [4]:
// Read the raw evidence extract and keep only records for the selected targets.
// `target.id` is a "/"-separated identifier; only its last path segment is compared
// against the raw target ids (the keys of `genes`).
val dfe = {
    val rawTargetIds = genes.keys.toSeq
    val targetIdSuffix = element_at(split($"target.id", "/"), -1)
    ss.read.json(extractDir.resolve("evidence_raw.json").toString)
        .filter(targetIdSuffix.isin(rawTargetIds: _*))
}
dfe.count()

19/12/11 19:35:42 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


[36mdfe[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [_corrupt_record: string, access_level: string ... 10 more fields]
[36mres3_1[39m: [32mLong[39m = [32m15424L[39m

In [5]:
// Evidence record count per raw target id (up to 100 rows, values not truncated)
dfe.groupBy("target.id").count().show(numRows = 100, truncate = false)

+----------------------------------------------+-----+
|id                                            |count|
+----------------------------------------------+-----+
|http://identifiers.org/ensembl/ENSG00000141510|7169 |
|http://identifiers.org/ensembl/ENSG00000169174|241  |
|http://identifiers.org/ensembl/ENSG00000105397|211  |
|http://identifiers.org/uniprot/P35354         |6850 |
|http://identifiers.org/uniprot/P10275         |870  |
|http://identifiers.org/ensembl/ENSG00000223532|54   |
|http://identifiers.org/ensembl/ENSG00000225845|29   |
+----------------------------------------------+-----+



In [23]:
import org.apache.spark.sql.expressions.Window

// Keep at most 10 evidence records for each (target id, data type) combination.
// Rows are numbered within each combination by disease id, then assertion date, and
// only numbers 1..10 survive; the helper column is dropped so the schema is unchanged.
// NOTE(review): the ordering has no further tie-breaker, so if several rows share the
// same disease id and assertion date the exact rows kept may differ between runs —
// consider adding a unique column to the ordering if reproducibility matters.
val dfes = {
    val perTargetAndType = Window
        .partitionBy("target.id", "type")
        .orderBy($"disease.id", $"evidence.date_asserted")
    val numbered = dfe.withColumn("rid", row_number().over(perTargetAndType))
    numbered
        .filter($"rid" <= 10)
        .drop("rid")
        .cache()
}
dfes.groupBy("target.id", "type").count().show(10, false)

+----------------------------------------------+-------------------+-----+
|id                                            |type               |count|
+----------------------------------------------+-------------------+-----+
|http://identifiers.org/ensembl/ENSG00000105397|affected_pathway   |10   |
|http://identifiers.org/ensembl/ENSG00000105397|animal_model       |10   |
|http://identifiers.org/ensembl/ENSG00000105397|genetic_association|10   |
|http://identifiers.org/ensembl/ENSG00000105397|genetic_literature |2    |
|http://identifiers.org/ensembl/ENSG00000105397|rna_expression     |1    |
|http://identifiers.org/ensembl/ENSG00000141510|affected_pathway   |10   |
|http://identifiers.org/ensembl/ENSG00000141510|animal_model       |10   |
|http://identifiers.org/ensembl/ENSG00000141510|genetic_association|10   |
|http://identifiers.org/ensembl/ENSG00000141510|genetic_literature |10   |
|http://identifiers.org/ensembl/ENSG00000141510|rna_expression     |10   |
+------------------------

[32mimport [39m[36morg.apache.spark.sql.expressions.Window

// Take the first 10 evidence records for each target + data_type combination
[39m
[36mdfes[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [_corrupt_record: string, access_level: string ... 10 more fields]

In [64]:
// The sampling step adds and then drops a temporary column, so the sampled frame
// must still render the same schema as the frame it was taken from
assert(dfe.schema.toString == dfes.schema.toString)

In [65]:
// Sanity check: the sampled evidence set should be small enough to serve as test input
dfes.count()

[36mres64[39m: [32mLong[39m = [32m191L[39m

### Gene Index

In [26]:
// Read the gene index extract (dumped from ES) and keep only the genes whose
// normalized Ensembl id is a value in `genes`; cached because it is counted and exported
val dfg = {
    val ensemblIds = genes.values.toSeq
    val isSelectedGene = $"ensembl_gene_id".isin(ensemblIds: _*)
    ss.read.json(extractDir.resolve("gene.json").toString)
        .filter(isSelectedGene)
        .cache()
}
dfg.count()

19/12/11 19:46:26 WARN CacheManager: Asked to cache already cached data.


[36mdfg[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [_private: struct<facets: struct<reactome: struct<pathway_code: array<string>, pathway_type_code: array<string>>>, suggestions: struct<input: array<string>, output: string ... 1 more field>>, alias_name: array<string> ... 60 more fields]
[36mres25_1[39m: [32mLong[39m = [32m7L[39m

### EFO

In [73]:
// Distinct disease ids referenced by the sampled evidence, collected to the driver.
// Defined as a `val` rather than a `def` so the distinct/collect Spark job runs once
// instead of on every reference, and so every use sees the same snapshot.
// Typed as Seq[String] (was Seq[Any]); re-run this cell if `dfes` is regenerated.
val efo_codes: Seq[String] =
    dfes.select("disease.id").distinct.collect().toSeq.map(_.getString(0))

defined [32mfunction[39m [36mefo_codes[39m

In [75]:
// Read the EFO extract and keep only the entries whose `code` is one of the disease ids
// referenced by the sampled evidence; cached because it is counted and exported
val dfefo = {
    val efoExtract = ss.read.json(extractDir.resolve("efo.json").toString)
    val isReferencedCode = $"code".isin(efo_codes: _*)
    efoExtract
        .filter(isReferencedCode)
        .cache()
}
dfefo.select("code").show(10, false)

+--------------------------------------------+
|code                                        |
+--------------------------------------------+
|http://www.orpha.net/ORDO/Orphanet_110      |
|http://www.ebi.ac.uk/efo/EFO_0004267        |
|http://www.orpha.net/ORDO/Orphanet_754      |
|http://www.orpha.net/ORDO/Orphanet_90797    |
|http://www.ebi.ac.uk/efo/EFO_0000180        |
|http://www.orpha.net/ORDO/Orphanet_481      |
|http://www.ebi.ac.uk/efo/EFO_0000305        |
|http://www.ebi.ac.uk/efo/EFO_1001422        |
|http://purl.obolibrary.org/obo/MONDO_0008734|
|http://www.orpha.net/ORDO/Orphanet_99429    |
+--------------------------------------------+
only showing top 10 rows



[36mdfefo[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [children: array<struct<code:string,label:string>>, code: string ... 10 more fields]

In [77]:
// (EFO records matched, distinct disease ids requested); a mismatch means not every
// disease id in the sampled evidence maps to exactly one record in the EFO extract
(dfefo.count(), efo_codes.length)

[36mres76[39m: ([32mLong[39m, [32mInt[39m) = ([32m85L[39m, [32m92[39m)

### Export

In [29]:
// Export the gene subset as gzip-compressed JSON under the test input directory,
// overwriting any previous export. The frame is coalesced to a single partition first.
// `path` is never reassigned, so it is a `val`.
val path = testInputDir.resolve("gene_ese.json").toString
dfg.coalesce(1).write
    .format("json")
    .mode("overwrite")
    .option("compression", "gzip")
    .save(path)
println(s"Saved ${dfg.count()} gene records to '$path'")

Saved 7 gene records to '/home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/gene_ese.json'


In [78]:
// Export the EFO subset as gzip-compressed JSON under the test input directory,
// overwriting any previous export. The frame is coalesced to a single partition first.
// `path` is never reassigned, so it is a `val`.
val path = testInputDir.resolve("efo_ese.json").toString
dfefo.coalesce(1).write
    .format("json")
    .mode("overwrite")
    .option("compression", "gzip")
    .save(path)
println(s"Saved ${dfefo.count()} efo records to '$path'")

Saved 85 efo records to '/home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/efo_ese.json'


In [66]:
// Export the sampled evidence as gzip-compressed JSON under the test input directory,
// overwriting any previous export. The frame is coalesced to a single partition first.
// `path` is never reassigned, so it is a `val`.
val path = testInputDir.resolve("evidence_raw.json").toString
dfes.coalesce(1).write
    .format("json")
    .mode("overwrite")
    .option("compression", "gzip")
    .save(path)
println(s"Saved ${dfes.count()} evidence records to '$path'")

Saved 191 evidence records to '/home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/evidence_raw.json'
