## Evidence Pipeline Test Data Generator

Generates input data for evidence preparation pipeline testing

NOTE: The local machine running this notebook needs access to a full (or nearly full) copy of the extracted data for this process to work

Process:
- choose genes to extract data for
- load raw evidence objects (subset of downloads from GS)
- load gene objects dumped from ES (extract/gene.json)
- subset evidence and genes to target list
- save the evidence records, the corresponding gene objects, and the EFO records for the diseases referenced by those evidence records

In [1]:
import $file.^.testgeninit, testgeninit._
import $file.^.sparkinit, sparkinit._
import $file.^.pathinit, pathinit._
import $file.^.cpinit, cpinit._
import ss.implicits._
import org.apache.spark.sql.functions._
import java.nio.file.Paths
import com.relatedsciences.opentargets.etl.pipeline.SparkImplicits._

Compiling /home/eczech/repos/ot-scoring/notebooks/testgeninit.scCompiling /home/eczech/repos/ot-scoring/notebooks/sparkinit.scLoading spark-stubs
Creating SparkSession


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/12/12 13:03:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Compiling /home/eczech/repos/ot-scoring/notebooks/pathinit.scCompiling /home/eczech/repos/ot-scoring/notebooks/cpinit.sc

[32mimport [39m[36m$file.$            , testgeninit._
[39m
[32mimport [39m[36m$file.$          , sparkinit._
[39m
[32mimport [39m[36m$file.$         , pathinit._
[39m
[32mimport [39m[36m$file.$       , cpinit._
[39m
[32mimport [39m[36mss.implicits._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._
[39m
[32mimport [39m[36mjava.nio.file.Paths
[39m
[32mimport [39m[36mcom.relatedsciences.opentargets.etl.pipeline.SparkImplicits._[39m

In [2]:
// Raw target ids that do not match the Ensembl gene id they should resolve to
// (key = raw target id, value = normalized Ensembl gene id)
lazy val GENE_MAP_1: Map[String, String] = {
  // Non-reference genes
  val nonReferenceGenes = Map(
    "ENSG00000223532" -> "ENSG00000234745", // HLA-B
    "ENSG00000225845" -> "ENSG00000204290"  // BTNL2
  )
  // UniProt entries
  val uniprotEntries = Map(
    "P35354" -> "ENSG00000073756",
    "P10275" -> "ENSG00000169083"
  )
  nonReferenceGenes ++ uniprotEntries
}


[36mGENE_MAP_1[39m: [32mMap[39m[[32mString[39m, [32mString[39m] = [32m[lazy][39m

In [3]:
// Initialize all parameters utilized
val extractDir = EXTRACT_DIR
val testInputDir = TEST_PIPELINE_DIR.resolve("input")
// Map[k,v] where k is raw target id and v is normalized ensembl id
val genes = { // gene sets come from testgeninit.sc
    // Common genes (w/ standard ids): each id maps to itself
    val standardGenes = GENE_SET_1.map(id => id -> id).toMap
    // Genes w/ non-standard ids
    standardGenes ++ GENE_MAP_1
}

[36mextractDir[39m: [32mjava[39m.[32mnio[39m.[32mfile[39m.[32mPath[39m = /home/eczech/data/ot/extract
[36mtestInputDir[39m: [32mjava[39m.[32mnio[39m.[32mfile[39m.[32mPath[39m = /home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input
[36mgenes[39m: [32mMap[39m[[32mString[39m, [32mString[39m] = [33mMap[39m(
  [32m"P10275"[39m -> [32m"ENSG00000169083"[39m,
  [32m"ENSG00000223532"[39m -> [32m"ENSG00000234745"[39m,
  [32m"ENSG00000105397"[39m -> [32m"ENSG00000105397"[39m,
  [32m"P35354"[39m -> [32m"ENSG00000073756"[39m,
  [32m"ENSG00000141510"[39m -> [32m"ENSG00000141510"[39m,
  [32m"ENSG00000169174"[39m -> [32m"ENSG00000169174"[39m,
  [32m"ENSG00000225845"[39m -> [32m"ENSG00000204290"[39m
)

## Load

### Raw Evidence

In [4]:
// Load the raw evidence extract, keeping only records whose target id
// (the last "/"-separated segment of target.id) is one of the raw ids in `genes`
// NOTE(review): the inferred schema in the output below includes a `_corrupt_record`
// column, which suggests some input lines may not have parsed as JSON -- confirm
val dfe = {
    val rawTargetIds = genes.keys.toSeq
    val targetIdSuffix = element_at(split($"target.id", "/"), -1)
    ss.read
        .json(extractDir.resolve("evidence_raw.json").toString)
        .filter(targetIdSuffix.isin(rawTargetIds: _*))
}
dfe.count()

19/12/12 13:03:44 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


[36mdfe[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [_corrupt_record: string, access_level: string ... 10 more fields]
[36mres3_1[39m: [32mLong[39m = [32m15424L[39m

In [5]:
// Number of raw evidence records per target id (untruncated, up to 100 rows)
dfe.groupBy($"target.id").count().show(numRows = 100, truncate = false)

+----------------------------------------------+-----+
|id                                            |count|
+----------------------------------------------+-----+
|http://identifiers.org/ensembl/ENSG00000141510|7169 |
|http://identifiers.org/ensembl/ENSG00000169174|241  |
|http://identifiers.org/ensembl/ENSG00000105397|211  |
|http://identifiers.org/uniprot/P35354         |6850 |
|http://identifiers.org/uniprot/P10275         |870  |
|http://identifiers.org/ensembl/ENSG00000223532|54   |
|http://identifiers.org/ensembl/ENSG00000225845|29   |
+----------------------------------------------+-----+



In [6]:
import org.apache.spark.sql.expressions.Window

// Keep at most 10 evidence records for each target + data_type combination,
// ranked by disease id and then by assertion date within each combination
// NOTE(review): records that tie on both ordering columns may be ranked in an
// arbitrary order, so the selected subset may vary between runs -- confirm
val dfes = {
    val perTargetAndType = Window
        .partitionBy("target.id", "type")
        .orderBy($"disease.id", $"evidence.date_asserted")
    dfe
        .withColumn("rid", row_number().over(perTargetAndType))
        .filter($"rid" <= 10)
        .drop("rid") // the rank is only a helper; drop it so the schema matches dfe
        .cache()
}
dfes.groupBy("target.id", "type").count().show(numRows = 10, truncate = false)

+----------------------------------------------+-------------------+-----+
|id                                            |type               |count|
+----------------------------------------------+-------------------+-----+
|http://identifiers.org/ensembl/ENSG00000105397|affected_pathway   |10   |
|http://identifiers.org/ensembl/ENSG00000105397|animal_model       |10   |
|http://identifiers.org/ensembl/ENSG00000105397|genetic_association|10   |
|http://identifiers.org/ensembl/ENSG00000105397|genetic_literature |2    |
|http://identifiers.org/ensembl/ENSG00000105397|rna_expression     |1    |
|http://identifiers.org/ensembl/ENSG00000141510|affected_pathway   |10   |
|http://identifiers.org/ensembl/ENSG00000141510|animal_model       |10   |
|http://identifiers.org/ensembl/ENSG00000141510|genetic_association|10   |
|http://identifiers.org/ensembl/ENSG00000141510|genetic_literature |10   |
|http://identifiers.org/ensembl/ENSG00000141510|rna_expression     |10   |
+------------------------

[32mimport [39m[36morg.apache.spark.sql.expressions.Window

// Take the first 10 evidence records for each target + data_type combination
[39m
[36mdfes[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [_corrupt_record: string, access_level: string ... 10 more fields]

In [7]:
// The evidence subset must keep exactly the same schema as the source evidence;
// on failure, report both schemas so the difference can be located
assert(
    dfes.schema.toString == dfe.schema.toString,
    s"Evidence subset schema differs from source schema.\nSource:\n${dfe.schema.treeString}\nSubset:\n${dfes.schema.treeString}"
)

In [8]:
// Make sure this result is of suitable size for application in testing
// (total number of evidence records kept in the subset)
dfes.count()

[36mres7[39m: [32mLong[39m = [32m191L[39m

In [15]:
// Number of subset evidence records per data source and data type
dfes.groupBy($"sourceID", $"type").count().show()

+------------------+-------------------+-----+
|          sourceID|               type|count|
+------------------+-------------------+-----+
|        slapenrich|   affected_pathway|   23|
|         phenodigm|       animal_model|   30|
|    phewas_catalog|genetic_association|   16|
|  genomics_england| genetic_literature|   14|
|  expression_atlas|     rna_expression|   21|
|           progeny|   affected_pathway|    3|
|               eva|genetic_association|   13|
|cancer_gene_census|   somatic_mutation|   10|
|      gwas_catalog|genetic_association|   21|
|           uniprot|genetic_association|   10|
|uniprot_literature| genetic_literature|    5|
|            chembl|         known_drug|   20|
|         europepmc|         literature|    5|
+------------------+-------------------+-----+



### Gene Index

In [9]:
// Ensembl ids to keep: the normalized id of every target gene,
// plus misc genes for other corner cases (GENE_SET_2)
val geneIds = genes.values.toSeq ++ GENE_SET_2
// Load gene data ES extract, restricted to the ids above
val dfg = {
    val geneIndexPath = extractDir.resolve("gene.json").toString
    ss.read
        .json(geneIndexPath)
        .filter($"ensembl_gene_id".isin(geneIds: _*))
        .cache()
}
dfg.count()

[36mgeneIds[39m: [32mSeq[39m[[32mString[39m] = [33mStream[39m(
  [32m"ENSG00000169083"[39m,
  [32m"ENSG00000234745"[39m,
  [32m"ENSG00000105397"[39m,
  [32m"ENSG00000073756"[39m,
  [32m"ENSG00000141510"[39m,
  [32m"ENSG00000169174"[39m,
  [32m"ENSG00000204290"[39m,
  [32m"ENSG00000240253"[39m,
  [32m"ENSG00000270945"[39m,
  [32m"ENSG00000197984"[39m
)
[36mdfg[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [_private: struct<facets: struct<reactome: struct<pathway_code: array<string>, pathway_type_code: array<string>>>, suggestions: struct<input: array<string>, output: string ... 1 more field>>, alias_name: array<string> ... 60 more fields]
[36mres8_2[39m: [32mLong[39m = [32m10L[39m

In [10]:
// Number of selected gene records per biotype
dfg.groupBy($"biotype").count().show()

+--------------------+-----+
|             biotype|count|
+--------------------+-----+
|transcribed_unpro...|    2|
|processed_pseudogene|    1|
|      protein_coding|    7|
+--------------------+-----+



### EFO

In [73]:
// Distinct disease ids referenced by the evidence subset.
// Declared as a `def`, so every call re-runs the query against `dfes`.
// Values are read with getString so the result is a Seq[String] rather than Seq[Any].
def efo_codes: Seq[String] =
    dfes.select("disease.id").distinct.collect().toSeq.map(_.getString(0))

defined [32mfunction[39m [36mefo_codes[39m

In [82]:
// Load EFO data ES extract, keeping only the records whose `code` is one of the
// disease ids referenced by the evidence subset
val dfefo = {
    val efoPath = extractDir.resolve("efo.json").toString
    ss.read
        .json(efoPath)
        .filter($"code".isin(efo_codes: _*))
        .cache()
}
dfefo.select("code", "id").show(numRows = 10, truncate = false)

+--------------------------------------------+--------------+
|code                                        |id            |
+--------------------------------------------+--------------+
|http://www.orpha.net/ORDO/Orphanet_110      |Orphanet_110  |
|http://www.ebi.ac.uk/efo/EFO_0004267        |EFO_0004267   |
|http://www.orpha.net/ORDO/Orphanet_754      |Orphanet_754  |
|http://www.orpha.net/ORDO/Orphanet_90797    |Orphanet_90797|
|http://www.ebi.ac.uk/efo/EFO_0000180        |EFO_0000180   |
|http://www.orpha.net/ORDO/Orphanet_481      |Orphanet_481  |
|http://www.ebi.ac.uk/efo/EFO_0000305        |EFO_0000305   |
|http://www.ebi.ac.uk/efo/EFO_1001422        |EFO_1001422   |
|http://purl.obolibrary.org/obo/MONDO_0008734|MONDO_0008734 |
|http://www.orpha.net/ORDO/Orphanet_99429    |Orphanet_99429|
+--------------------------------------------+--------------+
only showing top 10 rows



[36mdfefo[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mDataset[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mRow[39m] = [children: array<struct<code:string,label:string>>, code: string ... 11 more fields]

In [83]:
// Compare the number of EFO records matched (left) with the number of distinct
// disease ids requested (right); a smaller left value suggests that some disease
// ids have no record in the EFO extract
(dfefo.count(), efo_codes.size)

[36mres82[39m: ([32mLong[39m, [32mInt[39m) = ([32m85L[39m, [32m92[39m)

### Searching for Prototype Records

To create simulated data that still passes validation against the JSON schema, each simulated record must closely match the structure of its evidence data type (e.g. using a "somatic_mutation" record to test code relating to the "genetic_association" data type will not work, because the original records have type-specific schemas).  The cells below can be used to pull a single record and validate it before adding it to a simulated dataset:

In [24]:
import com.relatedsciences.opentargets.etl.pipeline.JsonValidation.RecordValidator

// Point the record validator at the opentargets.json schema published on GitHub
// NOTE(review): this tracks the `master` branch, so the schema may change between runs
RecordValidator.url = Some(
    java.net.URI
        .create("https://raw.githubusercontent.com/opentargets/json_schema/master/opentargets.json")
        .toURL
)

[32mimport [39m[36mcom.relatedsciences.opentargets.etl.pipeline.JsonValidation.RecordValidator
[39m

In [42]:
// Pull one genetic_association record that carries a gene2variant resource score,
// validate its JSON form, and print the validation outcome followed by the record
dfes
    .filter($"type" === "genetic_association" && $"evidence.gene2variant.resource_score".isNotNull)
    .toJSON
    .take(1)
    .foreach { record =>
        val result = RecordValidator.validate(record)
        println(s"Valid = ${result.isValid}")
        println(s"Invalid Reason = ${result.reason}")
        println(record)
    }

Valid = true
Invalid Reason = None
{"access_level":"public","disease":{"id":"http://purl.obolibrary.org/obo/HP_0000157","name":"Abnormality of the tongue","source_name":"abnormality of the tongue"},"evidence":{"gene2variant":{"date_asserted":"2019-06-19T23:00:00","evidence_codes":["http://identifiers.org/eco/cttv_mapping_pipeline"],"functional_consequence":"http://purl.obolibrary.org/obo/SO_0001583","is_associated":true,"provenance_type":{"database":{"dbxref":{"id":"http://identifiers.org/clinvar","url":"http://identifiers.org/clinvar.record/RCV000735293","version":"2017-08"},"id":"EVA","version":"1.0"},"expert":{"statement":"Primary submitter of data","status":true}},"resource_score":{"method":{"description":"Not provided by data supplier"},"type":"pvalue","value":1.0E-7},"urls":[{"nice_name":"Further details in ClinVar database","url":"http://www.ncbi.nlm.nih.gov/clinvar/RCV000735293"}]},"variant2disease":{"clinical_significance":"Pathogenic","date_asserted":"2019-06-19T23:00:00","ev

In [41]:
// Peek at the top-level evidence codes of the first 10 subset records (untruncated)
dfes.select("evidence.evidence_codes").show(numRows = 10, truncate = false)

+--------------------------------------------+
|evidence_codes                              |
+--------------------------------------------+
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
|[http://purl.obolibrary.org/obo/ECO_0000053]|
+--------------------------------------------+
only showing top 10 rows



### Export

In [100]:
// Export the gene subset as gzip-compressed JSON under the test input directory,
// replacing any previous export at the same path.
// `val` instead of `var`: the path is never reassigned within this cell.
val path = testInputDir.resolve("gene_ese.json").toString
dfg
    .coalesce(1) // single partition, so a single part file is written
    .write
    .format("json")
    .mode("overwrite")
    .option("compression", "gzip")
    .save(path)
println(s"Saved ${dfg.count()} gene records to '$path'")

Saved 10 gene records to '/home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/gene_ese.json'


In [84]:
// Export the EFO subset as gzip-compressed JSON under the test input directory,
// replacing any previous export at the same path.
// `val` instead of `var`: the path is never reassigned within this cell.
val path = testInputDir.resolve("efo_ese.json").toString
dfefo
    .coalesce(1) // single partition, so a single part file is written
    .write
    .format("json")
    .mode("overwrite")
    .option("compression", "gzip")
    .save(path)
println(s"Saved ${dfefo.count()} efo records to '$path'")

Saved 85 efo records to '/home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/efo_ese.json'


In [81]:
// Export the evidence subset as gzip-compressed JSON under the test input directory.
// TODO: find a way to prevent this from overwriting simulated-records.json
// (the write below uses mode("overwrite") on the target path).
// `val` instead of `var`: the path is never reassigned within this cell.
val path = testInputDir.resolve("evidence_raw.json").toString
dfes
    .coalesce(1) // single partition, so a single part file is written
    .write
    .format("json")
    .mode("overwrite")
    .option("compression", "gzip")
    .save(path)
println(s"Saved ${dfes.count()} evidence records to '$path'")

Saved 191 evidence records to '/home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/evidence_raw.json'
