## Unit Test Dataset Notebook

This notebook should be used to generate and/or debug datasets for unit tests, stored as resources on the project.

## Scoring Tests

In [28]:
import java.nio.file.Paths
val input_dir = Paths.get(System.getProperty("user.home"), "data", "ot", "extract")
val input_file = input_dir.resolve("evidence.json")
val df = spark.read.json(input_file.toString())

input_dir = /home/eczech/data/ot/extract
input_file = /home/eczech/data/ot/extract/evidence.json
df = [access_level: string, disease: struct<biosample: struct<id: string, name: string>, efo_info: struct<efo_id: string, label: string ... 2 more fields> ... 3 more fields> ... 12 more fields]


[access_level: string, disease: struct<biosample: struct<id: string, name: string>, efo_info: struct<efo_id: string, label: string ... 2 more fields> ... 3 more fields> ... 12 more fields]

In [31]:
df.select("type").groupBy("type").count().show()

+-------------------+------+
|               type| count|
+-------------------+------+
|   affected_pathway| 87104|
|     rna_expression|204229|
|       animal_model|500683|
|   somatic_mutation| 70554|
|         known_drug|383122|
|genetic_association|380852|
+-------------------+------+



In [45]:
val genes = List(
    "ENSG00000141510", // TP53
    "ENSG00000244734", // HBB
    "ENSG00000169174", // PCSK9
    "ENSG00000112715", // VEGFA
    "ENSG00000105397"  // TYK2
)
import org.apache.spark.sql.functions.count
df.filter($"target.id".isin(genes: _*))
    .groupBy("target.id").pivot("type")
    .agg(count("id"))
    .show(100, 16)

+---------------+----------------+------------+-------------------+----------+--------------+----------------+
|             id|affected_pathway|animal_model|genetic_association|known_drug|rna_expression|somatic_mutation|
+---------------+----------------+------------+-------------------+----------+--------------+----------------+
|ENSG00000141510|             870|        1308|                587|      null|            17|            3926|
|ENSG00000112715|              17|         433|                253|      2047|            30|            null|
|ENSG00000105397|              20|          58|                117|        97|             1|            null|
|ENSG00000244734|               6|        null|                355|         5|            81|            null|
|ENSG00000169174|               6|          35|                169|       112|            20|            null|
+---------------+----------------+------------+-------------------+----------+--------------+----------------+



genes = List(ENSG00000141510, ENSG00000244734, ENSG00000169174, ENSG00000112715, ENSG00000105397)


List(ENSG00000141510, ENSG00000244734, ENSG00000169174, ENSG00000112715, ENSG00000105397)

## Pipeline Output Test

In [50]:
var repoDir = Paths.get(System.getProperty("user.home"), "repos", "ot-scoring")

repoDir = /home/eczech/repos/ot-scoring


/home/eczech/repos/ot-scoring

In [59]:
var path = repoDir.resolve("target/scala-2.11/test-classes/pipeline_test/output/score_association.parquet")
var df = spark.read.parquet(path.toString)
println(df.count())
df.show(3)

2129
+---------------+-----------+-----+---------+------------+
|      target_id| disease_id|score|is_direct|  source_ids|
+---------------+-----------+-----+---------+------------+
|ENSG00000105397|EFO_0000095|  0.1|     true|    [chembl]|
|ENSG00000105397|EFO_0000096|  0.1|    false|    [chembl]|
|ENSG00000105397|EFO_0000181|  0.5|     true|[slapenrich]|
+---------------+-----------+-----+---------+------------+
only showing top 3 rows



path = /home/eczech/repos/ot-scoring/target/scala-2.11/test-classes/pipeline_test/output/score_association.parquet
df = [target_id: string, disease_id: string ... 3 more fields]


[target_id: string, disease_id: string ... 3 more fields]

In [58]:
var path = repoDir.resolve("target/scala-2.11/test-classes/pipeline_test/output/score_source.parquet")
var df = spark.read.parquet(path.toString)
println(df.count())
df.show(3)

2965
+---------------+-----------+----------+---------+---------+-----+
|      target_id| disease_id| source_id|score_raw|is_direct|score|
+---------------+-----------+----------+---------+---------+-----+
|ENSG00000105397|EFO_0000095|    chembl|      0.1|     true|  0.1|
|ENSG00000105397|EFO_0000096|    chembl|      0.1|    false|  0.1|
|ENSG00000105397|EFO_0000181|slapenrich|      0.5|     true|  0.5|
+---------------+-----------+----------+---------+---------+-----+
only showing top 3 rows



path = /home/eczech/repos/ot-scoring/target/scala-2.11/test-classes/pipeline_test/output/score_source.parquet
df = [target_id: string, disease_id: string ... 4 more fields]


[target_id: string, disease_id: string ... 4 more fields]

In [56]:
var path = repoDir.resolve("src/test/resources/pipeline_test/input/association_scores.json")
var df = spark.read.json(path.toString)
println(df.count())
df.show(3)

2129
+--------------+--------------------+---------------+
|    disease_id|               score|      target_id|
+--------------+--------------------+---------------+
|   EFO_0003867|1.338723084160192...|ENSG00000169174|
|   EFO_0003843| 2.09254289568363E-5|ENSG00000169174|
|Orphanet_79211|  1.0003361559218569|ENSG00000169174|
+--------------+--------------------+---------------+
only showing top 3 rows



path = /home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/association_scores.json
df = [disease_id: string, score: double ... 1 more field]


[disease_id: string, score: double ... 1 more field]

In [57]:
var path = repoDir.resolve("src/test/resources/pipeline_test/input/source_scores.json")
var df = spark.read.json(path.toString)
println(df.count())
df.show(3)

2965
+--------------+--------------------+--------------+---------------+
|    disease_id|               score|     source_id|      target_id|
+--------------+--------------------+--------------+---------------+
|   EFO_0003867|1.338723084160192...|phewas_catalog|ENSG00000169174|
|   EFO_0003843| 2.09254289568363E-5|phewas_catalog|ENSG00000169174|
|Orphanet_79211|                 1.0|        chembl|ENSG00000169174|
+--------------+--------------------+--------------+---------------+
only showing top 3 rows



path = /home/eczech/repos/ot-scoring/src/test/resources/pipeline_test/input/source_scores.json
df = [disease_id: string, score: double ... 2 more fields]


[disease_id: string, score: double ... 2 more fields]

## Debugging Full Pipeline Test

In [12]:
var path = "/home/eczech/repos/ot-scoring/target/scala-2.11/test-classes/pipeline_test/output"
//var df = spark.read.parquet(path + "/score_source.parquet")
var df = spark.read.parquet(path + "/score_evidence.parquet")
df.show(3)

+--------------------+---------------+-------------------+---------------+------------------+--------------+------------+-------------------+---+-------------------+
|                  id|      source_id|terminal_disease_id|      target_id|    score_resource|    disease_id|is_direct_id|       score_source|rid|              score|
+--------------------+---------------+-------------------+---------------+------------------+--------------+------------+-------------------+---+-------------------+
|ae12a8d819d094399...|uniprot_somatic|        EFO_0000640|ENSG00000105976|               1.0|   EFO_0003086|       false|                1.0|  1|                1.0|
|fa5ac4412ee2b1d03...|      phenodigm|        Orphanet_28|ENSG00000146085|0.9945999999999999|Orphanet_79062|       false|0.19891999999999999|  1|0.19891999999999999|
|5a74c8828948c1c41...|        uniprot|       Orphanet_510|ENSG00000165704|               0.0|Orphanet_71859|       false|                0.0|  1|                0.0|
+---

path = /home/eczech/repos/ot-scoring/target/scala-2.11/test-classes/pipeline_test/output
df = [id: string, source_id: string ... 8 more fields]


[id: string, source_id: string ... 8 more fields]

In [13]:
df.printSchema

root
 |-- id: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- terminal_disease_id: string (nullable = true)
 |-- target_id: string (nullable = true)
 |-- score_resource: double (nullable = true)
 |-- disease_id: string (nullable = true)
 |-- is_direct_id: boolean (nullable = true)
 |-- score_source: double (nullable = true)
 |-- rid: integer (nullable = true)
 |-- score: double (nullable = true)



In [16]:
df.filter($"source_id" === "phenodigm").show(3)

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 11.0 failed 1 times, most recent failure: Lost task 0.0 in stage 11.0 (TID 13, localhost, executor driver): java.io.FileNotFoundException: File file:/home/eczech/repos/ot-scoring/target/scala-2.11/test-classes/pipeline_test/output/score_evidence.parquet/part-00027-cb4cba73-7949-425a-a907-84cf2135e731-c000.snappy.parquet does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:127)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:177)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.sca