In [1]:
import java.nio.file.Paths
import org.apache.spark.sql.functions._
// Input locations, both rooted at the current user's home directory:
//   dir1 -> <home>/data/ot/results
//   dir2 -> <home>/data/ot/extract
val dir1 = Paths.get(System.getProperty("user.home")).resolve("data").resolve("ot").resolve("results")
val dir2 = Paths.get(System.getProperty("user.home")).resolve("data").resolve("ot").resolve("extract")

dir1 = /home/eczech/data/ot/results
dir2 = /home/eczech/data/ot/extract


/home/eczech/data/ot/extract

## Load RS Data

In [19]:
// Load the "RS" association scores from the Parquet dataset under dir1.
// The path is built inside the block so that df1 is the only name this cell defines.
val df1 = {
  val scorePath = dir1.resolve("score_association.parquet")
  spark.read.parquet(scorePath.toString)
}

df1 = [target_id: string, disease_id: string ... 3 more fields]


[target_id: string, disease_id: string ... 3 more fields]

In [20]:
// Preview the first three rows of the RS score table.
df1.show(numRows = 3)

+---------------+--------------+-------------------+---------+-----------+
|      target_id|    disease_id|              score|is_direct| source_ids|
+---------------+--------------+-------------------+---------+-----------+
|ENSG00000000460|Orphanet_93420|             0.1817|    false|[phenodigm]|
|ENSG00000000938|   EFO_0004244|                0.1|    false|   [chembl]|
|ENSG00000000938|Orphanet_98710|0.23667500000000002|    false|[phenodigm]|
+---------------+--------------+-------------------+---------+-----------+
only showing top 3 rows



In [21]:
// Total number of rows in the RS score table (triggers a full scan of the dataset).
df1.count()

1973478

In [22]:
// Reduce the RS table to the (target, disease) key plus its score.
// The score is renamed to score_rs so it stays distinguishable once another
// score column is placed next to it.
val df12 = df1.select(
  col("target_id"),
  col("disease_id"),
  col("score").alias("score_rs")
)
df12.show(numRows = 3)

+---------------+--------------+-------------------+
|      target_id|    disease_id|           score_rs|
+---------------+--------------+-------------------+
|ENSG00000000460|Orphanet_93420|             0.1817|
|ENSG00000000938|   EFO_0004244|                0.1|
|ENSG00000000938|Orphanet_98710|0.23667500000000002|
+---------------+--------------+-------------------+
only showing top 3 rows



df12 = [target_id: string, disease_id: string ... 1 more field]


[target_id: string, disease_id: string ... 1 more field]

## Load OT Data

In [6]:
// Load the "OT" association export from the JSON file under dir2.
// No schema is supplied, so Spark derives it from the data itself.
val df2 = {
  val associationPath = dir2.resolve("association.json")
  spark.read.json(associationPath.toString)
}

df2 = [disease: struct<efo_info: struct<efo_id: string, label: string ... 2 more fields>, id: string>, evidence_count: struct<datasources: struct<cancer_gene_census: double, chembl: double ... 19 more fields>, datatypes: struct<affected_pathway: double, animal_model: double ... 5 more fields> ... 1 more field> ... 7 more fields]


[disease: struct<efo_info: struct<efo_id: string, label: string ... 2 more fields>, id: string>, evidence_count: struct<datasources: struct<cancer_gene_census: double, chembl: double ... 19 more fields>, datatypes: struct<affected_pathway: double, animal_model: double ... 5 more fields> ... 1 more field> ... 7 more fields]

In [7]:
// Preview the first three rows of the OT association table (nested columns are truncated in the display).
df2.show(numRows = 3)

+--------------------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|             disease|      evidence_count|        harmonic-sum|                  id|is_direct|                 max|             private|                 sum|              target|
+--------------------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|[[http://www.orph...|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|ENSG00000132383-O...|    false|[[0.0, 0.0, 0.0, ...|[[[phenodigm], [a...|[[0.0, 0.0, 0.0, ...|[[ENSG00000132383...|
|[[http://www.orph...|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|ENSG00000125551-O...|     true|[[0.0, 0.0, 0.0, ...|[[[phenodigm], [a...|[[0.0, 0.0, 0.0, ...|[[ENSG00000125551...|
|[[http://www.orph...|[[0.0, 0.0, 0.0, ...|[[0.0, 0.0, 0.0, ...|ENSG00000141756-O...|     true|[[0.0

In [8]:
// Print the inferred schema as a tree, to locate the nested fields needed for the comparison.
df2.printSchema()

root
 |-- disease: struct (nullable = true)
 |    |-- efo_info: struct (nullable = true)
 |    |    |-- efo_id: string (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- path: array (nullable = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |-- therapeutic_area: struct (nullable = true)
 |    |    |    |-- codes: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- labels: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |-- id: string (nullable = true)
 |-- evidence_count: struct (nullable = true)
 |    |-- datasources: struct (nullable = true)
 |    |    |-- cancer_gene_census: double (nullable = true)
 |    |    |-- chembl: double (nullable = true)
 |    |    |-- crispr: double (nullable = true)
 |    |    |-- europepmc: double (nullable = true)
 |    |    |-- eva: doub

In [9]:
// Flatten the nested OT records down to the same shape as the RS projection:
// the target and disease identifiers plus the overall harmonic-sum score,
// exposed as score_ot.
val df22 = df2.select(
  col("target.id").alias("target_id"),
  col("disease.id").alias("disease_id"),
  col("harmonic-sum.overall").alias("score_ot")
)
df22.show(numRows = 3)

+---------------+---------------+-------------------+
|      target_id|     disease_id|           score_ot|
+---------------+---------------+-------------------+
|ENSG00000132383|Orphanet_240371|0.24183500000000002|
|ENSG00000125551|   Orphanet_974|0.24638833333333335|
|ENSG00000141756|Orphanet_314811| 0.6649999999999999|
+---------------+---------------+-------------------+
only showing top 3 rows



df22 = [target_id: string, disease_id: string ... 1 more field]


[target_id: string, disease_id: string ... 1 more field]

In [17]:
// Total number of rows in the flattened OT table.
df22.count()

980001

## Merge

In [23]:
// Line up the RS and OT scores for every (target_id, disease_id) pair.
// This is an inner join: pairs present in only one of the two tables are
// dropped and are not reported by this cell.
val df = {
  val joinKeys = Seq("target_id", "disease_id")
  df12.join(df22, joinKeys, "inner")
}
df.show(numRows = 3)

+---------------+--------------+-------------------+-------------------+
|      target_id|    disease_id|           score_rs|           score_ot|
+---------------+--------------+-------------------+-------------------+
|ENSG00000000938|Orphanet_44890|                0.2|                0.2|
|ENSG00000000971| Orphanet_1505|0.23495000000000002|0.23495000000000002|
|ENSG00000000971|Orphanet_98687|                0.2|                0.2|
+---------------+--------------+-------------------+-------------------+
only showing top 3 rows



df = [target_id: string, disease_id: string ... 2 more fields]


[target_id: string, disease_id: string ... 2 more fields]

In [24]:
// Show the first ten joined pairs with the RS/OT score ratio; 1.0 means the two scores agree.
df.withColumn("ratio", col("score_rs") / col("score_ot")).show(numRows = 10)

+---------------+---------------+-------------------+-------------------+-----+
|      target_id|     disease_id|           score_rs|           score_ot|ratio|
+---------------+---------------+-------------------+-------------------+-----+
|ENSG00000000938| Orphanet_44890|                0.2|                0.2|  1.0|
|ENSG00000000971|  Orphanet_1505|0.23495000000000002|0.23495000000000002|  1.0|
|ENSG00000000971| Orphanet_98687|                0.2|                0.2|  1.0|
|ENSG00000001497|    EFO_0000178|           0.405682|           0.405682|  1.0|
|ENSG00000001630|Orphanet_294060|0.22661500000000004|0.22661500000000004|  1.0|
|ENSG00000002330|   Orphanet_282|            0.18206|            0.18206|  1.0|
|ENSG00000002549|    EFO_0003853|                0.1|                0.1|  1.0|
|ENSG00000002746| Orphanet_85292|0.18510000000000001|0.18510000000000001|  1.0|
|ENSG00000003137|    EFO_0000186|0.00992751340866456|0.00992751340866456|  1.0|
|ENSG00000003137|    EFO_1002008|0.04610

In [26]:
// Count how many joined pairs have scores that agree within an absolute
// tolerance of 0.01 (isequal = true) versus those that differ by more.
(
  df.withColumn("isequal", abs(col("score_rs") - col("score_ot")) < .01)
    .groupBy("isequal")
    .count()
    .show()
)

+-------+------+
|isequal| count|
+-------+------+
|   true|979752|
|  false|    13|
+-------+------+



In [25]:
// Summary statistics (count, mean, stddev, min, max) of the RS/OT score ratio
// across all joined pairs.
(
  df.select((col("score_rs") / col("score_ot")).alias("ratio"))
    .describe()
    .show()
)

+-------+--------------------+
|summary|               ratio|
+-------+--------------------+
|  count|              979765|
|   mean|  1.0010292118461759|
| stddev|  0.4112070254835414|
|    min|0.008044634791842217|
|    max|  227.22505429882958|
+-------+--------------------+

