In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, concat_ws, collect_list

import pandas as pd

In [2]:
# Attach to the already-running Spark session, or start one if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/22 15:01:47 INFO SparkEnv: Registering MapOutputTracker
24/03/22 15:01:47 INFO SparkEnv: Registering BlockManagerMaster
24/03/22 15:01:47 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/03/22 15:01:47 INFO SparkEnv: Registering OutputCommitCoordinator


## Target prioritisation validation based on Open Targets score for IBD

### Export Target-IBD associations from Open Targets platform
##### Sources of evidence: genetic evidence (excluding PanelApp, Gene2Phenotype, Orphanet, ClinGen), animal models, Expression Atlas

In [16]:
# Indirect (propagated) target-disease associations, broken down by datatype,
# read from the Open Targets Platform 23.12 release.
evidence_path_ByDatatype = "gs://open-targets-data-releases/23.12/output/etl/parquet/associationByDatatypeIndirect"
evidence_ByDatatype = spark.read.parquet(evidence_path_ByDatatype)

# Keep only the associations for IBD (EFO_0003767); persist() marks the
# filtered frame for caching.
evidence_ibd_ByDatatype = evidence_ByDatatype.where(
    col("diseaseId") == "EFO_0003767"
).persist()

# Collapse to one row per target, joining all of that target's datatypes into a
# single ";"-separated string. No datatype filtering happens here.
# The aggregated column is deliberately named "datatypeId": the later filter
# cell looks the column up under that name.
# NOTE(review): the saved outputs in this notebook show the column as
# "datatypeIds", so they appear to predate this alias - re-run to refresh them.
targets_ibd_ByDatatype = (
    evidence_ibd_ByDatatype
    .groupBy("targetId")
    .agg(concat_ws(";", collect_list("datatypeId")).alias("datatypeId"))
)

24/03/22 15:29:12 WARN CacheManager: Asked to cache already cached data.


In [18]:
# Preview the per-target datatype strings: first 20 rows, long cells truncated
# (these are show()'s defaults, spelled out).
targets_ibd_ByDatatype.show(n=20, truncate=True)



+---------------+--------------------+
|       targetId|         datatypeIds|
+---------------+--------------------+
|ENSG00000000938|literature;affect...|
|ENSG00000000971|          literature|
|ENSG00000001084|          literature|
|ENSG00000001626|animal_model;gene...|
|ENSG00000001630|          literature|
|ENSG00000002016|          literature|
|ENSG00000002330|          literature|
|ENSG00000002549|          literature|
|ENSG00000002726|          literature|
|ENSG00000002746|          literature|
|ENSG00000002822|          literature|
|ENSG00000002933|          literature|
|ENSG00000003137|          literature|
|ENSG00000003147|          literature|
|ENSG00000003400|          literature|
|ENSG00000003402|          literature|
|ENSG00000003436|literature;rna_ex...|
|ENSG00000003989|          literature|
|ENSG00000004139|          literature|
|ENSG00000004399|          literature|
+---------------+--------------------+
only showing top 20 rows



                                                                                

In [17]:
# Keep targets supported by at least one of: genetic_association,
# animal_model, rna_expression.
# "datatypeId" here is the ";"-joined string built per target in the
# aggregation cell, so each check is a substring match on that string.
# NOTE(review): the traceback saved under this cell refers to a column named
# "datatypeIds"; presumably it predates the current alias - re-run to refresh.
datatype_ids = F.col("datatypeId")
has_wanted_datatype = (
    datatype_ids.contains("genetic_association")
    | datatype_ids.contains("animal_model")
    | datatype_ids.contains("rna_expression")
)

targets_ibd_filter = targets_ibd_ByDatatype.where(has_wanted_datatype)

# Collect the filtered targets to the driver as a pandas DataFrame and display it.
targets_ibd_pd = targets_ibd_filter.toPandas()
targets_ibd_pd

AnalysisException: Column 'datatypeId' does not exist. Did you mean one of the following? [datatypeIds, targetId];
'Filter ((Contains('datatypeId, genetic_association) OR Contains('datatypeId, animal_model)) OR Contains('datatypeId, rna_expression))
+- Aggregate [targetId#1262], [targetId#1262, concat_ws(;, collect_list(datatypeId#1263, 0, 0)) AS datatypeIds#1278]
   +- Filter (diseaseId#1261 = EFO_0003767)
      +- Relation [diseaseId#1261,targetId#1262,datatypeId#1263,score#1264,evidenceCount#1265L] parquet


### Export dataset with known IBD drug targets

In [9]:
# Datasource-level associations are needed here: the ChEMBL filter below uses
# `datasourceId`, which the datatype-level table loaded earlier does not have.
# This cell previously read from `evidence_ibd`, a name that is never defined
# in this notebook (presumably left over from a deleted cell), so it failed on
# a fresh kernel. The table is now loaded explicitly.
# NOTE(review): assumes the indirect (propagated) 23.12 table, matching the
# datatype-level load - confirm this reproduces the recorded count.
evidence_path_ByDatasource = "gs://open-targets-data-releases/23.12/output/etl/parquet/associationByDatasourceIndirect"
evidence_ibd_ByDatasource = spark.read.parquet(evidence_path_ByDatasource).filter(
    col("diseaseId") == "EFO_0003767"
)

# Known IBD drug targets: associations whose datasource id contains "chembl".
drug_targets = evidence_ibd_ByDatasource.filter(col("datasourceId").contains("chembl"))

# Number of ChEMBL-backed target associations for IBD (shown as the cell output).
drug_targets.count()

275

In [10]:
# Convert the `drug_targets` Spark DataFrame into a pandas DataFrame
# (toPandas() collects all rows to the driver).
drug_targets_pd = drug_targets.toPandas()
# Bare last expression so the notebook renders the frame as the cell output.
drug_targets_pd

Unnamed: 0,datatypeId,datasourceId,diseaseId,targetId,score,evidenceCount
0,known_drug,chembl,EFO_0003767,ENSG00000004779,0.165492,3
1,known_drug,chembl,EFO_0003767,ENSG00000006210,0.143539,3
2,known_drug,chembl,EFO_0003767,ENSG00000007314,0.788195,7
3,known_drug,chembl,EFO_0003767,ENSG00000010671,0.151983,2
4,known_drug,chembl,EFO_0003767,ENSG00000011677,0.870818,7
...,...,...,...,...,...,...
270,known_drug,chembl,EFO_0003767,ENSG00000267855,0.165492,3
271,known_drug,chembl,EFO_0003767,ENSG00000268089,0.607931,1
272,known_drug,chembl,EFO_0003767,ENSG00000273079,0.030397,1
273,known_drug,chembl,EFO_0003767,ENSG00000274286,0.121586,1


In [11]:
# Export the known IBD drug targets to a CSV file in the working directory,
# omitting the pandas row index.
drug_targets_pd.to_csv(
    path_or_buf="ibd_drug_targets.csv",
    index=False,
)