# Target-disease genetic evidence from Open Targets Platform

This code is meant to prepare ranked lists of genes for all diseases from Open Targets platform with amount of genetically supported genes (genetically and somatic mutations for oncological traits) >= 500

In [1]:
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.functions import (
    col, countDistinct, row_number, sum as spark_sum,
    broadcast, array_contains
)
import gcsfs

spark = SparkSession.builder.appName("ProcessDiseasesNotebook").getOrCreate()

25/07/08 15:38:10 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.Con

In [2]:
# Oncology MONDO code constant
ONCOLOGY_ID = "MONDO_0045024"


# 1. Load inputs by specifying your GCS paths directly
evidence_path = "gs://open-targets-data-releases/25.06/output/association_by_datasource_indirect"
target_path   = "gs://open-targets-data-releases/25.06/output/target"
disease_path  = "gs://open-targets-data-releases/25.06/output/disease"
output_dir    = "gs://ot-team/polina/pathwaganda/processed_diseases"
include_animal_models = False 

In [4]:
spark.read.parquet(disease_path).show(4)

                                                                                

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          id|                code|                name|         description|             dbXRefs|             parents|            synonyms|obsoleteTerms|obsoleteXRefs|            children|           ancestors|    therapeuticAreas|         descendants|            ontology|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|DOID_0050890|http://purl.oboli...|     synucleinopathy|A neurodegenerati...|[MESH:D000080874,...|[MONDO_0019052, M...|{[alpha Synuclein...|           []|           []|[EFO_00067

In [11]:
# 2. Read and prepare DataFrames

evidence = spark.read.parquet(evidence_path)
target_df = (
    spark.read.parquet(target_path)
         .select(col("id").alias("targetId"), col("approvedSymbol"))
)
evidence = evidence.join(broadcast(target_df), on="targetId", how="left")

disease_df = (
    spark.read.parquet(disease_path)
         .select(col("id").alias("diseaseId"), col("therapeuticAreas"))
)

# 3. Define your data source weights
weights = {
    # "ot_genetics_portal": 1,
    "gwas_credible_sets": 1,
    "gene_burden": 1,
    "eva": 1,
    "genomics_england": 1,
    "gene2phenotype": 1,
    "uniprot_literature": 1,
    "uniprot_variants": 1,
    "orphanet": 1,
    "clingen": 1,
    "cancer_gene_census": 1,
    "intogen": 1,
    "eva_somatic": 1,
    "cancer_biomarkers": 1,
    "chembl": 1,
    "crispr_screen": 1,
    "crispr": 1,
    "slapenrich": 0.5,
    "progeny": 0.5,
    "reactome": 1,
    "sysbio": 0.5,
    "europepmc": 0.2,
    "expression_atlas": 0.2,
    "impc": 0.2,
    "ot_crispr_validation": 0.5,
    "ot_crispr": 0.5,
    "encore": 0.5,
}

# 4. Build evidence type lists based on animal-model flag
oncology_types     = ["genetic_association", "somatic_mutation"]
non_oncology_types = ["genetic_association"]
if include_animal_models:
    oncology_types.append("animal_model")
    non_oncology_types.append("animal_model")


In [13]:
# 5. Define processing functions

def _compute_scores(ev: DataFrame, evidence_types: list) -> DataFrame:
    # Build regex from types
    pattern = "|".join(evidence_types)

    # Filter by types and by ≥500 genes
    valid = (
        ev.filter(col("datatypeId").rlike(pattern))
          .groupBy("diseaseId")
          .agg(countDistinct("approvedSymbol").alias("nGenes"))
          .filter(col("nGenes") >= 500)
    )
    # Construct weights DataFrame with consistent DoubleType
    weights_df = spark.createDataFrame(
        [(ds, float(w)) for ds, w in weights.items()],
        ["datasourceId", "weight"]
    )

    df = (
        ev.join(valid.select("diseaseId"), on="diseaseId")
          .join(broadcast(weights_df), on="datasourceId", how="left")
          .withColumn("score_weighted", col("score") * col("weight"))
    )

    # First-level window
    win1 = Window.partitionBy(
        "diseaseId", "datatypeId", "approvedSymbol", "targetId"
    ).orderBy(col("score_weighted").desc())
    df1 = (
        df.withColumn("rank1", row_number().over(win1))
          .withColumn("term1", col("score_weighted") / (col("rank1") ** 2))
          .groupBy("diseaseId", "datatypeId", "approvedSymbol", "targetId")
          .agg(spark_sum("term1").alias("sourceSum"))
    )

    # Second-level window
    win2 = Window.partitionBy(
        "diseaseId", "approvedSymbol", "targetId"
    ).orderBy(col("sourceSum").desc())
    df2 = (
        df1.withColumn("rank2", row_number().over(win2))
           .withColumn("overallScore", col("sourceSum") / (col("rank2") ** 2))
           .filter(col("overallScore").isNotNull())
    )
    return df2

In [14]:
# 6. Process oncology

ev_onc = (
    evidence.join(broadcast(disease_df), on="diseaseId")
            .filter(array_contains(col("therapeuticAreas"), ONCOLOGY_ID))
)
result_onc = _compute_scores(ev_onc, oncology_types)
# Write out
result_onc.select("diseaseId","approvedSymbol","targetId","overallScore") \
           .repartition("diseaseId") \
           .write.mode("overwrite") \
           .partitionBy("diseaseId") \
           .parquet(f"{output_dir}/oncology")

                                                                                

In [20]:
# File check
spark.read.parquet("gs://ot-team/polina/pathwaganda/processed_diseases/oncology/diseaseId=EFO_0000095").count()

5801

In [21]:
from pyspark.sql.functions import desc

spark.read.parquet("gs://ot-team/polina/pathwaganda/processed_diseases/oncology/diseaseId=EFO_0000095").sort(desc("overallScore")).show(4)

+--------------+---------------+------------------+
|approvedSymbol|       targetId|      overallScore|
+--------------+---------------+------------------+
|          TP53|ENSG00000141510|1.0734668108113399|
|         SF3B1|ENSG00000115524| 1.039870844982318|
|           ATM|ENSG00000149311| 1.026074643823489|
|          POT1|ENSG00000128513|1.0005294815256116|
+--------------+---------------+------------------+
only showing top 4 rows



In [22]:
# 7. Process non-oncology

ev_non = (
    evidence.join(broadcast(disease_df), on="diseaseId")
            .filter(~array_contains(col("therapeuticAreas"), ONCOLOGY_ID))
)
result_non = _compute_scores(ev_non, non_oncology_types)
# Write out
result_non.select("diseaseId","approvedSymbol","targetId","overallScore") \
           .repartition("diseaseId") \
           .write.mode("overwrite") \
           .partitionBy("diseaseId") \
           .parquet(f"{output_dir}/non_oncology")

                                                                                

In [24]:
spark.read.parquet("gs://ot-team/polina/pathwaganda/processed_diseases/non_oncology/diseaseId=EFO_0000195").count()

4669

In [25]:
spark.read.parquet("gs://ot-team/polina/pathwaganda/processed_diseases/non_oncology/diseaseId=EFO_0000195").sort(desc("overallScore")).show(4)

+--------------+---------------+------------------+
|approvedSymbol|       targetId|      overallScore|
+--------------+---------------+------------------+
|        DYRK1B|ENSG00000105204|1.1051797884522627|
|          APOB|ENSG00000084674|0.9262022437993443|
|           LPL|ENSG00000175445|0.8864316879980596|
|          LIPC|ENSG00000166035|0.8602447264212504|
+--------------+---------------+------------------+
only showing top 4 rows

