# Target-disease genetic evidence from Open Targets Platform

This notebook prepares ranked gene lists for every disease in the Open Targets Platform that has at least 500 genetically supported genes (for oncology traits, genes supported by either genetic association or somatic mutation evidence are counted).

In [16]:
import math
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, broadcast, countDistinct, array_contains
from pyspark.sql.window import Window

# Reuse the active Spark session if one exists, otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("ProcessDiseasesNotebook")
    .getOrCreate()
)

In [17]:
# 1. Load inputs by specifying your GCS paths directly
# The release is named once so that the three inputs always come from the same
# Open Targets release; change OT_RELEASE to point the whole notebook elsewhere.
OT_RELEASE = "25.06"
OT_OUTPUT_ROOT = f"gs://open-targets-data-releases/{OT_RELEASE}/output"

evidence_path = f"{OT_OUTPUT_ROOT}/association_by_datasource_indirect"
target_path   = f"{OT_OUTPUT_ROOT}/target"
disease_path  = f"{OT_OUTPUT_ROOT}/disease"
# output_dir    = "gs://ot-team/polina/pathwaganda/processed_diseases"
# include_animal_models = False 

In [18]:
# 2. Read and prepare DataFrames
# Target lookup: only the join key (renamed to targetId) and the gene symbol.
target_df = spark.read.parquet(target_path).select(
    col("id").alias("targetId"),
    col("approvedSymbol"),
)

# Evidence rows annotated with approvedSymbol; the left join keeps evidence rows
# that have no matching target.
evidence = spark.read.parquet(evidence_path).join(
    broadcast(target_df), on="targetId", how="left"
)

# Disease lookup: the join key (renamed to diseaseId) and its therapeutic areas.
disease_df = spark.read.parquet(disease_path).select(
    col("id").alias("diseaseId"),
    col("therapeuticAreas"),
)

# Evidence additionally annotated with therapeuticAreas (again a left join).
evidence_ta = evidence.join(broadcast(disease_df), on="diseaseId", how="left")

# 3. Define your data source weights
# Maps datasourceId -> multiplier applied to that datasource's score before the
# harmonic sums computed further down (the `weightedScore` / `weighted_ds` columns).
# The weights are attached to the evidence with a left join, so a datasourceId
# that is missing from this dict ends up with a null weight instead of an error.
weights = {
    # "ot_genetics_portal": 1,
    "gwas_credible_sets": 1,
    "gene_burden": 1,
    "eva": 1,
    "genomics_england": 1,
    "gene2phenotype": 1,
    "uniprot_literature": 1,
    "uniprot_variants": 1,
    "orphanet": 1,
    "clingen": 1,
    "cancer_gene_census": 1,
    "intogen": 1,
    "eva_somatic": 1,
    "cancer_biomarkers": 1,
    "chembl": 1,
    "crispr_screen": 1,
    "crispr": 1,
    "slapenrich": 0.5,
    "progeny": 0.5,
    "reactome": 1,
    "sysbio": 0.5,
    "europepmc": 0.2,
    "expression_atlas": 0.2,
    "impc": 0.2,
    "ot_crispr_validation": 0.5,
    "ot_crispr": 0.5,
    "encore": 0.5,
}

# 4. Build evidence type lists based on animal-model flag
# oncology_types     = ["genetic_association", "somatic_mutation"]
# non_oncology_types = ["genetic_association"]
# if include_animal_models:
#     oncology_types.append("animal_model")
#     non_oncology_types.append("animal_model")

In [19]:
# Rows whose disease lists MONDO_0045024 among its therapeutic areas, restricted
# to genetic-association and somatic-mutation evidence.
evidence_onco = evidence_ta.where(
    array_contains(col("therapeuticAreas"), "MONDO_0045024")
    & col("datatypeId").isin("genetic_association", "somatic_mutation")
)

In [21]:
# Rows whose disease does NOT list MONDO_0045024 among its therapeutic areas,
# restricted to genetic-association evidence.
# NOTE(review): rows with a null therapeuticAreas presumably match neither this
# filter nor the MONDO_0045024 one — confirm that dropping them is intended.
evidence_non_onco = evidence_ta.where(
    ~array_contains(col("therapeuticAreas"), "MONDO_0045024")
    & (col("datatypeId") == "genetic_association")
)

Output: 
diseaseId: targetId, approvedSymbol, overallScore

Input:
_diseaseId_ _targetId_, _approvedSymbol_, _datatypeId_, datasourceId, datasourceWeight, score
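1. weightedScore = datasourceWeight * score
2. Sort by weightedScore in descending order within each (diseaseId, targetId, approvedSymbol, datatypeId) group
3. ordIndex = 1-based position of the datasource in that ordering
4. Compute hSum as $\left(\sum_i \mathrm{weightedScore}_i / \mathrm{ordIndex}_i^2\right) / 1.644$, where $1.644 \approx \pi^2/6$ is the limit of $\sum_k 1/k^2$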

_diseaseId_ _targetId_, _approvedSymbol_, _datatypeId_, hSum
1. Sort by datatypeId
2. ordIndex = Compute order index of dataSource
3. Compute amount of datatypes per (_diseaseId_ _targetId_, _approvedSymbol_)
_diseaseId_ _targetId_, _approvedSymbol_, _datatypeId_, ordIndex, hSum, datatypes_amount
4. Compute HSum by taking Sum(hSum_i / ordIndex_i^2) / datatypes_amount
_diseaseId_ _targetId_, _approvedSymbol_, overallHSum

datatypeScore = weightedHsumNorm(datasourceScore(score))

overallScore = HsumNorm(datatypeScore) / 1.644

HsumNorm = $\sum_i \mathrm{score}_i / i^2$, where $i$ is the 1-based position of a score after sorting the scores in descending order

In [24]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, broadcast, udf
from pyspark.sql.types import DoubleType

# Per-(disease, target, datatype) harmonic sum of weighted datasource scores.
# Everything is expressed with native Spark column expressions (no Python UDF),
# so rows never have to be shipped to Python workers.

# --- 1) turn the Python weights dict into a tiny DataFrame for a broadcast join ---
weights_df = spark.createDataFrame(
    [(k, float(v)) for k, v in weights.items()],
    ['datasourceId', 'datasourceWeight']
)

# One harmonic sum is computed per combination of these columns.
grp_cols = ['diseaseId', 'targetId', 'approvedSymbol', 'datatypeId']
order_win = Window.partitionBy(*grp_cols).orderBy(col('weightedScore').desc())
count_win = Window.partitionBy(*grp_cols)

# Groups with at most MAX_EXACT_TERMS rows are normalised by the exact partial
# sum of 1/k^2; larger groups use the fixed value 1.644 (~ pi^2 / 6).
MAX_EXACT_TERMS = 20
ZETA2_APPROX = 1.644

df = (
    evidence_onco
    .join(broadcast(weights_df), on='datasourceId', how='left')
    # --- 2) per-row weightedScore (null when the datasource has no weight) ---
    .withColumn('weightedScore', col('datasourceWeight') * col('score'))
    # --- 3) ordIndex: position when ranking weightedScore desc within each group ---
    .withColumn('ordIndex', row_number().over(order_win))
    # --- 4) number of rows (datasources) per group ---
    .withColumn('nDs', F.count('*').over(count_win))
    # --- 5) maxTheo = sum_{k=1..nDs} 1/k^2 for nDs <= 20, else 1.644 ---
    # ordIndex takes each value 1..nDs exactly once per group, so summing
    # 1/ordIndex^2 over the whole group yields that partial sum directly.
    .withColumn(
        'maxTheo',
        F.when(
            col('nDs') <= MAX_EXACT_TERMS,
            F.sum(1.0 / (col('ordIndex') ** 2)).over(count_win),
        ).otherwise(F.lit(ZETA2_APPROX))
    )
    # --- 6) scale each row by 1/ordIndex^2 (summed and normalised below) ---
    .withColumn('scaled', col('weightedScore') / (col('ordIndex') ** 2))
)

# maxTheo is constant within a group, so first() simply picks that value.
result = (
    df
    .groupBy(*grp_cols)
    .agg(
        (F.sum('scaled') / F.first('maxTheo')).alias('hSum')
    )
)

# result now has: diseaseId, targetId, approvedSymbol, datatypeId, hSum


In [28]:
# Show the five largest hSum values that are strictly below 0.8.
(
    result
    .where(col("hSum") < 0.8)
    .orderBy(col("hSum").desc())
    .show(5)
)

[Stage 74:>                                                         (0 + 2) / 2]

+-------------+---------------+--------------+-------------------+------------------+
|    diseaseId|       targetId|approvedSymbol|         datatypeId|              hSum|
+-------------+---------------+--------------+-------------------+------------------+
|MONDO_0002149|ENSG00000111276|        CDKN1B|genetic_association|0.7999966200371057|
|  EFO_0005952|ENSG00000171456|         ASXL1|   somatic_mutation|0.7999807407658193|
|  EFO_1000218|ENSG00000130396|          AFDN|   somatic_mutation|0.7999773630884303|
|MONDO_0002516|ENSG00000169184|           MN1|   somatic_mutation|0.7999730053348342|
|  EFO_0003859|ENSG00000100697|        DICER1|   somatic_mutation|0.7999362531293202|
+-------------+---------------+--------------+-------------------+------------------+
only showing top 5 rows



                                                                                

In [None]:
# 5. Precompute normalization constants
# Limit of sum_{k>=1} 1/k^2.
H_INF = math.pi**2 / 6.0

# Amount of datasources per evidence type (do not depend on disease)
ds_per_type = (
    evidence
    .groupBy("datatypeId")
    .agg(F.countDistinct("datasourceId").alias("nDs"))
)

# datatypeId -> sum_{k=1..nDs} 1/k^2, i.e. the largest harmonic sum reachable
# when every one of that type's datasources scores 1.
type_norms = {}
for type_row in ds_per_type.collect():
    type_norms[type_row.datatypeId] = sum(
        1.0 / (k * k) for k in range(1, type_row.nDs + 1)
    )

# Same mapping as a small DataFrame so it can be joined onto per-type scores.
type_norm_rows = [
    (datatype, float(norm_value)) for datatype, norm_value in type_norms.items()
]
type_norm_df = spark.createDataFrame(type_norm_rows, ["datatypeId", "typeNorm"])

# 6. Define processing function

def _compute_scores(ev, evidence_types):
    """Compute datasource-, datatype- and overall-level scores for large diseases.

    Relies on the notebook-level names ``spark``, ``weights``, ``H_INF`` and
    ``type_norm_df``.

    Args:
        ev: Evidence DataFrame with at least the columns diseaseId, targetId,
            approvedSymbol, datatypeId, datasourceId and score.
        evidence_types: Iterable of datatypeId values. They are matched exactly
            and are used only to pick the diseases that have >= 500 distinct
            approvedSymbol values among rows of those types.
            NOTE(review): the scores themselves are computed from every row of
            ``ev`` for the selected diseases, whatever its datatypeId — confirm
            whether ``ev`` should also be restricted to ``evidence_types``.

    Returns:
        Tuple ``(ds_scores, type_scores, overall_scores)``:
        * ds_scores: one row per (diseaseId, datasourceId, datatypeId,
          approvedSymbol, targetId) with ``ds_sum`` and ``ds_score``.
        * type_scores: one row per (diseaseId, datatypeId, approvedSymbol,
          targetId) with ``type_sum``, ``typeNorm`` and ``associationScore``.
        * overall_scores: one row per (diseaseId, approvedSymbol, targetId)
          with ``overall_sum`` and ``overallScore``.
    """
    # Diseases with >= 500 genes supported by the requested evidence types.
    # isin() compares whole values; a regex built by joining the names with "|"
    # would also match any datatypeId that merely contains one of them.
    valid = (
        ev.filter(col("datatypeId").isin(list(evidence_types)))
          .groupBy("diseaseId")
          .agg(countDistinct("approvedSymbol").alias("nGenes"))
          .filter(col("nGenes") >= 500)
    )

    # Stage 1: per-datasource harmonic sum + normalize -> ds_score
    win_ds = Window.partitionBy(
        "diseaseId", "datasourceId", "approvedSymbol", "targetId"
    ).orderBy(F.desc("score"))
    ds_scores = (
        ev.join(valid.select("diseaseId"), on="diseaseId")
          .withColumn("rank_ds", F.row_number().over(win_ds))
          .withColumn("term_ds", col("score") / (col("rank_ds")**2))
          .groupBy("diseaseId", "datasourceId", "datatypeId",
                   "approvedSymbol", "targetId")
          .agg(F.sum("term_ds").alias("ds_sum"))
          .withColumn("ds_score", col("ds_sum") / F.lit(H_INF))
    )

    # Attach the datasource weights once; stages 2 and 3 both start from this.
    # A datasource without an entry in `weights` gets a null weight (left join).
    weights_df = spark.createDataFrame(
        [(ds, float(w)) for ds, w in weights.items()],
        ["datasourceId", "weight"]
    )
    weighted_ds_scores = (
        ds_scores
          .join(broadcast(weights_df), on="datasourceId", how="left")
          .withColumn("weighted_ds", col("ds_score") * col("weight"))
    )

    # Stage 2: per-datatype harmonic sum + normalize -> associationScore
    win_type = Window.partitionBy(
        "diseaseId", "datatypeId", "approvedSymbol", "targetId"
    ).orderBy(F.desc("weighted_ds"))
    type_scores = (
        weighted_ds_scores
          .withColumn("rank_type", F.row_number().over(win_type))
          .withColumn("term_type", col("weighted_ds") / (col("rank_type")**2))
          .groupBy("diseaseId", "datatypeId", "approvedSymbol", "targetId")
          .agg(F.sum("term_type").alias("type_sum"))
          .join(type_norm_df, on="datatypeId", how="left")
          .withColumn("associationScore", col("type_sum") / col("typeNorm"))
    )

    # Stage 3: overall association score across all datasources
    # Harmonic sum of the weighted ds_score values, normalized by H_INF
    win_overall = Window.partitionBy(
        "diseaseId", "approvedSymbol", "targetId"
    ).orderBy(F.desc("weighted_ds"))
    overall_scores = (
        weighted_ds_scores
          .withColumn("rank_overall", F.row_number().over(win_overall))
          .withColumn("term_overall", col("weighted_ds") / (col("rank_overall")**2))
          .groupBy("diseaseId", "approvedSymbol", "targetId")
          .agg(F.sum("term_overall").alias("overall_sum"))
          .withColumn("overallScore", col("overall_sum") / F.lit(H_INF))
    )

    return ds_scores, type_scores, overall_scores

                                                                                

In [11]:
# 7. Execute for oncology and non-oncology
# Evidence types that count towards the ">= 500 genes" disease selection for
# oncology. Defined here because no executable cell above defines this name.
oncology_types = ["genetic_association", "somatic_mutation"]

# Score only diseases that list MONDO_0045024 among their therapeutic areas, so
# that the result written to the oncology output contains oncology diseases only.
evidence_onco_all_types = evidence_ta.filter(
    array_contains(col("therapeuticAreas"), "MONDO_0045024")
)

oncology_ds, oncology_type, oncology_overall = _compute_scores(
    evidence_onco_all_types, oncology_types
)
# non_oncology_scores = _compute_scores(evidence, non_oncology_types)

In [None]:
# 8. Persist results
oncology_output_path = "gs://ot-team/polina/pathwaganda/processed_diseases/oncology/"
# non_oncology_output_path = "gs://ot-team/polina/pathwaganda/processed_diseases/non_oncology/"

# Select only the needed columns and write partitioned by diseaseId.
# The loop variable is deliberately not called `df`, so it does not overwrite
# the notebook-level `df` built in an earlier cell.
for scores_df, out_path in [
    (oncology_overall, oncology_output_path)
    # (non_oncology_overall, non_oncology_overall_path)
]:
    (
        scores_df
        .select("diseaseId", "targetId", "approvedSymbol", "overallScore")
        # Bring each disease's rows into one Spark partition before writing so a
        # disease directory is not written to by many tasks (many small files).
        .repartition("diseaseId")
        .write
        .mode("overwrite")
        .partitionBy("diseaseId")
        .parquet(out_path)
    )


# non_oncology_scores.write.mode("overwrite").parquet(non_oncology_output_path)

[Stage 30:>   (0 + 4) / 6][Stage 31:>   (0 + 0) / 2][Stage 32:>   (0 + 0) / 5]

In [14]:
# 6. Process oncology

# Names this cell needs; defined here so the cell runs on a fresh kernel.
ONCOLOGY_ID = "MONDO_0045024"  # therapeutic-area ID used to select oncology diseases
oncology_types = ["genetic_association", "somatic_mutation"]
output_dir = "gs://ot-team/polina/pathwaganda/processed_diseases"

ev_onc = (
    evidence.join(broadcast(disease_df), on="diseaseId")
            .filter(array_contains(col("therapeuticAreas"), ONCOLOGY_ID))
)
# _compute_scores returns (ds_scores, type_scores, overall_scores); only the
# overall scores are written out.
onc_ds_scores, onc_type_scores, result_onc = _compute_scores(ev_onc, oncology_types)
# Write out
result_onc.select("diseaseId","approvedSymbol","targetId","overallScore") \
           .repartition("diseaseId") \
           .write.mode("overwrite") \
           .partitionBy("diseaseId") \
           .parquet(f"{output_dir}/oncology")

                                                                                

In [20]:
# File check: number of rows written for one disease partition
(
    spark.read
    .format("parquet")
    .load("gs://ot-team/polina/pathwaganda/processed_diseases/oncology/diseaseId=EFO_0000095")
    .count()
)

5801

In [21]:
from pyspark.sql.functions import desc

# Top four genes by overallScore for one written disease partition.
(
    spark.read
    .format("parquet")
    .load("gs://ot-team/polina/pathwaganda/processed_diseases/oncology/diseaseId=EFO_0000095")
    .orderBy(desc("overallScore"))
    .show(4)
)

+--------------+---------------+------------------+
|approvedSymbol|       targetId|      overallScore|
+--------------+---------------+------------------+
|          TP53|ENSG00000141510|1.0734668108113399|
|         SF3B1|ENSG00000115524| 1.039870844982318|
|           ATM|ENSG00000149311| 1.026074643823489|
|          POT1|ENSG00000128513|1.0005294815256116|
+--------------+---------------+------------------+
only showing top 4 rows



In [22]:
# 7. Process non-oncology

# Names this cell needs; defined here so the cell runs on a fresh kernel.
ONCOLOGY_ID = "MONDO_0045024"  # therapeutic-area ID used to exclude oncology diseases
non_oncology_types = ["genetic_association"]
output_dir = "gs://ot-team/polina/pathwaganda/processed_diseases"

ev_non = (
    evidence.join(broadcast(disease_df), on="diseaseId")
            .filter(~array_contains(col("therapeuticAreas"), ONCOLOGY_ID))
)
# _compute_scores returns (ds_scores, type_scores, overall_scores); only the
# overall scores are written out.
non_ds_scores, non_type_scores, result_non = _compute_scores(ev_non, non_oncology_types)
# Write out
result_non.select("diseaseId","approvedSymbol","targetId","overallScore") \
           .repartition("diseaseId") \
           .write.mode("overwrite") \
           .partitionBy("diseaseId") \
           .parquet(f"{output_dir}/non_oncology")

                                                                                

In [24]:
# File check: number of rows written for one disease partition
(
    spark.read
    .format("parquet")
    .load("gs://ot-team/polina/pathwaganda/processed_diseases/non_oncology/diseaseId=EFO_0000195")
    .count()
)

4669

In [25]:
# Top four genes by overallScore for one written disease partition.
(
    spark.read
    .format("parquet")
    .load("gs://ot-team/polina/pathwaganda/processed_diseases/non_oncology/diseaseId=EFO_0000195")
    .orderBy(desc("overallScore"))
    .show(4)
)

+--------------+---------------+------------------+
|approvedSymbol|       targetId|      overallScore|
+--------------+---------------+------------------+
|        DYRK1B|ENSG00000105204|1.1051797884522627|
|          APOB|ENSG00000084674|0.9262022437993443|
|           LPL|ENSG00000175445|0.8864316879980596|
|          LIPC|ENSG00000166035|0.8602447264212504|
+--------------+---------------+------------------+
only showing top 4 rows

