# Target-disease genetic evidence from Open Targets Platform

This notebook prepares, for every disease in the Open Targets Platform, a ranked list of genes together with the number of genetically supported genes (genetic associations for all traits, plus somatic mutations for oncological traits).

In [16]:
import math
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, broadcast, countDistinct, array_contains
from pyspark.sql.window import Window

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-active session when there is one and builds a new one otherwise.
spark = (
    SparkSession.builder
    .appName("ProcessDiseasesNotebook")
    .getOrCreate()
)

In [17]:
# 1. Load inputs by specifying your GCS paths directly
# The release tag is defined exactly once so that the three inputs below are
# always read from the same data release; change it here to switch releases.
ot_release = "25.06"
ot_output_root = f"gs://open-targets-data-releases/{ot_release}/output"

evidence_path = f"{ot_output_root}/association_by_datasource_indirect"
target_path   = f"{ot_output_root}/target"
disease_path  = f"{ot_output_root}/disease"
# output_dir    = "gs://ot-team/polina/pathwaganda/processed_diseases"
# include_animal_models = False

In [18]:
# 2. Read and prepare DataFrames
# Target lookup reduced to the two columns needed here: (targetId, approvedSymbol).
target_df = spark.read.parquet(target_path).select(
    F.col("id").alias("targetId"),
    "approvedSymbol",
)

# Disease lookup reduced to (diseaseId, therapeuticAreas).
disease_df = spark.read.parquet(disease_path).select(
    F.col("id").alias("diseaseId"),
    "therapeuticAreas",
)

# Left joins keep every association row; a row with no match in a lookup
# table gets NULL in the added column. Both lookups are sent as broadcast
# hints so the association table is the only large side of each join.
evidence = (
    spark.read.parquet(evidence_path)
    .join(F.broadcast(target_df), on="targetId", how="left")
)
evidence_ta = evidence.join(F.broadcast(disease_df), on="diseaseId", how="left")

# 3. Define your data source weights
# Listed as (datasourceId, weight) pairs; the dict keeps the listing order.
weights = dict([
    # ("ot_genetics_portal", 1),
    ("gwas_credible_sets", 1),
    ("gene_burden", 1),
    ("eva", 1),
    ("genomics_england", 1),
    ("gene2phenotype", 1),
    ("uniprot_literature", 1),
    ("uniprot_variants", 1),
    ("orphanet", 1),
    ("clingen", 1),
    ("cancer_gene_census", 1),
    ("intogen", 1),
    ("eva_somatic", 1),
    ("cancer_biomarkers", 1),
    ("chembl", 1),
    ("crispr_screen", 1),
    ("crispr", 1),
    ("slapenrich", 0.5),
    ("progeny", 0.5),
    ("reactome", 1),
    ("sysbio", 0.5),
    ("europepmc", 0.2),
    ("expression_atlas", 0.2),
    ("impc", 0.2),
    ("ot_crispr_validation", 0.5),
    ("ot_crispr", 0.5),
    ("encore", 0.5),
])

# 4. Build evidence type lists based on animal-model flag
# oncology_types     = ["genetic_association", "somatic_mutation"]
# non_oncology_types = ["genetic_association"]
# if include_animal_models:
#     oncology_types.append("animal_model")
#     non_oncology_types.append("animal_model")

In [19]:
# Oncology subset: rows whose disease lists MONDO_0045024 among its
# therapeutic areas, restricted to the genetic_association and
# somatic_mutation datatypes. Two successive filters keep a row only when
# both conditions are true, exactly like a single AND-ed condition.
evidence_onco = (
    evidence_ta
    .where(F.col("datatypeId").isin(["genetic_association", "somatic_mutation"]))
    .where(F.array_contains("therapeuticAreas", "MONDO_0045024"))
)

In [21]:
# Non-oncology subset: genetic_association rows whose disease does NOT list
# MONDO_0045024 among its therapeutic areas.
# NOTE: array_contains yields NULL for a NULL array, and NOT NULL is still
# NULL, so rows with a NULL therapeuticAreas (no match in the disease
# lookup) pass neither this filter nor the oncology one.
evidence_non_onco = (
    evidence_ta
    .where(F.col("datatypeId") == "genetic_association")
    .where(~F.array_contains("therapeuticAreas", "MONDO_0045024"))
)

Output: 
diseaseId: targetId, approvedSymbol, overallScore

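Input:
_diseaseId_ _targetId_, _approvedSymbol_, _datatypeId_, datasourceId, datasourceWeight, score
1. weightedScore = datasourceWeight * score
2. Within each (_diseaseId_, _targetId_, _approvedSymbol_, _datatypeId_) group, sort the datasource rows by weightedScore in descending order
3. ordIndex = 1-based position of each datasource row in that ordering
4. Compute hSum = Sum(weightedScore_i / ordIndex_i^2) / maxTheo, where maxTheo = Sum_{k=1..n}(1 / k^2) for a group with n ≤ 20 datasource rows and 1.644 otherwise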

_diseaseId_ _targetId_, _approvedSymbol_, _datatypeId_, hSum
1. Sort by datatypeId
2. ordIndex = Compute order index of dataSource
3. Compute amount of datatypes per (_diseaseId_ _targetId_, _approvedSymbol_)
_diseaseId_ _targetId_, _approvedSymbol_, _datatypeId_, ordIndex, hSum, datatypes_amount
4. Compute HSum by taking Sum(hSum_i / ordIndex_i^2) / datatypes_amount
_diseaseId_ _targetId_, _approvedSymbol_, overallHSum

datatypeScore = weightedHsumNorm(datasourceScore(score))

overallScore = HsumNorm(datatypeScore) / 1.644

HsumNorm = $\sum_i \mathrm{score}_i / i^2$, where $i$ is the 1-based positional index (ordIndex) of $\mathrm{score}_i$

In [24]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, broadcast, udf
from pyspark.sql.types import DoubleType

sc = spark.sparkContext

# --- 1) datasource weights as a small two-column DataFrame ---
#    weights is a dict such as { 'ds1': 0.3, 'ds2': 0.5, ... }; every value
#    is coerced to float so the weight column has a single numeric type.
weights_df = spark.createDataFrame(
    [(source_id, float(weight)) for source_id, weight in weights.items()],
    ['datasourceId', 'datasourceWeight'],
)

# Grouping key shared by both windows below: one group per
# (disease, target, symbol, datatype).
grp_cols = ['diseaseId', 'targetId', 'approvedSymbol', 'datatypeId']
count_win = Window.partitionBy(*grp_cols)
order_win = Window.partitionBy(*grp_cols).orderBy(F.col('weightedScore').desc())

# --- 2) weightedScore = datasourceWeight * score
# --- 3) ordIndex      = 1-based rank of weightedScore (descending) in the group
# --- 4) nDs           = number of rows (datasources) in the group
# NOTE: the weights join is a left join, so a datasource that is missing from
# `weights` gets a NULL weight and a NULL weightedScore, yet its row is still
# counted in nDs.
df = (
    evidence_onco
    .join(F.broadcast(weights_df), on='datasourceId', how='left')
    .withColumn('weightedScore', F.col('datasourceWeight') * F.col('score'))
    .withColumn('ordIndex', F.row_number().over(order_win))
    .withColumn('nDs', F.count('*').over(count_win))
)

# --- 5) precompute Σ_{k=1..n} 1/k² for n=1..20, else 1.644 ---
theo = {i: sum(1.0/j**2 for j in range(1,i+1)) for i in range(1,21)}
# The native expression below no longer reads this broadcast; it is kept only
# so that any other cell still referring to `broadcast_theo` keeps working.
broadcast_theo = sc.broadcast(theo)


def max_theoretical(n):
    """Build a Spark column holding the maximum theoretical harmonic sum.

    The lookup is expressed as a native CASE WHEN over the 20 precomputed
    partial sums instead of a Python UDF, so rows are not shipped to Python
    workers one at a time just to read a tiny table.

    Parameters
    ----------
    n : Column or str
        Column (or column name) with the number of rows in the group.

    Returns
    -------
    Column
        Double column equal to ``theo[n]`` for 1 <= n <= 20 and to 1.644 for
        n > 20. It is NULL when ``n`` is NULL or below 1, because no branch
        matches and there is deliberately no ``otherwise``.
    """
    n = F.col(n) if isinstance(n, str) else n
    lookup = F.when(n > 20, F.lit(1.644))
    for size, partial_sum in theo.items():
        lookup = lookup.when(n == size, F.lit(partial_sum))
    return lookup


df = df.withColumn(
    'maxTheo',
    max_theoretical(col('nDs'))
)

# --- 6) harmonic sum per group, normalised by the group's maxTheo ---
# Each weighted score is damped by the square of its rank in the group.
df = df.withColumn('scaled', F.col('weightedScore') / (F.col('ordIndex') ** 2))

harmonic_total = F.sum('scaled')
# maxTheo is derived from the per-group row count, so it is the same on every
# row of a group and first() can take it from any of them.
group_max = F.first('maxTheo')

result = df.groupBy(*grp_cols).agg((harmonic_total / group_max).alias('hSum'))

# result columns: diseaseId, targetId, approvedSymbol, datatypeId, hSum


In [28]:
# Peek at the five largest hSum values that are strictly below 0.8.
(
    result
    .where(F.col("hSum") < 0.8)
    .orderBy(F.col("hSum").desc())
    .show(5)
)

[Stage 74:>                                                         (0 + 2) / 2]

+-------------+---------------+--------------+-------------------+------------------+
|    diseaseId|       targetId|approvedSymbol|         datatypeId|              hSum|
+-------------+---------------+--------------+-------------------+------------------+
|MONDO_0002149|ENSG00000111276|        CDKN1B|genetic_association|0.7999966200371057|
|  EFO_0005952|ENSG00000171456|         ASXL1|   somatic_mutation|0.7999807407658193|
|  EFO_1000218|ENSG00000130396|          AFDN|   somatic_mutation|0.7999773630884303|
|MONDO_0002516|ENSG00000169184|           MN1|   somatic_mutation|0.7999730053348342|
|  EFO_0003859|ENSG00000100697|        DICER1|   somatic_mutation|0.7999362531293202|
+-------------+---------------+--------------+-------------------+------------------+
only showing top 5 rows



                                                                                