# Target-disease genetic evidence from Open Targets Platform

This code prepares ranked lists of genes for every disease in the Open Targets Platform that has at least 500 genetically supported genes (genetic evidence, plus somatic-mutation evidence for oncological traits).

In [None]:
import argparse
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.functions import (
    col, countDistinct, row_number, sum as spark_sum,
    broadcast, array_contains
)
import gcsfs

ModuleNotFoundError: No module named 'pyspark'

In [3]:
# Per-datasource multipliers applied to evidence scores, keyed by datasourceId.
# Every value is written as a float on purpose: the mapping is turned into a
# Spark DataFrame column downstream, and a column that mixes Python ints and
# floats can fail PySpark's per-row schema inference (LongType vs DoubleType).
weights = {
    # "ot_genetics_portal": 1.0,
    "gwas_credible_sets": 1.0,
    "gene_burden": 1.0,
    "eva": 1.0,
    "genomics_england": 1.0,
    "gene2phenotype": 1.0,
    "uniprot_literature": 1.0,
    "uniprot_variants": 1.0,
    "orphanet": 1.0,
    "clingen": 1.0,
    "cancer_gene_census": 1.0,
    "intogen": 1.0,
    "eva_somatic": 1.0,
    "cancer_biomarkers": 1.0,
    "chembl": 1.0,
    "crispr_screen": 1.0,
    "crispr": 1.0,
    "slapenrich": 0.5,
    "progeny": 0.5,
    "reactome": 1.0,
    "sysbio": 0.5,
    "europepmc": 0.2,
    "expression_atlas": 0.2,
    "impc": 0.2,
    "ot_crispr_validation": 0.5,
    "ot_crispr": 0.5,
    "encore": 0.5,
}

In [None]:
# Therapeutic-area identifier used to split diseases into oncology vs.
# non-oncology: a disease counts as oncology when this ID appears in its
# `therapeuticArea` array.
# NOTE(review): presumably the MONDO term for "cancer or benign tumor" —
# confirm against the disease index in use.
ONCOLOGY_ID = "MONDO_0045024"


def process_oncology(
    evidence_sour: DataFrame,
    weights: dict,
    disease_df: DataFrame,
    output_dir: str
):
    """
    Score and export gene rankings for oncology diseases.

    A disease is treated as oncology when its ``therapeuticArea`` array
    contains ONCOLOGY_ID. Evidence rows are kept when ``datatypeId`` matches
    genetic_association, animal_model or somatic_mutation, and only diseases
    with at least 500 distinct ``approvedSymbol`` values are retained.

    Scoring is a two-level rank-discounted (harmonic) sum:
      1. within each (disease, datatype, gene, target) the weighted scores are
         sorted descending and summed as ``score_weighted / rank**2``;
      2. the resulting per-datatype sums are sorted descending per
         (disease, gene, target) and summed the same way, giving exactly one
         ``overallScore`` per (disease, gene, target).

    Args:
        evidence_sour: Evidence rows with columns ``diseaseId``, ``targetId``,
            ``approvedSymbol``, ``datatypeId``, ``datasourceId`` and ``score``.
        weights: Mapping of ``datasourceId`` to a numeric multiplier. Rows
            whose datasource is missing from the mapping get a null weighted
            score and therefore contribute nothing to the sums.
        disease_df: Disease table with columns ``diseaseId`` and
            ``therapeuticArea`` (array of therapeutic-area IDs).
        output_dir: Output prefix; results are written as Parquet to
            ``{output_dir}/oncology``, partitioned by ``diseaseId``, replacing
            any existing output there.
    """
    spark = evidence_sour.sparkSession

    # Build the weights lookup with an explicit schema and float-coerced
    # values: schema inference fails on an empty mapping and can fail when
    # int and float weights are mixed.
    weights_df = spark.createDataFrame(
        [(str(source), float(weight)) for source, weight in weights.items()],
        schema="datasourceId string, weight double",
    )

    # Filter to oncology diseases and evidence types
    ev = (
        evidence_sour
        .join(broadcast(disease_df), on="diseaseId")
        .filter(array_contains(col("therapeuticArea"), ONCOLOGY_ID))
        .filter(col("datatypeId").rlike("genetic_association|animal_model|somatic_mutation"))
    )

    # Keep diseases with >=500 unique genes
    valid = (
        ev.groupBy("diseaseId")
          .agg(countDistinct("approvedSymbol").alias("nGenes"))
          .filter(col("nGenes") >= 500)
    )

    # Join back valid diseases and compute weighted scores
    df = (
        ev.join(valid.select("diseaseId"), on="diseaseId")
          .join(broadcast(weights_df), on="datasourceId", how="left")
          .withColumn("score_weighted", col("score") * col("weight"))
    )

    # First-level harmonic sum: per (disease, datatype, gene, target)
    win1 = Window.partitionBy(
        "diseaseId", "datatypeId", "approvedSymbol", "targetId"
    ).orderBy(col("score_weighted").desc())

    df1 = (
        df.withColumn("rank1", row_number().over(win1))
          .withColumn("term1", col("score_weighted") / (col("rank1") ** 2))
          .groupBy("diseaseId", "datatypeId", "approvedSymbol", "targetId")
          .agg(spark_sum("term1").alias("sourceSum"))
    )

    # Second-level harmonic sum: per (disease, gene, target)
    win2 = Window.partitionBy(
        "diseaseId", "approvedSymbol", "targetId"
    ).orderBy(col("sourceSum").desc())

    # The rank-discounted datatype terms must be summed; without the
    # aggregation each gene would appear once per datatype with only a
    # partial score.
    df2 = (
        df1.withColumn("rank2", row_number().over(win2))
           .withColumn("term2", col("sourceSum") / (col("rank2") ** 2))
           .groupBy("diseaseId", "approvedSymbol", "targetId")
           .agg(spark_sum("term2").alias("overallScore"))
           .filter(col("overallScore").isNotNull())
    )

    # Write Parquet files partitioned by diseaseId under oncology subdir.
    # The chain is parenthesised so it can legally span several lines.
    (
        df2.select("diseaseId", "approvedSymbol", "targetId", "overallScore")
           .repartition("diseaseId")
           .write.mode("overwrite")
           .partitionBy("diseaseId")
           .parquet(f"{output_dir}/oncology")
    )

In [None]:
def process_non_oncology(
    evidence_sour: DataFrame,
    weights: dict,
    disease_df: DataFrame,
    output_dir: str
):
    """
    Score and export gene rankings for non-oncology diseases.

    A disease is treated as non-oncology when its ``therapeuticArea`` array
    does not contain ONCOLOGY_ID. Evidence rows are kept when ``datatypeId``
    matches genetic_association or animal_model (somatic_mutation evidence is
    excluded), and only diseases with at least 500 distinct
    ``approvedSymbol`` values are retained.

    Scoring is a two-level rank-discounted (harmonic) sum:
      1. within each (disease, datatype, gene, target) the weighted scores are
         sorted descending and summed as ``score_weighted / rank**2``;
      2. the resulting per-datatype sums are sorted descending per
         (disease, gene, target) and summed the same way, giving exactly one
         ``overallScore`` per (disease, gene, target).

    Args:
        evidence_sour: Evidence rows with columns ``diseaseId``, ``targetId``,
            ``approvedSymbol``, ``datatypeId``, ``datasourceId`` and ``score``.
        weights: Mapping of ``datasourceId`` to a numeric multiplier. Rows
            whose datasource is missing from the mapping get a null weighted
            score and therefore contribute nothing to the sums.
        disease_df: Disease table with columns ``diseaseId`` and
            ``therapeuticArea`` (array of therapeutic-area IDs).
        output_dir: Output prefix; results are written as Parquet to
            ``{output_dir}/non_oncology``, partitioned by ``diseaseId``,
            replacing any existing output there.
    """
    spark = evidence_sour.sparkSession

    # Build the weights lookup with an explicit schema and float-coerced
    # values: schema inference fails on an empty mapping and can fail when
    # int and float weights are mixed.
    weights_df = spark.createDataFrame(
        [(str(source), float(weight)) for source, weight in weights.items()],
        schema="datasourceId string, weight double",
    )

    # Filter to non-oncology diseases and evidence types
    ev = (
        evidence_sour
        .join(broadcast(disease_df), on="diseaseId")
        .filter(~array_contains(col("therapeuticArea"), ONCOLOGY_ID))
        .filter(col("datatypeId").rlike("genetic_association|animal_model"))
    )

    # Keep diseases with >=500 unique genes
    valid = (
        ev.groupBy("diseaseId")
          .agg(countDistinct("approvedSymbol").alias("nGenes"))
          .filter(col("nGenes") >= 500)
    )

    # Join back and compute weighted scores
    df = (
        ev.join(valid.select("diseaseId"), on="diseaseId")
          .join(broadcast(weights_df), on="datasourceId", how="left")
          .withColumn("score_weighted", col("score") * col("weight"))
    )

    # First-level harmonic sum: per (disease, datatype, gene, target)
    win1 = Window.partitionBy(
        "diseaseId", "datatypeId", "approvedSymbol", "targetId"
    ).orderBy(col("score_weighted").desc())

    df1 = (
        df.withColumn("rank1", row_number().over(win1))
          .withColumn("term1", col("score_weighted") / (col("rank1") ** 2))
          .groupBy("diseaseId", "datatypeId", "approvedSymbol", "targetId")
          .agg(spark_sum("term1").alias("sourceSum"))
    )

    # Second-level harmonic sum: per (disease, gene, target)
    win2 = Window.partitionBy(
        "diseaseId", "approvedSymbol", "targetId"
    ).orderBy(col("sourceSum").desc())

    # The rank-discounted datatype terms must be summed; without the
    # aggregation each gene would appear once per datatype with only a
    # partial score.
    df2 = (
        df1.withColumn("rank2", row_number().over(win2))
           .withColumn("term2", col("sourceSum") / (col("rank2") ** 2))
           .groupBy("diseaseId", "approvedSymbol", "targetId")
           .agg(spark_sum("term2").alias("overallScore"))
           .filter(col("overallScore").isNotNull())
    )

    # Write Parquet files partitioned by diseaseId under non_oncology subdir.
    # The chain is parenthesised so it can legally span several lines.
    (
        df2.select("diseaseId", "approvedSymbol", "targetId", "overallScore")
           .repartition("diseaseId")
           .write.mode("overwrite")
           .partitionBy("diseaseId")
           .parquet(f"{output_dir}/non_oncology")
    )

In [None]:
if __name__ == "__main__":
    # Command-line entry point: read the evidence, target and disease Parquet
    # inputs, attach gene symbols to the evidence, then run the oncology and
    # non-oncology pipelines against the same output prefix.
    parser = argparse.ArgumentParser(description="Process disease evidence by area.")
    parser.add_argument(
        "--evidence-path", required=True,
        help="GCS path to association_by_datasource_indirect parquet file"
    )
    parser.add_argument(
        "--target-path", required=True,
        help="GCS path to target parquet file"
    )
    parser.add_argument(
        "--disease-path", required=True,
        help="GCS path to disease-to-area parquet file"
    )
    parser.add_argument(
        "--output-dir", required=True,
        help="GCS output prefix for Parquet files"
    )
    args = parser.parse_args()

    spark = SparkSession.builder.appName("ProcessDiseases").getOrCreate()

    # try/finally so the Spark session is stopped even if a step fails.
    try:
        # Load evidence and enrich with gene names
        evidence = spark.read.parquet(args.evidence_path)
        target_df = spark.read.parquet(args.target_path).select(
            col("id").alias("targetId"), col("approvedSymbol")
        )
        # Left join keeps evidence rows whose target has no symbol entry.
        evidence = evidence.join(
            broadcast(target_df), on="targetId", how="left"
        )

        # Load disease-to-area mapping
        disease_df = spark.read.parquet(args.disease_path).select(
            col("id").alias("diseaseId"), col("therapeuticArea")
        )

        # Use the module-level `weights` mapping defined earlier in this file.
        # It must not be rebound to an empty placeholder here: with no weights
        # every weighted score is null and nothing is scored.

        # Execute processing pipelines
        process_oncology(evidence, weights, disease_df, args.output_dir)
        process_non_oncology(evidence, weights, disease_df, args.output_dir)
    finally:
        spark.stop()