# Target-disease genetic evidence from Open Targets Platform

This notebook prepares ranked gene lists for every disease in the Open Targets Platform that has at least 500 genetically supported genes (genes supported by genetic associations; for oncology traits, somatic mutations are counted as well).

In [4]:
from pyspark.sql import SparkSession, DataFrame, Window
from pyspark.sql.functions import (
    col, countDistinct, row_number, sum as spark_sum,
    broadcast, array_contains
)
import gcsfs

ModuleNotFoundError: No module named 'pyspark'

In [3]:
# Per-datasource weights, keyed by Open Targets `datasourceId`.
# `_compute_scores` left-joins this mapping onto the evidence rows and
# multiplies each row's `score` by its weight, so a data source that is absent
# from the mapping ends up with a null weighted score.
#
# Every value is written as a float on purpose: the mapping is turned into a
# Spark DataFrame, and PySpark's schema inference can refuse to merge a column
# that mixes Python ints (LongType) with floats (DoubleType).
weights = {
    # "ot_genetics_portal": 1.0,  # deliberately left out of the mapping
    # Weight 1.0
    "gwas_credible_sets": 1.0,
    "gene_burden": 1.0,
    "eva": 1.0,
    "genomics_england": 1.0,
    "gene2phenotype": 1.0,
    "uniprot_literature": 1.0,
    "uniprot_variants": 1.0,
    "orphanet": 1.0,
    "clingen": 1.0,
    "cancer_gene_census": 1.0,
    "intogen": 1.0,
    "eva_somatic": 1.0,
    "cancer_biomarkers": 1.0,
    "chembl": 1.0,
    "crispr_screen": 1.0,
    "crispr": 1.0,
    "slapenrich": 0.5,
    "progeny": 0.5,
    "reactome": 1.0,
    "sysbio": 0.5,
    # Down-weighted sources
    "europepmc": 0.2,
    "expression_atlas": 0.2,
    "impc": 0.2,
    "ot_crispr_validation": 0.5,
    "ot_crispr": 0.5,
    "encore": 0.5,
}

In [None]:
# Oncology MONDO code constant: diseases whose `therapeuticArea` array contains
# this identifier are routed to the oncology branch of the pipeline.
ONCOLOGY_ID = "MONDO_0045024"

# Initialize Spark
spark = SparkSession.builder.appName("ProcessDiseasesNotebook").getOrCreate()

# 1. Load inputs by specifying your GCS paths directly
evidence_path = "gs://your-bucket/association_by_datasource_indirect"
target_path   = "gs://your-bucket/targets"
disease_path  = "gs://your-bucket/disease_to_area"
output_dir    = "gs://your-bucket/processed"
include_animal_models = True   # or False

# 2. Read and prepare DataFrames

# Evidence rows are annotated with the gene symbol; the left join keeps rows
# whose targetId has no match in the target table (approvedSymbol is then null).
evidence = spark.read.parquet(evidence_path)
target_df = (
    spark.read.parquet(target_path)
         .select(col("id").alias("targetId"), col("approvedSymbol"))
)
evidence = evidence.join(broadcast(target_df), on="targetId", how="left")

# Disease -> therapeutic-area lookup used to split oncology from non-oncology.
disease_df = (
    spark.read.parquet(disease_path)
         .select(col("id").alias("diseaseId"), col("therapeuticArea"))
)

# 3. Data source weights
# The weights are the `weights` mapping defined in the earlier cell. It must
# NOT be re-assigned here: rebinding it to an empty placeholder discards the
# real weights that `_compute_scores` reads. Fail fast if it is empty.
if not weights:
    raise ValueError(
        "`weights` is empty; define the data source weights before running this cell."
    )

# 4. Build evidence type lists based on animal-model flag
# These lists decide which evidence counts towards the ">= 500 genes" disease
# filter; somatic mutations are only counted for oncology diseases.
oncology_types     = ["genetic_association", "somatic_mutation"]
non_oncology_types = ["genetic_association"]
if include_animal_models:
    oncology_types.append("animal_model")
    non_oncology_types.append("animal_model")
# 5. Define processing functions

def _compute_scores(ev: DataFrame, evidence_types: list) -> DataFrame:
    """Score every gene of every qualifying disease with a weighted harmonic sum.

    Reads the notebook-level globals ``spark`` (the active session) and
    ``weights`` (datasourceId -> weight).

    Steps:
      1. Keep diseases that have at least 500 distinct ``approvedSymbol``
         values among rows whose ``datatypeId`` is one of ``evidence_types``.
      2. For those diseases, weight *all* rows of ``ev`` (not only the rows of
         the requested types) by their data source weight.
      3. Within each (disease, datatype, gene), sum ``score_weighted / rank**2``
         over data sources ranked by descending weighted score.
      4. Within each (disease, gene), sum ``sourceSum / rank**2`` over
         datatypes ranked by descending ``sourceSum``.

    Args:
        ev: Evidence rows with at least the columns ``diseaseId``,
            ``datatypeId``, ``datasourceId``, ``approvedSymbol``,
            ``targetId`` and ``score``.
        evidence_types: ``datatypeId`` values (matched exactly) that count
            towards the 500-gene disease filter.

    Returns:
        One row per (``diseaseId``, ``approvedSymbol``, ``targetId``) with the
        aggregated ``overallScore``; genes whose score is null (no weighted
        evidence at all) are dropped.
    """
    # Exact membership test: a regex built with "|".join(...) would match
    # substrings and would match every row for an empty list.
    valid = (
        ev.filter(col("datatypeId").isin(*evidence_types))
          .groupBy("diseaseId")
          .agg(countDistinct("approvedSymbol").alias("nGenes"))
          .filter(col("nGenes") >= 500)
    )

    # Explicit schema + float coercion: schema inference fails on an empty
    # mapping and can fail when int and float weights are mixed.
    weights_df = spark.createDataFrame(
        [(source, float(weight)) for source, weight in weights.items()],
        schema="datasourceId string, weight double",
    )

    df = (
        ev.join(valid.select("diseaseId"), on="diseaseId")
          # Left join: data sources without a weight get a null weighted score.
          .join(broadcast(weights_df), on="datasourceId", how="left")
          .withColumn("score_weighted", col("score") * col("weight"))
    )

    # First level: harmonic sum over data sources inside each datatype.
    # Nulls are ranked last so unweighted sources never displace real scores.
    datatype_keys = ["diseaseId", "datatypeId", "approvedSymbol", "targetId"]
    win1 = Window.partitionBy(*datatype_keys).orderBy(
        col("score_weighted").desc_nulls_last()
    )
    df1 = (
        df.withColumn("rank1", row_number().over(win1))
          .withColumn("term1", col("score_weighted") / (col("rank1") ** 2))
          .groupBy(*datatype_keys)
          .agg(spark_sum("term1").alias("sourceSum"))
    )

    # Second level: harmonic sum over datatypes for each gene. The per-datatype
    # terms must be summed; without the aggregation the result would hold one
    # partial term per datatype instead of a single score per gene.
    gene_keys = ["diseaseId", "approvedSymbol", "targetId"]
    win2 = Window.partitionBy(*gene_keys).orderBy(
        col("sourceSum").desc_nulls_last()
    )
    df2 = (
        df1.withColumn("rank2", row_number().over(win2))
           .withColumn("term2", col("sourceSum") / (col("rank2") ** 2))
           .groupBy(*gene_keys)
           .agg(spark_sum("term2").alias("overallScore"))
           .filter(col("overallScore").isNotNull())
    )
    return df2

# 6. Process oncology

def _save_ranking(scores: DataFrame, subdir: str) -> None:
    """Write the four output columns of ``scores`` as parquet under ``output_dir``.

    The data is repartitioned by ``diseaseId`` and written with
    ``partitionBy("diseaseId")``; an existing output at the target path is
    overwritten.
    """
    trimmed = scores.select("diseaseId", "approvedSymbol", "targetId", "overallScore")
    writer = (
        trimmed.repartition("diseaseId")
               .write.mode("overwrite")
               .partitionBy("diseaseId")
    )
    writer.parquet(f"{output_dir}/{subdir}")


# Membership test shared by both branches.
# NOTE(review): a disease with a null therapeuticArea presumably satisfies
# neither this condition nor its negation, so it lands in neither output —
# confirm this is intended.
_in_oncology = array_contains(col("therapeuticArea"), ONCOLOGY_ID)

# Evidence for diseases inside the oncology therapeutic area.
ev_onc = evidence.join(broadcast(disease_df), on="diseaseId").filter(_in_oncology)
result_onc = _compute_scores(ev_onc, oncology_types)
# Write out
_save_ranking(result_onc, "oncology")

# 7. Process non-oncology

# Evidence for diseases outside the oncology therapeutic area.
ev_non = evidence.join(broadcast(disease_df), on="diseaseId").filter(~_in_oncology)
result_non = _compute_scores(ev_non, non_oncology_types)
# Write out
_save_ranking(result_non, "non_oncology")

print("Processing complete.")
