Prepare gene sets (one GMT file per facet category) from the target search facets obtained from gs://open-targets-pre-data-releases/25.06/view/search_facet_target/

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/26 14:52:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/26 14:52:57 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the local copy of the search_facet_target parquet dataset.
# The path is relative to the notebook's working directory and uses the same
# "../data" root as the GMT export cell, instead of a user-specific absolute
# path, so the notebook can run on other machines / checkouts.
facets = spark.read.parquet("../data/search_facet_target")

In [4]:
# List every distinct facet category present in the dataset, untruncated.
facets.select(col("category")).distinct().show(truncate=False)

+-----------------------------+
|category                     |
+-----------------------------+
|Approved Name                |
|Approved Symbol              |
|Target ID                    |
|GO:CC                        |
|GO:MF                        |
|GO:BP                        |
|Subcellular Location         |
|Tractability PROTAC          |
|Tractability Other Modalities|
|Tractability Antibody        |
|Reactome                     |
|Tractability Small Molecule  |
|ChEMBL Target Class          |
+-----------------------------+



                                                                                

In [7]:
# Preview the rows belonging to the "Subcellular Location" facet category.
facets.where(F.col("category") == "Subcellular Location").show()

+--------------------+--------------------+--------------------+------------+
|               label|            category|           entityIds|datasourceId|
+--------------------+--------------------+--------------------+------------+
|            Acrosome|Subcellular Location|[ENSG00000176988,...|        NULL|
|     Actin filaments|Subcellular Location|[ENSG00000118407,...|     SL-0090|
|           Aggresome|Subcellular Location|[ENSG00000004975,...|        NULL|
|             Annulus|Subcellular Location|[ENSG00000130363,...|        NULL|
|Apical cell membrane|Subcellular Location|[ENSG00000086548,...|     SL-0015|
|Apicolateral cell...|Subcellular Location|[ENSG00000137860,...|     SL-0017|
|        Autolysosome|Subcellular Location|[ENSG00000087086,...|     SL-0535|
|Autolysosome memb...|Subcellular Location|   [ENSG00000225190]|     SL-0536|
|          Basal body|Subcellular Location|[ENSG00000248144,...|        NULL|
| Basal cell membrane|Subcellular Location|[ENSG00000134538,...|

In [16]:
import pandas as pd

def export_gmt_files(spark_df, categories, output_dir):
    """Write one GMT-style gene-set file per facet category.

    For each category in ``categories``, the rows of ``spark_df`` with that
    ``category`` are selected, their ``entityIds`` arrays are exploded and
    regrouped by term, and every term becomes one tab-separated line::

        <label>[<datasourceId>]<TAB><id 1><TAB><id 2>...

    The lines are saved to ``{output_dir}/{category}.gmt``.

    Parameters
    ----------
    spark_df : pyspark.sql.DataFrame
        Must provide the columns ``label``, ``category``, ``entityIds``
        (array column) and ``datasourceId``.
    categories : iterable of str
        Values of the ``category`` column to export; each one is used
        verbatim as the output file name (e.g. ``GO:BP.gmt``).
    output_dir : str
        Destination directory; created if it does not exist yet.

    Returns
    -------
    None
        The function is used for its side effect of writing files.

    Notes
    -----
    * ``concat_ws`` skips NULL inputs, so a row without ``datasourceId``
      gets the term name ``"<label>[]"``.
    * Each line is the term followed directly by the IDs; no separate
      description column is written.
      NOTE(review): the conventional GMT layout has a description as the
      second field - confirm the downstream reader expects this layout.
    * NOTE(review): category names containing characters such as ``:`` may
      not be valid file names on every platform - confirm if portability
      matters.
    * The grouped result of each category is collected to the driver with
      ``toPandas()``.
    """
    import os  # local import keeps this cell runnable on its own

    # Create the destination up front so writing does not fail on a fresh checkout.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = spark_df.withColumn(
        "Term",
        F.concat_ws("", F.col("label"), F.lit("["), F.col("datasourceId"), F.lit("]")),
    )

    for cat in categories:
        df_grouped = (
            df.filter(F.col("category") == cat)
            .withColumn("entityId", F.explode("entityIds"))
            .groupBy("Term")
            .agg(F.collect_list("entityId").alias("entities"))
        )

        # Convert Spark -> Pandas (one row per term, small enough for the driver).
        pdf = df_grouped.toPandas()

        # Write the lines directly instead of going through DataFrame.to_csv:
        # the CSV writer would wrap any line whose label contains a comma or a
        # double quote in quotes, corrupting the GMT file. Iterating the rows
        # also copes with a category that matched nothing (empty file).
        output_path = f"{output_dir}/{cat}.gmt"
        with open(output_path, "w", encoding="utf-8") as handle:
            for term, entities in zip(pdf["Term"], pdf["entities"]):
                # Drop repeated IDs within a term while keeping first-seen order.
                unique_ids = dict.fromkeys(entities)
                handle.write("\t".join([term, *unique_ids]) + "\n")


In [17]:
# Facet categories to export, one .gmt file each.
categories = [
    "Reactome",
    "GO:BP",
    "GO:MF",
    "GO:CC",
    "Subcellular Location",
    "ChEMBL Target Class",
]
export_gmt_files(
    spark_df=facets,
    categories=categories,
    output_dir="../data/gmt/from_facets",
)