In [23]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import Row
from pyspark.sql.functions import col, broadcast, when, max, expr, collect_list, concat_ws, array_contains, split
from pyspark.sql.types import BooleanType

In [24]:
spark = SparkSession.builder.getOrCreate()

In [25]:
# Bioactivity data from ChEMBL filtered (exact protein/homolog, assay type != P or U, human targets)
input_path = "gs://ot-team/polina/uniprot_to_class"
input = spark.read.parquet(input_path)

In [26]:
def join_dataframes(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_column: str, 
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Selecting specified columns from the second DataFrame, including its key column
    second_df_selected = second_df.select([second_key_column] + columns_to_join)

    # Performing the left join
    joined_df = initial_df.join(second_df_selected, 
                                initial_df[initial_key_column] == second_df_selected[second_key_column], 
                                how='left')

    # Drop the second key column if not needed
    joined_df = joined_df.drop(second_df_selected[second_key_column])

    return joined_df


In [27]:
def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Count unique values in a specific column of a PySpark DataFrame.

    Args:
    df (DataFrame): The PySpark DataFrame.
    column_name (str): The name of the column to analyze.

    Returns:
    int: The number of unique values in the column.
    """
    # Get distinct values in the column and count them
    unique_count = df.select(column_name).distinct().count()

    return unique_count

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [28]:
def show_unique_values_and_counts(df: DataFrame, column_name: str):
    """
    Shows unique values and their counts for a specified column in a Spark DataFrame.

    Parameters:
    df (DataFrame): The Spark DataFrame to analyze.
    column_name (str): The name of the column for which to count unique values.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    unique_values_counts = df.groupBy(column_name).count()
    unique_values_counts.show()


## Drug activity threshold

### based on pchembl_value and proteinClass

In [29]:
input.show()

[Stage 49:>                                                         (0 + 1) / 1]

+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-----------------+--------------+--------------+-----------------+---------------+--------------------+----------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|    standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|
+-------------+--------------+-------------------------+----------+--------------------+--------------------

                                                                                

In [30]:
input_exmp = input.filter(input["drugId"] == "CHEMBL20")
input_exmp.show()

+--------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+--------------+---------+------------+
|  drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|   target_type|accession|proteinClass|
+--------+--------------+-------------------------+----------+--------------------+--------------------+---------------+--------

                                                                                

## Dataset with only pchembl values

In [18]:
# Check how many assays have pchembl_value
pchembl_value_only = input.filter(input["pchembl_value"].isNotNull())
pchembl_value_only_n = pchembl_value_only.count()
d_t_n = input.count()

print("Number of drug-target pairs:", d_t_n)
print("Number of drug-target pairs with pchembl_value:", pchembl_value_only_n)



Number of drug-target pairs: 18860
Number of drug-target pairs with pchembl_value: 6250


                                                                                

### For each T-D pairs make new columns:
####     1. max_pchembl_value
####     2. median_pchembl_value

In [19]:
# Original aggregation with additional count of non-null pchembl_values
pchembl_value_aggr = pchembl_value_only.groupBy("target_chembl_id", "drugId")\
                  .agg(f.max("pchembl_value").alias("max_pchembl_value"),
                       f.expr("percentile_approx(pchembl_value, 0.5)").alias("median_pchembl_value"))

# Join original data with filtered aggregated data
pchembl_value_join = pchembl_value_aggr.join(input, ["target_chembl_id", "drugId"], "left_outer")

In [20]:
# Leave only unique dug-target pairs with pchembl value

# Group by 'target_chembl_id' and 'drugId', and aggregate 'pchembl_value'
pchembl_value_concat = pchembl_value_join.groupBy("target_chembl_id", "drugId")\
                        .agg(concat_ws(", ", collect_list("pchembl_value")).alias("pchembl_value_aggr"))

pchembl_value_drop = pchembl_value_join.drop("standard_type", "standard_units", "standard_value", "standard_relation", "pchembl_value")
pchembl_value_uniq = pchembl_value_drop.dropDuplicates(["target_chembl_id", "drugId"])

pchembl_value_concat_join = pchembl_value_concat.join(pchembl_value_uniq, ["target_chembl_id", "drugId"], "left_outer")

pchembl_value_concat_join.show()

23/12/18 14:43:02 WARN MemoryStore: Not enough space to cache broadcast_28 in memory! (computed 466.5 MiB so far)
23/12/18 14:43:02 WARN BlockManager: Persisting block broadcast_28 to disk instead.
23/12/18 14:43:28 WARN MemoryStore: Not enough space to cache broadcast_28 in memory! (computed 466.5 MiB so far)


+----------------+-------------+------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+--------------+---------+------------+
|target_chembl_id|       drugId|pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|   target_type|accession|proteinClass|
+----------------+-------------+------------------+-----------------+--------------------+--------------+-------------------------+----------+------------

23/12/18 14:43:28 WARN MemoryStore: Not enough space to cache broadcast_28 in memory! (computed 466.5 MiB so far)
                                                                                

### Activity cutoff for max_pchembl_value and median_pchembl_value

In [21]:
max_pchembl_value = col("max_pchembl_value")
med_pchembl_value = col("median_pchembl_value")

pchembl_max_activity = pchembl_value_concat_join.withColumn(
    "isActive_max",
    when(
        ((col("proteinClass") == "Kinase") & (max_pchembl_value >= 7.7)) |
        ((col("proteinClass") == "GPCR") & (max_pchembl_value >= 6.5)) |
        ((col("proteinClass") == "NR") & (max_pchembl_value >= 6.1)) |
        ((col("proteinClass") == "Transporter") & (max_pchembl_value >= 6.1)) |
        ((col("proteinClass") == "Enzyme") & (max_pchembl_value >= 5.2)) |
        ((col("proteinClass") == "IC") & (max_pchembl_value >= 4.6)) |
        ((col("proteinClass") == "Other") & (max_pchembl_value >= 6.3)) |
        (~(col("proteinClass").isin(["Kinase", "GPCR", "NR", "Transporter", "Enzyme", "IC", "Other"])) & (max_pchembl_value >= 5)),
        "TRUE"
    ).otherwise("FALSE")
)

pchembl_activity = pchembl_max_activity.withColumn(
    "isActive_med",
    when(
        ((col("proteinClass") == "Kinase") & (med_pchembl_value >= 7.7)) |
        ((col("proteinClass") == "GPCR") & (med_pchembl_value >= 6.5)) |
        ((col("proteinClass") == "NR") & (med_pchembl_value >= 6.1)) |
        ((col("proteinClass") == "Transporter") & (med_pchembl_value >= 6.1)) |
        ((col("proteinClass") == "Enzyme") & (med_pchembl_value >= 5.2)) |
        ((col("proteinClass") == "IC") & (med_pchembl_value >= 4.6)) |
        ((col("proteinClass") == "Other") & (med_pchembl_value >= 6.3)) |
        (~(col("proteinClass").isin(["Kinase", "GPCR", "NR", "Transporter", "Enzyme", "IC", "Other"])) & (med_pchembl_value >= 5)),
        "TRUE"
    ).otherwise("FALSE")
)

In [22]:
max_active = pchembl_activity.filter(col("isActive_max") == True).count()
med_active = pchembl_activity.filter(col("isActive_med") == True).count()
all_pchembl = pchembl_activity.count()

print("Number of drug-target pairs with pchembl values: ", all_pchembl)
print("Number of drug-target pairs with active drugs based on max pchembl values: ", max_active)
print("Number of drug-target pairs with active drugs based on med pchembl values: ", med_active)



Number of drug-target pairs with pchembl values:  4647
Number of drug-target pairs with active drugs based on max pchembl values:  2473
Number of drug-target pairs with active drugs based on med pchembl values:  2373


                                                                                

## Dataset without pchembl values

In [None]:
# Need to exclude rows for which pchembl was found

In [None]:
# Where pchembl_value is not available for each T-D pairs make new columns:
#     1. Calculate how much data is this
#     2. Think about what to do with different standard_units
#     3. Ideally:
#         1. max_standard_value_n
#         2. median_standard_value_n
#         3. Cutoff for every n

no_pchembl_value = uniprot_to_class.filter(col("pchembl_value").isNull())
show_unique_values_and_counts(no_pchembl_value, "standard_units")

+--------------+-----+
|standard_units|count|
+--------------+-----+
|          null| 2740|
|     pm/min/mg|  150|
|     degrees C|  139|
|             %| 3123|
|            nM| 6139|
|          /min|    8|
|         /uM/s|    5|
|   nmol/min/mg|    7|
|  mL.min-1.g-1|   16|
|            uM|   78|
|         10'8M|    1|
|            mM|    7|
|     10'-3/min|    1|
|      10'6/M/s|    1|
|      nmol/min|    8|
|            /s|    6|
|          pmol|    3|
|     /s/microM|    1|
|       ug ml-1|    2|
|       ug.mL-1|    4|
+--------------+-----+
only showing top 20 rows



In [None]:
# Make column with activity of molecule: drugActive = TRUE/FALSE
#     1. based protein type and:
#         1. max_pchembl_value
#         2. median_pchembl_value
#     2. based on cutoffs for other experiment types

## Non-pharmacological MoA search

### Target is in MoA of a drug?

In [31]:
mechanism_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_mechanism.jsonl"
mechanism = spark.read.json(mechanism_path)
mechanism.persist()
mechanism.show()

# Rename columns
mechanism_renamed = mechanism.withColumnRenamed("action_type", "action_type_moa")\
                            .withColumnRenamed("target_chembl_id", "target_chembl_id_moa")

[Stage 52:>                                                         (0 + 2) / 2]

+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|       action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL2103825],...|         INHIBITOR|Pancreatic lipase...|[{16953261, PubMe...|     CHEMBL2103825|            CHEMBL2103825|  1699800|      CHEMBL1812|
|{[CHEMBL1200495, ...|           AGONIST|Glucocorticoid re...|[{setid=6d9bf1b0-...|     CHEMBL1200495|                CHEMBL977|  1344612|      CHEMBL2034|
|{[CHEMBL3544919],...|SEQUESTERING AGENT|Heparin sequester...|[{26937198, PubMe...|     CHEMBL3544919|            CHEMBL3544919|  2473107|   CHEMBL2364712|
|{[CHEMBL3989993],...|         INHIBITOR|microRNA-155 inhi...|[{

                                                                                

In [32]:
# List of columns from target table
list_mechanism = ["action_type_moa",
            "target_chembl_id_moa"]

drug_to_moa = join_dataframes(pchembl_activity, mechanism_renamed, "drugId", "parent_molecule_chembl_id", list_mechanism).persist()

In [33]:
# Concat moa to 1 row
drug_to_moa_concat = drug_to_moa.groupBy("target_chembl_id", "drugId")\
    .agg(concat_ws(", ", array_distinct(collect_list("target_chembl_id_moa"))).alias("target_chembl_id_moa_aggr"))

drug_to_moa_drop = drug_to_moa.drop("target_chembl_id_moa")
drug_to_moa_uniq = drug_to_moa_drop.dropDuplicates(["target_chembl_id", "drugId"])

drug_to_moa_join = drug_to_moa_concat.join(drug_to_moa_uniq, ["target_chembl_id", "drugId"], "left_outer")

In [34]:
drug_to_moa_join.show()

23/12/18 14:46:20 WARN MemoryStore: Not enough space to cache broadcast_77 in memory! (computed 466.5 MiB so far)
23/12/18 14:46:20 WARN BlockManager: Persisting block broadcast_77 to disk instead.
23/12/18 14:46:21 WARN MemoryStore: Not enough space to cache broadcast_77 in memory! (computed 466.5 MiB so far)
23/12/18 14:46:22 WARN BlockManager: Block rdd_192_0 could not be removed as it was not found on disk or in memory
23/12/18 14:46:22 ERROR Executor: Exception in task 0.0 in stage 61.0 (TID 172)
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:64)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:363)
	at org.apache.spark.sql.execution.columnar.BasicColumnBuilder.build(ColumnBuilder.scala:81)
	at org.apache.spark.sql.execution.columnar.ComplexColumnBuilder.org$apache$spark$sql$execution$columnar$NullableColumnBuilder$$super$build(ColumnBuilder.scala:93)
	at org.apache.spark.sql.execution.columnar.NullableColumnB

Py4JError: py4j.reflection does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/polina/Library/Python/3.9/lib/python/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/polina/Library/Python/3.9/lib/python/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/polina/Library/Python/3.9/lib/python/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [None]:
drug_to_moa_join.count()

                                                                                

4647

In [None]:
mechanism_filtered = mechanism.filter(mechanism["parent_molecule_chembl_id"] == "CHEMBL3545181")
mechanism_filtered.show()

+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL1200710, ...|  INHIBITOR|Serotonin transpo...|[{setid=4074b555-...|     CHEMBL1200710|                CHEMBL415|  1343314|       CHEMBL228|
+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+



In [None]:
drug_to_moa_filtered = drug_to_moa_join.filter(drug_to_moa_join["drugId"] == "CHEMBL3545181")
drug_to_moa_filtered.show()

                                                                                

+----------------+---------+-------------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+--------------+---------+------------+
|target_chembl_id|   drugId|target_chembl_id_moa_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|   target_type|accession|proteinClass|
+----------------+---------+-------------------------+-----------------+--------------------+--------------+-------------------------+----------+---

In [None]:
drug_to_moa_join.printSchema()

root
 |-- target_chembl_id: string (nullable = true)
 |-- drugId: string (nullable = true)
 |-- target_chembl_id_moa_aggr: string (nullable = false)
 |-- max_pchembl_value: string (nullable = true)
 |-- median_pchembl_value: double (nullable = true)
 |-- drugType: string (nullable = true)
 |-- maximumClinicalTrialPhase: double (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- linkedTargets: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- linkedDiseases: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- assay_chembl_id: string (nullable = true)
 |-- assay_type: string (nullable = true)
 |-- action_type: struct (nullable = true)
 |    |-- action_type: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- parent_type: st

In [None]:
# Add a new column 'isInMoA'

drug_to_moa_isInMoA = drug_to_moa_join.withColumn(
    "isInMoA", 
    when(
        col("target_chembl_id_moa_aggr") == "", None  # Check for empty string
    ).otherwise(
        array_contains(split(col("target_chembl_id_moa_aggr"), ","), col("target_chembl_id"))
    )
)

drug_to_moa_isInMoA.count()

4647

### Targets: Evidence type: sources + classification for GE, clinical_GE, probes (boolean)

In [None]:
# Irene's table
evidence_path = "gs://ot-team/irene/drug_to_target"
evidence = spark.read.parquet(evidence_path)
evidence.persist()
evidence.show()

[Stage 271:>                                                        (0 + 1) / 1]

+----------+---------+---------------+--------------------+------------------+-------------------+
|    drugId|uniprotId|       targetId|             sources|isHighQualityProbe|isTherapeuticTarget|
+----------+---------+---------------+--------------------+------------------+-------------------+
|CHEMBL1000|   O00167|ENSG00000064655|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O00555|ENSG00000141837|[uniprot_literatu...|             false|              false|
|CHEMBL1000|   O14633|ENSG00000159455|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O60706|ENSG00000069431|            [chembl]|             false|               true|
|CHEMBL1000|   P00352|ENSG00000165092|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   P01567|ENSG00000214042|            [chembl]|             false|               true|
|CHEMBL1000|   P04155|ENSG00000160182|[ot_genetics_portal]|             false|              false|
|CHEMBL100

                                                                                

In [None]:
def join_dataframes_by_many_cols(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_columns: list, 
                    second_key_columns: list,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_columns (list): The key column names in the initial DataFrame.
    second_key_columns (list): The key column names in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Ensure the key columns lists have the same length
    if len(initial_key_columns) != len(second_key_columns):
        raise ValueError("Key columns lists must be of the same length")

    # Selecting specified columns from the second DataFrame, including its key columns
    second_df_selected = second_df.select(second_key_columns + columns_to_join)

    # Build join condition
    join_condition = [initial_df[initial_col] == second_df_selected[second_col] 
                      for initial_col, second_col in zip(initial_key_columns, second_key_columns)]

    # Perform the left join
    joined_df = initial_df.join(second_df_selected, 
                                on=join_condition, 
                                how='left')

    # Drop the second key columns if not needed
    for col in second_key_columns:
        joined_df = joined_df.drop(second_df_selected[col])

    return joined_df


In [None]:
# Join by drugId and uniprotId = accession
evidence_list = ["targetId", "sources", "isHighQualityProbe", "isTherapeuticTarget"]

target_evidence = join_dataframes_by_many_cols(drug_to_moa_isInMoA, 
                                            evidence, 
                                            ["drugId", "accession"], 
                                            ["drugId", "uniprotId"], 
                                             evidence_list).persist()
target_evidence.show()

23/12/17 02:48:43 WARN MemoryStore: Not enough space to cache broadcast_332 in memory! (computed 6.6 GiB so far)
23/12/17 02:48:43 WARN BlockManager: Persisting block broadcast_332 to disk instead.
23/12/17 02:49:30 WARN MemoryStore: Not enough space to cache broadcast_332 in memory! (computed 6.6 GiB so far)
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_172 !
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_124 !
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_90 !
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_87 !
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_117 !
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_105 !
23/12/17 02:49:49 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_514_177 !
23/12/17 02:49:49 W

+----------------+-------------+-------------------------+--------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+------------+------------+---------------+-------+---------------+--------------------+------------------+-------------------+
|target_chembl_id|       drugId|target_chembl_id_moa_aggr|  pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|    target

                                                                                

In [None]:
target_evidence.count()

                                                                                

4656

In [None]:
# Classification of targets by genetic evidence support
def is_ge(sources):
    undesired_lists = [
        ['chembl'],
        ['chemicalProbes'],
        ['chembl', 'chemicalProbes'],
        ['chemicalProbes', 'chembl']
    ]
    return not (sources in undesired_lists or sources is None)

def is_ge_clinical(sources):
    undesired_lists = [['chemicalProbes']]
    return not (sources in undesired_lists or sources is None)

def contains_chemical_probes(sources):
    return 'chemicalProbes' in sources if sources else False

# Register UDFs
is_ge_udf = udf(is_ge, BooleanType())
is_ge_clinical_udf = udf(is_ge_clinical, BooleanType())
contains_chemical_probes_udf = udf(contains_chemical_probes, BooleanType())

# Apply UDFs to create new columns
target_evidence_bool = target_evidence.withColumn("isGE", is_ge_udf("sources"))\
                        .withColumn("isGE_clinical", is_ge_clinical_udf("sources"))\
                        .withColumn("isProbe", contains_chemical_probes_udf("sources"))

target_evidence_bool.show()
target_evidence_bool.count()

                                                                                

+----------------+-------------+-------------------------+--------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+------------+------------+---------------+-------+---------------+--------------------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|target_chembl_id_moa_aggr|  pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   

4656

In [None]:
target_evidence_bool.write.parquet("gs://ot-team/polina/target_evidence_bool")

                                                                                

### Counts targets with non-pharmacological MoA

In [None]:
# How many targets:
#   have assays for active drugs 
#   not in MoA of these drugs
#   supported by GE or clinical evidence

target_evidence_no_moa = target_evidence_bool\
                        .filter(target_evidence_bool["IsActive_max"] == True)\
                        .filter(target_evidence_bool["isInMoA"].isNull())\
                        .filter(target_evidence_bool["isGE_clinical"] == True)\

count_unique_values(target_evidence_no_moa.drop_duplicates(["target_chembl_id"]), "target_chembl_id")

                                                                                

135

In [None]:
# How many targets:
#   have assays for active drugs 
#   not in MoA of these drugs
#   supported by GE

target_evidence_no_moa = target_evidence_bool\
                        .filter(target_evidence_bool["IsActive_max"] == True)\
                        .filter(target_evidence_bool["isInMoA"].isNull())\
                        .filter(target_evidence_bool["isGE"] == True)\

count_unique_values(target_evidence_no_moa.drop_duplicates(["target_chembl_id"]), "target_chembl_id")

                                                                                

83

In [None]:
# How many targets:
#   have assays for active drugs (pchembl median)
#   not in MoA of these drugs
#   supported by GE or clinical evidence

target_evidence_no_moa = target_evidence_bool\
                        .filter(target_evidence_bool["IsActive_med"] == True)\
                        .filter(target_evidence_bool["isInMoA"].isNull())\
                        .filter(target_evidence_bool["isGE_clinical"] == True)\

count_unique_values(target_evidence_no_moa.drop_duplicates(["target_chembl_id"]), "target_chembl_id")

                                                                                

133

In [None]:
target_evidence_no_moa.count()

                                                                                

239

In [None]:
# No targets in MoA for probes
target_evidence_no_moa_probes = target_evidence_bool\
                        .filter(target_evidence_bool["target_chembl_id_moa"].isNull())\
                        .filter(target_evidence_bool["IsActive"] == True)\
                        .filter(target_evidence_bool["isProbe"] == True)
count_unique_values(target_evidence_no_moa_probes, "drugId")

                                                                                

438

In [None]:
drug_active_only_probes = target_evidence_bool.filter(target_evidence_bool["isProbe"] == True).filter(target_evidence_bool["isActive"] == True)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

483

In [None]:
drug_active_only_probes = target_evidence_bool.filter(target_evidence_bool["isHighQualityProbe"] == True).filter(target_evidence_bool["isActive"] == True)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

131

In [None]:
count_unique_values(target_evidence_bool, "drugId")

                                                                                

2287

In [None]:
target_evidence_bool_clinical = target_evidence_bool.filter(target_evidence_bool["isApproved"] != "true")
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

596

In [None]:
target_evidence_bool_clinical = target_evidence_bool.filter(target_evidence_bool["isApproved"] == True)
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

                                                                                

850

In [None]:
from pyspark.sql.functions import countDistinct

# Group by 'isApproved' and count distinct 'drugId's
drugId_count = target_evidence_bool.groupBy("isApproved").agg(countDistinct("drugId").alias("unique_drugId_count"))

# Show the result
drugId_count.show()




+----------+-------------------+
|isApproved|unique_drugId_count|
+----------+-------------------+
|      null|               1160|
|      true|               1318|
|     false|                895|
+----------+-------------------+



                                                                                

In [None]:
target_evidence_bool_probe_h = target_evidence_bool.filter(target_evidence_bool["isHighQualityProbe"] == True)
# target_evidence_bool_probe_h.show()
count_unique_values(target_evidence_bool_probe_h, "drugId")

187

In [None]:
target_evidence_bool_probe = target_evidence_bool.filter(target_evidence_bool["isProbe"] == True)
# target_evidence_bool_probe.show()
count_unique_values(target_evidence_bool_probe, "drugId")

                                                                                

695

# Data coverage

### Drugs

In [None]:
# Filtering by:
#         1. max_phase ≠ 4 | max_phase = 4 | probes = TRUE
#         2. moa = NaN | moa ≠ NaN
#         3. drugActive = TRUE
#         4. GE = TRUE | clinical_GE = TRUE

In [None]:
# Number of clinical candidates/approved drugs/chemical probes for which:
#     1. there is no MoA and they are bioactive against some targets:
#         1. which have GE/GE+clinical evidence for any disease
#     2. there is MoA but they are bioactive against some other targets
#         1. which have GE/GE+clinical evidence for any disease

### Targets

In [None]:
# Dataset with only pchembl value activity
target_evidence_bool.show()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

In [None]:
show_unique_values_and_counts(target_evidence_bool, "drugType")

+---------------+-----+
|       drugType|count|
+---------------+-----+
| Small molecule| 9218|
|Oligosaccharide|    2|
|           null|    1|
|        Protein|  113|
|       Antibody|    4|
|        Unknown|   14|
+---------------+-----+



In [None]:
# Number of targets supported by GE which have active bioassays

targets_GE = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE"] == True)

count_unique_values(targets_GE, "target_chembl_id")

                                                                                

326

In [None]:
# Number of targets supported by GE_clinical which have active bioassays

targets_GE_clinical = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE_clinical"] == True)

count_unique_values(targets_GE_clinical, "target_chembl_id")

                                                                                

431

In [None]:
# Number of targets supported by GE which have active bioassays and not in MoA of the drug

targets_GE_noMoA = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE"] == True)\
                .filter(target_evidence_bool["targetInMoA"] != True)

count_unique_values(targets_GE_noMoA, "target_chembl_id")

                                                                                

115

In [None]:
# Number of targets supported by GE_clinical which have active bioassays and not in MoA of the drug

targets_GE_clinical_noMoA = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE_clinical"] == True)\
                .filter(target_evidence_bool["targetInMoA"] != True)

count_unique_values(targets_GE_clinical_noMoA, "target_chembl_id")

                                                                                

145

## Action type

In [None]:
# Taking table target_organism_filter (after pchembl filter)

target_evidence_bool.show()
target_evidence_bool.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

9352

In [None]:
count_unique_values(target_evidence_bool, "drugId")

2287

In [None]:
show_unique_values_and_counts(target_evidence_bool.drop_duplicates(["drugId", "action_type_moa"]), "action_type_moa")

+--------------------+-----+
|     action_type_moa|count|
+--------------------+-----+
|  NEGATIVE MODULATOR|    1|
|NEGATIVE ALLOSTER...|    4|
|          ANTAGONIST|  166|
|                null| 1442|
|       BINDING AGENT|    1|
|     PARTIAL AGONIST|    8|
|           ACTIVATOR|    1|
|             BLOCKER|   42|
|    DISRUPTING AGENT|    4|
|           INHIBITOR|  491|
|           SUBSTRATE|    1|
|POSITIVE ALLOSTER...|    8|
|     CHELATING AGENT|    1|
|  POSITIVE MODULATOR|    4|
|             AGONIST|  117|
|              OPENER|    5|
|     INVERSE AGONIST|    4|
|     RELEASING AGENT|    1|
|           MODULATOR|    8|
|      REDUCING AGENT|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
show_unique_values_and_counts(drug_to_moa, "action_type")

+--------------------+------+
|         action_type| count|
+--------------------+------+
|                null|211437|
|{INHIBITOR, Negat...|   124|
|{SUBSTRATE, Carri...|    47|
|{ANTAGONIST, Bind...|    41|
|{INVERSE AGONIST,...|     2|
|{AGONIST, Binds t...|     4|
|{ACTIVATOR, Posit...|     1|
+--------------------+------+



### Drug-Target pairs

In [None]:
# With non pharmacological action
