In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import Row
from pyspark.sql.functions import col, max, expr
from pyspark.sql.functions import when

In [3]:
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/15 15:18:11 INFO SparkEnv: Registering MapOutputTracker
23/12/15 15:18:11 INFO SparkEnv: Registering BlockManagerMaster
23/12/15 15:18:11 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
23/12/15 15:18:11 INFO SparkEnv: Registering OutputCommitCoordinator


In [4]:
# Bioactivity data from ChEMBL filtered (exact protein/homolog, assay type != P or U, human targets)
input_path = "gs://ot-team/polina/uniprot_to_class"
input = spark.read.parquet(input_path)

                                                                                

In [4]:
def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Count unique values in a specific column of a PySpark DataFrame.

    Args:
    df (DataFrame): The PySpark DataFrame.
    column_name (str): The name of the column to analyze.

    Returns:
    int: The number of unique values in the column.
    """
    # Get distinct values in the column and count them
    unique_count = df.select(column_name).distinct().count()

    return unique_count

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [5]:
def show_unique_values_and_counts(df: DataFrame, column_name: str):
    """
    Shows unique values and their counts for a specified column in a Spark DataFrame.

    Parameters:
    df (DataFrame): The Spark DataFrame to analyze.
    column_name (str): The name of the column for which to count unique values.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    unique_values_counts = df.groupBy(column_name).count()
    unique_values_counts.show()


## Drug activity threshold

### based on pchembl_value and proteinClass

In [5]:
input.show()

23/12/15 15:18:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-----------------+--------------+--------------+-----------------+---------------+--------------------+----------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|    standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|
+-------------+--------------+-------------------------+----------+--------------------+--------------------

                                                                                

In [7]:
# Check how many assays have pchembl_value
pchembl_value_only = input.filter(input["pchembl_value"].isNotNull())
pchembl_value_only_n = pchembl_value_only.count()
d_t_n = input.count()

print("Number of drug-target pairs:", d_t_n)
print("Number of drug-target pairs with pchembl_value:", pchembl_value_only_n)



Number of drug-target pairs: 18860
Number of drug-target pairs with pchembl_value: 6250


                                                                                

In [43]:
# Where pchembl_value is available for each T-D pairs make new columns:
#     1. max_pchembl_value
#     2. median_pchembl_value

# Group by 'target_chembl_id' and 'drugId' and calculate max and median of 'pchembl_value'
pchembl_value_aggr = input.groupBy("target_chembl_id", "drugId")\
                  .agg(max("pchembl_value").alias("max_pchembl_value"),
                       expr("percentile_approx(pchembl_value, 0.5)").alias("median_pchembl_value"))

pchembl_value_max_med = input.join(pchembl_value_aggr, ["target_chembl_id", "drugId"])\
                    .filter(pchembl_value_aggr["max_pchembl_value"].isNotNull())

In [13]:
# Original aggregation with additional count of non-null pchembl_values
aggregated = input.groupBy("target_chembl_id", "drugId")\
                  .agg(f.max("pchembl_value").alias("max_pchembl_value"),
                       f.expr("percentile_approx(pchembl_value, 0.5)").alias("median_pchembl_value"),
                       f.count(f.when(f.col("pchembl_value").isNotNull(), True)).alias("non_null_count"))

# Filter groups that have at least one non-null pchembl_value
# filtered_aggregated = aggregated.filter(f.col("non_null_count") > 0)

# Join original data with filtered aggregated data
joined_data = aggregated.join(input, ["target_chembl_id", "drugId"], "left_outer")

# joined_data.pchembl_value.isNotNull() | 
# Final result: rows with non-null pchembl_values in groups having at least one non-null value
final_result = joined_data.filter((joined_data["pchembl_value"] == 0) | (joined_data["pchembl_value"].isNull()))
final_result.show()

[Stage 9:>                                                          (0 + 1) / 1]

+----------------+-------------+-----------------+--------------------+--------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+--------------+---------+------------+
|target_chembl_id|       drugId|max_pchembl_value|median_pchembl_value|non_null_count|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_components|   target_type|accession|proteinClass|
+---------

                                                                                

In [None]:
# Custom activity cutoff for max_pchembl_value
drug_active_pchembl_value = pchembl_value_max_med.withColumn(
    "isActive",
    when(
        ((col("proteinClass") == "Kinase") & (col("max_pchembl_value") >= 7.7)) |
        ((col("proteinClass") == "GPCR") & (col("max_pchembl_value") >= 6.5)) |
        ((col("proteinClass") == "NR") & (col("max_pchembl_value") >= 6.1)) |
        ((col("proteinClass") == "Transporter") & (col("max_pchembl_value") >= 6.1)) |
        ((col("proteinClass") == "Enzyme") & (col("max_pchembl_value") >= 5.2)) |
        ((col("proteinClass") == "IC") & (col("max_pchembl_value") >= 4.6)) |
        ((col("proteinClass") == "Other") & (col("max_pchembl_value") >= 6.3)) |
        (~(col("proteinClass").isin(["Kinase", "GPCR", "NR", "Transporter", "Enzyme", "IC", "Other"])) & (col("max_pchembl_value") >= 5)),
        "TRUE"
    ).otherwise("FALSE")
)

In [30]:
print("Number of unique drugs with bioactivity data by approval status:")
show_unique_values_and_counts(target_organism_filter.drop_duplicates(["drugId", "isApproved"]), "isApproved")

Number of unique drugs with bioactivity data by approval status:




+----------+-----+
|isApproved|count|
+----------+-----+
|      null| 1160|
|      true| 1318|
|     false|  895|
+----------+-----+



                                                                                

In [44]:
print("Number of unique ACTIVE drugs with bioactivity data by approval status:")
show_unique_values_and_counts(drug_active_pchembl_value\
                              .drop_duplicates(["drugId", "isApproved"])\
                              .filter(col("isActive") == True), "isApproved")

Number of unique ACTIVE drugs with bioactivity data by approval status:
+----------+-----+
|isApproved|count|
+----------+-----+
|      null|  597|
|      true|  439|
|     false|  339|
+----------+-----+



### no pchembl_value data

In [None]:
# !!! Need to exclude rows for which pchembl was found !!!

In [45]:
# Where pchembl_value is not available for each T-D pairs make new columns:
#     1. Calculate how much data is this
#     2. Think about what to do with different standard_units
#     3. Ideally:
#         1. max_standard_value_n
#         2. median_standard_value_n
#         3. Cutoff for every n

no_pchembl_value = uniprot_to_class.filter(col("pchembl_value").isNull())
show_unique_values_and_counts(no_pchembl_value, "standard_units")

+--------------+-----+
|standard_units|count|
+--------------+-----+
|          null| 2740|
|     pm/min/mg|  150|
|     degrees C|  139|
|             %| 3123|
|            nM| 6139|
|          /min|    8|
|         /uM/s|    5|
|   nmol/min/mg|    7|
|  mL.min-1.g-1|   16|
|            uM|   78|
|         10'8M|    1|
|            mM|    7|
|     10'-3/min|    1|
|      10'6/M/s|    1|
|      nmol/min|    8|
|            /s|    6|
|          pmol|    3|
|     /s/microM|    1|
|       ug ml-1|    2|
|       ug.mL-1|    4|
+--------------+-----+
only showing top 20 rows



In [None]:
# Make column with activity of molecule: drugActive = TRUE/FALSE
#     1. based protein type and:
#         1. max_pchembl_value
#         2. median_pchembl_value
#     2. based on cutoffs for other experiment types

### Drug-target pairs: Target is in MoA of a drug (targetInMoA, boolean)

In [34]:
# # Targets dataset to map accession to Ensembl
# target_path = "gs://open-targets-data-releases/23.12/output/etl/json/targets"
# target = spark.read.json(target_path)
# target.persist()
# target.show()


In [152]:
mechanism_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_mechanism.jsonl"
mechanism = spark.read.json(mechanism_path)
mechanism.persist()
mechanism.show()

+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|       action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL2103825],...|         INHIBITOR|Pancreatic lipase...|[{16953261, PubMe...|     CHEMBL2103825|            CHEMBL2103825|  1699800|      CHEMBL1812|
|{[CHEMBL1200495, ...|           AGONIST|Glucocorticoid re...|[{setid=6d9bf1b0-...|     CHEMBL1200495|                CHEMBL977|  1344612|      CHEMBL2034|
|{[CHEMBL3544919],...|SEQUESTERING AGENT|Heparin sequester...|[{26937198, PubMe...|     CHEMBL3544919|            CHEMBL3544919|  2473107|   CHEMBL2364712|
|{[CHEMBL3989993],...|         INHIBITOR|microRNA-155 inhi...|[{

23/12/11 16:34:31 WARN CacheManager: Asked to cache already cached data.


In [61]:
# # Filtering just to check that molecule_chembl_id can correspond to several targets
# mechanism_filtered = mechanism.filter(mechanism["molecule_chembl_id"] == "CHEMBL1946170")
# mechanism_filtered.show()

+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL1946170],...|  INHIBITOR|Platelet-derived ...|[{REGORAFENIB, Ex...|     CHEMBL1946170|            CHEMBL1946170|  1679381|   CHEMBL2095189|
|{[CHEMBL1946170],...|  INHIBITOR|Tyrosine-protein ...|[{REGORAFENIB, Ex...|     CHEMBL1946170|            CHEMBL1946170|  1679381|      CHEMBL4223|
|{[CHEMBL1946170],...|  INHIBITOR|Discoidin domain-...|[{REGORAFENIB, Ex...|     CHEMBL1946170|            CHEMBL1946170|  1679381|      CHEMBL5122|
|{[CHEMBL1946170],...|  INHIBITOR|Tyrosine-protein ...|[{REGORAFENIB, Ex...|     CHEMBL1946170|           

In [153]:
# Rename columns
mechanism_renamed = mechanism.withColumnRenamed("action_type", "action_type_moa").withColumnRenamed("target_chembl_id", "target_chembl_id_moa")

# Show the result
mechanism_renamed.show()

+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+--------------------+
|           _metadata|   action_type_moa| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id_moa|
+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+--------------------+
|{[CHEMBL2103825],...|         INHIBITOR|Pancreatic lipase...|[{16953261, PubMe...|     CHEMBL2103825|            CHEMBL2103825|  1699800|          CHEMBL1812|
|{[CHEMBL1200495, ...|           AGONIST|Glucocorticoid re...|[{setid=6d9bf1b0-...|     CHEMBL1200495|                CHEMBL977|  1344612|          CHEMBL2034|
|{[CHEMBL3544919],...|SEQUESTERING AGENT|Heparin sequester...|[{26937198, PubMe...|     CHEMBL3544919|            CHEMBL3544919|  2473107|       CHEMBL2364712|
|{[CHEMBL3989993],...|         INHIBITOR

In [154]:
# List of columns from target table
list_mechanism = ["action_type_moa",
            "target_chembl_id_moa"]

drug_to_moa = join_dataframes(drug_active, mechanism_renamed, "drugId", "molecule_chembl_id", list_mechanism).persist()
drug_to_moa.show()
drug_to_moa.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|action_type_moa|target_chembl_id_moa|
+----------------+-------------+----------

9341

In [39]:
# drug_to_moa_filtered = drug_to_moa.filter(mechanism["molecule_chembl_id"] == "CHEMBL1946170")
# drug_to_moa_filtered.show()

In [155]:
# Match target_chembl_id and target_chembl_id_moa within 1 drugId

from pyspark.sql.functions import col, broadcast, when

# Create a DataFrame of unique target_chembl_id_moa values
moa_df = drug_to_moa.select("target_chembl_id_moa").distinct().withColumnRenamed("target_chembl_id_moa", "moa_value")

# Join with the original DataFrame to check if target_chembl_id is in the list of moa_values
df_joined = drug_to_moa.join(broadcast(moa_df), drug_to_moa["target_chembl_id"] == moa_df["moa_value"], "left_outer")

# Add a new column targetInMoA based on the condition
drug_to_moa_targetInMoA = df_joined.withColumn("targetInMoA", 
                                           when(df_joined["target_chembl_id_moa"].isNull(), None)
                                           .otherwise(col("moa_value").isNotNull())
                                          ).select(drug_to_moa.columns + ["targetInMoA"])

# Show the result
drug_to_moa_targetInMoA.show()
drug_to_moa_targetInMoA.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|action_type_moa|target_chembl_id_moa|targetInMoA|
+----------------+

9341

In [42]:
# drug_to_moa_targetInMoA_filtered = drug_to_moa_targetInMoA.filter(drug_to_moa_targetInMoA["drugId"] == "CHEMBL487273")
# drug_to_moa_targetInMoA_filtered.show()

### Targets: Evidence type: sources + classification for GE, clinical_GE, probes (boolean)

In [168]:
# Irene's table
evidence_path = "gs://ot-team/irene/drug_to_target"
evidence = spark.read.parquet(evidence_path)
evidence.persist()
evidence.show()


+----------+---------+---------------+--------------------+------------------+-------------------+
|    drugId|uniprotId|       targetId|             sources|isHighQualityProbe|isTherapeuticTarget|
+----------+---------+---------------+--------------------+------------------+-------------------+
|CHEMBL1000|   O00167|ENSG00000064655|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O00555|ENSG00000141837|[uniprot_literatu...|             false|              false|
|CHEMBL1000|   O14633|ENSG00000159455|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O60706|ENSG00000069431|            [chembl]|             false|               true|
|CHEMBL1000|   P00352|ENSG00000165092|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   P01567|ENSG00000214042|            [chembl]|             false|               true|
|CHEMBL1000|   P04155|ENSG00000160182|[ot_genetics_portal]|             false|              false|
|CHEMBL100

23/12/11 16:48:49 WARN CacheManager: Asked to cache already cached data.


In [158]:
def join_dataframes_by_many_cols(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_columns: list, 
                    second_key_columns: list,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_columns (list): The key column names in the initial DataFrame.
    second_key_columns (list): The key column names in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Ensure the key columns lists have the same length
    if len(initial_key_columns) != len(second_key_columns):
        raise ValueError("Key columns lists must be of the same length")

    # Selecting specified columns from the second DataFrame, including its key columns
    second_df_selected = second_df.select(second_key_columns + columns_to_join)

    # Build join condition
    join_condition = [initial_df[initial_col] == second_df_selected[second_col] 
                      for initial_col, second_col in zip(initial_key_columns, second_key_columns)]

    # Perform the left join
    joined_df = initial_df.join(second_df_selected, 
                                on=join_condition, 
                                how='left')

    # Drop the second key columns if not needed
    for col in second_key_columns:
        joined_df = joined_df.drop(second_df_selected[col])

    return joined_df


In [159]:
# Join by drugId and uniprotId = accession
evidence_list = ["targetId", "sources", "isHighQualityProbe", "isTherapeuticTarget"]

target_evidence = join_dataframes_by_many_cols(drug_to_moa_targetInMoA, 
                                            evidence, 
                                            ["drugId", "accession"], 
                                            ["drugId", "uniprotId"], 
                                             evidence_list).persist()
target_evidence.show()

23/12/11 16:35:49 WARN MemoryStore: Not enough space to cache broadcast_481 in memory! (computed 6.6 GiB so far)
23/12/11 16:35:49 WARN BlockManager: Persisting block broadcast_481 to disk instead.
23/12/11 16:36:25 WARN MemoryStore: Not enough space to cache broadcast_481 in memory! (computed 6.6 GiB so far)
[Stage 1927:>                                                       (0 + 1) / 1]

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|acti

                                                                                

In [160]:
# Classification of targets by genetic evidence support
from pyspark.sql.types import BooleanType

def is_ge(sources):
    undesired_lists = [
        ['chembl'],
        ['chemicalProbes'],
        ['chembl', 'chemicalProbes'],
        ['chemicalProbes', 'chembl']
    ]
    return not (sources in undesired_lists or sources is None)

def contains_chemical_probes(sources):
    return 'chemicalProbes' in sources if sources else False

# Register UDFs
is_ge_udf = udf(is_ge, BooleanType())
is_ge_clinical_udf = udf(is_ge_clinical, BooleanType())
contains_chemical_probes_udf = udf(contains_chemical_probes, BooleanType())

In [161]:
# Apply UDFs to create new columns
target_evidence_bool = target_evidence.withColumn("isGE", is_ge_udf("sources"))\
                        .withColumn("isGE_clinical", is_ge_clinical_udf("sources"))\
                        .withColumn("isProbe", contains_chemical_probes_udf("sources"))

target_evidence_bool.show()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

                                                                                

In [170]:
drug_active_only_probes = target_evidence_bool.filter(target_evidence_bool["isProbe"] == True).filter(target_evidence_bool["isActive"] == True)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

483

In [171]:
drug_active_only_probes = target_evidence_bool.filter(target_evidence_bool["isHighQualityProbe"] == True).filter(target_evidence_bool["isActive"] == True)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

131

In [164]:
count_unique_values(target_evidence_bool, "drugId")

                                                                                

2287

In [165]:
target_evidence_bool_clinical = target_evidence_bool.filter(target_evidence_bool["isApproved"] != "true")
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

596

In [166]:
target_evidence_bool_clinical = target_evidence_bool.filter(target_evidence_bool["isApproved"] == True)
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

                                                                                

850

In [111]:
from pyspark.sql.functions import countDistinct

# Group by 'isApproved' and count distinct 'drugId's
drugId_count = target_evidence_bool.groupBy("isApproved").agg(countDistinct("drugId").alias("unique_drugId_count"))

# Show the result
drugId_count.show()




+----------+-------------------+
|isApproved|unique_drugId_count|
+----------+-------------------+
|      null|               1160|
|      true|               1318|
|     false|                895|
+----------+-------------------+



                                                                                

In [105]:
target_evidence_bool_probe_h = target_evidence_bool.filter(target_evidence_bool["isHighQualityProbe"] == True)
# target_evidence_bool_probe_h.show()
count_unique_values(target_evidence_bool_probe_h, "drugId")

187

In [106]:
target_evidence_bool_probe = target_evidence_bool.filter(target_evidence_bool["isProbe"] == True)
# target_evidence_bool_probe.show()
count_unique_values(target_evidence_bool_probe, "drugId")

                                                                                

695

In [175]:
# No targets in MoA
target_evidence_no_moa = target_evidence_bool\
                        .filter(target_evidence_bool["target_chembl_id_moa"].isNull())\
                        .filter(target_evidence_bool["IsActive"] == True)
show_unique_values_and_counts(target_evidence_no_moa.drop_duplicates(["drugId", "isApproved"]), "isApproved")

+----------+-----+
|isApproved|count|
+----------+-----+
|      null|  638|
|      true|  226|
|     false|  186|
+----------+-----+



In [176]:
# No targets in MoA for probes
target_evidence_no_moa_probes = target_evidence_bool\
                        .filter(target_evidence_bool["target_chembl_id_moa"].isNull())\
                        .filter(target_evidence_bool["IsActive"] == True)\
                        .filter(target_evidence_bool["isProbe"] == True)
count_unique_values(target_evidence_no_moa_probes, "drugId")

                                                                                

438

# Data coverage

### Drugs

In [None]:
# Filtering by:
#         1. max_phase ≠ 4 | max_phase = 4 | probes = TRUE
#         2. moa = NaN | moa ≠ NaN
#         3. drugActive = TRUE
#         4. GE = TRUE | clinical_GE = TRUE

In [None]:
# Number of clinical candidates/approved drugs/chemical probes for which:
#     1. there is no MoA and they are bioactive against some targets:
#         1. which have GE/GE+clinical evidence for any disease
#     2. there is MoA but they are bioactive against some other targets
#         1. which have GE/GE+clinical evidence for any disease

### Targets

In [177]:
# Dataset with only pchembl value activity
target_evidence_bool.show()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

In [179]:
show_unique_values_and_counts(target_evidence_bool, "drugType")

+---------------+-----+
|       drugType|count|
+---------------+-----+
| Small molecule| 9218|
|Oligosaccharide|    2|
|           null|    1|
|        Protein|  113|
|       Antibody|    4|
|        Unknown|   14|
+---------------+-----+



In [182]:
# Number of targets supported by GE which have active bioassays

targets_GE = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE"] == True)

count_unique_values(targets_GE, "target_chembl_id")

                                                                                

326

In [184]:
# Number of targets supported by GE_clinical which have active bioassays

targets_GE_clinical = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE_clinical"] == True)

count_unique_values(targets_GE_clinical, "target_chembl_id")

                                                                                

431

In [185]:
# Number of targets supported by GE which have active bioassays and not in MoA of the drug

targets_GE_noMoA = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE"] == True)\
                .filter(target_evidence_bool["targetInMoA"] != True)

count_unique_values(targets_GE_noMoA, "target_chembl_id")

                                                                                

115

In [186]:
# Number of targets supported by GE_clinical which have active bioassays and not in MoA of the drug

targets_GE_clinical_noMoA = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE_clinical"] == True)\
                .filter(target_evidence_bool["targetInMoA"] != True)

count_unique_values(targets_GE_clinical_noMoA, "target_chembl_id")

                                                                                

145

## Action type

In [190]:
# Taking table target_organism_filter (after pchembl filter)

target_evidence_bool.show()
target_evidence_bool.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

9352

In [192]:
count_unique_values(target_evidence_bool, "drugId")

2287

In [191]:
show_unique_values_and_counts(target_evidence_bool.drop_duplicates(["drugId", "action_type_moa"]), "action_type_moa")

+--------------------+-----+
|     action_type_moa|count|
+--------------------+-----+
|  NEGATIVE MODULATOR|    1|
|NEGATIVE ALLOSTER...|    4|
|          ANTAGONIST|  166|
|                null| 1442|
|       BINDING AGENT|    1|
|     PARTIAL AGONIST|    8|
|           ACTIVATOR|    1|
|             BLOCKER|   42|
|    DISRUPTING AGENT|    4|
|           INHIBITOR|  491|
|           SUBSTRATE|    1|
|POSITIVE ALLOSTER...|    8|
|     CHELATING AGENT|    1|
|  POSITIVE MODULATOR|    4|
|             AGONIST|  117|
|              OPENER|    5|
|     INVERSE AGONIST|    4|
|     RELEASING AGENT|    1|
|           MODULATOR|    8|
|      REDUCING AGENT|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
show_unique_values_and_counts(drug_to_moa, "action_type")

+--------------------+------+
|         action_type| count|
+--------------------+------+
|                null|211437|
|{INHIBITOR, Negat...|   124|
|{SUBSTRATE, Carri...|    47|
|{ANTAGONIST, Bind...|    41|
|{INVERSE AGONIST,...|     2|
|{AGONIST, Binds t...|     4|
|{ACTIVATOR, Posit...|     1|
+--------------------+------+



### Drug-Target pairs

In [None]:
# With non pharmacological action
