In [52]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql import Row
from pyspark.sql.functions import when
from pyspark.sql.functions import col, broadcast, when, max, expr, collect_list, concat_ws, array_contains, split, collect_list, array_distinct, collect_set
from pyspark.sql.types import BooleanType

In [2]:
# Obtain the Spark session used by every cell below (an existing session is reused if there is one).
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/23 12:13:27 INFO SparkEnv: Registering MapOutputTracker
24/01/23 12:13:27 INFO SparkEnv: Registering BlockManagerMaster
24/01/23 12:13:27 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
24/01/23 12:13:27 INFO SparkEnv: Registering OutputCommitCoordinator


# Data parsing

In [3]:
def join_dataframes(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_column: str, 
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Left-joins selected columns of a second PySpark DataFrame onto an initial one.

    Every row and every column of ``initial_df`` is kept. Only the columns named in
    ``columns_to_join`` are taken from ``second_df``; its key column is used for
    matching and is not part of the result.

    Args:
    initial_df (DataFrame): The initial (left) PySpark DataFrame.
    second_df (DataFrame): The second (right) PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): Column names from the second DataFrame to append.

    Returns:
    DataFrame: The joined DataFrame. It is already persisted, so calling
    ``.persist()`` on it again is unnecessary.
    """

    # Keep only the key and the requested columns from the right-hand side
    second_df_selected = second_df.select([second_key_column] + list(columns_to_join))

    left = initial_df.alias("a")
    right = second_df_selected.alias("b")

    # Reference the key columns through the aliases rather than through the original
    # DataFrame objects: when both sides derive from the same source (e.g. the same
    # table joined twice), unaliased column references are ambiguous.
    join_condition = f.col(f"a.{initial_key_column}") == f.col(f"b.{second_key_column}")

    right_columns = [f"b.{column_name}" for column_name in columns_to_join]

    return (
        left.join(right, on=join_condition, how="left")
        .select("a.*", *right_columns)
        .persist()
    )


In [4]:
def join_dataframes_by_many_cols(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_columns: list, 
                    second_key_columns: list,
                    columns_to_join: list) -> DataFrame:
    """
    Left-joins selected columns of a second PySpark DataFrame onto an initial one,
    matching on several key columns at once.

    Key columns are paired by position: ``initial_key_columns[i]`` must equal
    ``second_key_columns[i]`` for a row to match. Every row and column of
    ``initial_df`` is kept; from ``second_df`` only ``columns_to_join`` are appended
    (its key columns are not part of the result).

    Args:
    initial_df (DataFrame): The initial (left) PySpark DataFrame.
    second_df (DataFrame): The second (right) PySpark DataFrame to join with.
    initial_key_columns (list): The key column names in the initial DataFrame.
    second_key_columns (list): The key column names in the second DataFrame.
    columns_to_join (list): Column names from the second DataFrame to append.

    Returns:
    DataFrame: The resulting DataFrame after the join (not persisted).

    Raises:
    ValueError: If the key column lists differ in length or are empty.
    """

    # Ensure the key columns lists have the same length
    if len(initial_key_columns) != len(second_key_columns):
        raise ValueError("Key columns lists must be of the same length")

    # An empty condition list would otherwise fail inside Spark with an opaque error
    if not initial_key_columns:
        raise ValueError("At least one pair of key columns is required")

    # Keep only the keys and the requested columns from the right-hand side
    second_df_selected = second_df.select(list(second_key_columns) + list(columns_to_join))

    left = initial_df.alias("a")
    right = second_df_selected.alias("b")

    # Alias-qualified references stay unambiguous even when both sides share lineage;
    # the list of conditions is combined with AND by DataFrame.join.
    join_condition = [
        f.col(f"a.{initial_key}") == f.col(f"b.{second_key}")
        for initial_key, second_key in zip(initial_key_columns, second_key_columns)
    ]

    # Selecting "a.*" plus the requested right-hand columns leaves the right-hand keys out
    right_columns = [f"b.{column_name}" for column_name in columns_to_join]

    return left.join(right, on=join_condition, how="left").select("a.*", *right_columns)


In [5]:
def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Return how many distinct values one column of a PySpark DataFrame holds.

    Args:
    df (DataFrame): The PySpark DataFrame.
    column_name (str): The name of the column to analyze.

    Returns:
    int: The number of distinct values found in the column.
    """
    # Narrow the frame to the single column, de-duplicate it, then count what is left
    single_column = df.select(column_name)
    return single_column.distinct().count()

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [6]:
def show_unique_values_and_counts(df: DataFrame, column_name: str):
    """
    Print each distinct value of a column together with the number of rows holding it.

    Parameters:
    df (DataFrame): The Spark DataFrame to analyze.
    column_name (str): The name of the column for which to count unique values.

    Raises:
    ValueError: If ``column_name`` is not one of the DataFrame's columns.
    """
    column_exists = column_name in df.columns
    if not column_exists:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    # One output row per distinct value, with its row count; printed rather than returned
    df.groupBy(column_name).count().show()


In [7]:
# Load the list of unique drugs (obtained from targets with evidence in Platform or chemProbes)
import pandas as pd

# The CSV is read with pandas (path is relative to the notebook's working directory)
# and then converted into a Spark DataFrame.
unique_drugs_pd_df = pd.read_csv("../data/drug_to_target_unique_drugs.csv")
drug_list = spark.createDataFrame(unique_drugs_pd_df)
# drug_list = spark.read.csv(drug_list_dir, header=True, inferSchema=True)
# drug_list.show()

In [8]:
# in_drugs = count_unique_values(drug_list, "drugId")
# print("Number of initial drugs: ", in_drugs)

### For each drug find a max phase of clinical trial

In [9]:
molecule_path = "gs://open-targets-data-releases/23.12/output/etl/json/molecule"
molecule = spark.read.json(molecule_path)
molecule.persist()

# Columns of the molecule table to attach to each drug
list_molecule = ["drugType", 
                "maximumClinicalTrialPhase", 
                "isApproved", 
                "linkedTargets", 
                "linkedDiseases",
                "crossReferences"]

# Join the list of drugs with the molecule annotations (drugId matches molecule.id).
# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
drug_list_phase = join_dataframes(drug_list, molecule, "drugId", "id", list_molecule)

# # Show number of drugs by approval status
# print("Number of drugs by approval status: ")
# show_unique_values_and_counts(drug_list_phase, "isApproved")

24/01/23 12:13:55 WARN CacheManager: Asked to cache already cached data.        


In [38]:
# Preview the molecule table to check which columns are available
molecule.show()

[Stage 225:>                                                        (0 + 1) / 1]

+---------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+-------------+--------------------+----------+--------------------+--------------------+-------------------------+--------------------+-------------+--------------------+--------------------+-------------------+
+---------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+-------------+--------------------+----------+--------------------+--------------------+-------------------------+--------------------+-------------+--------------------+--------------------+-------------------+
|          false|Cc1cc(CN2CCN(c3c(...|                null|                null|Small molecule drug.|Small molecule|           false|CHEMBL1086582|UUGWPYPNRZQDFO-UH...|      null|                null|                null|                     null|       CHEMBL1086582|         null|                  []|       

                                                                                

### For each unique drug find bioactivity data from chembl_33_activity

In [10]:
activity_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_activity.jsonl"
activity = spark.read.json(activity_path)
activity.persist()

# Activity columns carried over to each drug row
list_activity = [
    "assay_chembl_id",
    "assay_type",
    "action_type",
    "pchembl_value",
    "standard_type",
    "standard_units",
    "standard_value",
    "standard_relation",
    "target_organism",
    "target_pref_name",
    "target_chembl_id",
    "data_validity_comment",
    "data_validity_description",
]
# Currently left out: "standard_flag", "ligand_efficiency",
# "assay_variant_mutation", "assay_variant_accession"

# Attach the bioactivities to each drug (drugId matches molecule_chembl_id), then keep only
# rows that actually found an activity: unmatched rows of the left join have a null assay id.
drug_to_activity = (
    join_dataframes(drug_list_phase, activity, "drugId", "molecule_chembl_id", list_activity)
    .filter(col("assay_chembl_id").isNotNull())
    .persist()
)

# # Calculate for how many drugs and targets we have bioactivities
# drug_list_count = count_unique_values(drug_list, 'drugId')
# drug_to_activity_count = count_unique_values(drug_to_activity, 'drugId')
# drug_to_activity_count_targets = count_unique_values(drug_to_activity, 'target_chembl_id')

# print("Number of unique drugs from targets dataset: ", drug_list_count)
# print("Number of unique drugs with any bioactivities: ", drug_to_activity_count)
# print("Number of unique targets with any drug bioactivities: ", drug_to_activity_count_targets)

                                                                                

### For each bioactivity assay find parameters from chembl_33_assay

In [11]:
assay_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_assay.jsonl"
assay = spark.read.json(assay_path)
assay.persist()

# Columns of the assay table to attach to each bioactivity
list_assay = ["confidence_score",
            "confidence_description",
            "assay_category"]

# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
drug_to_assay = join_dataframes(drug_to_activity, assay, "assay_chembl_id", "assay_chembl_id", list_assay)

24/01/23 12:14:10 WARN CacheManager: Asked to cache already cached data.        


## Assay filters

In [12]:
# Step 1 - assay_type: discard assays of type "P" and "U"
# (rows with a null assay_type are discarded as well, because the condition is then null)
assay_type_filter = drug_to_assay.filter(~col("assay_type").isin("P", "U"))
# assay_type_d = count_unique_values(assay_type_filter, "drugId")
# assay_type_t = count_unique_values(assay_type_filter, "target_chembl_id")

# print("Unique drugs with bioactivities for non P and U assays:", assay_type_d)
# print("Unique targets with bioactivities for non P and U assays:", assay_type_t)

# Step 2 - confidence_score: keep only scores 9 and 7
confidence_score_filter = assay_type_filter.filter(
    col("confidence_score").isin([9, 7])
)
# confidence_score_d = count_unique_values(confidence_score_filter, "drugId")
# confidence_score_t = count_unique_values(confidence_score_filter, "target_chembl_id")

# print("Unique drugs with bioactivities for single/homolog proteins:", confidence_score_d)
# print("Unique targets with bioactivities for single/homolog proteins:", confidence_score_t)

# Step 3 - target_organism: keep human targets only
target_organism_filter = confidence_score_filter.filter(
    col("target_organism") == "Homo sapiens"
)
# target_organism_d = count_unique_values(target_organism_filter, "drugId")
# target_organism_t = count_unique_values(target_organism_filter, "target_chembl_id")
# drug_target_organism_t = target_organism_filter.count()

# print("Unique drug-target pairs with bioactivities for human targets:", drug_target_organism_t)

# print("Unique drugs with bioactivities for human targets:", target_organism_d)
# print("Unique targets with bioactivities for human targets:", target_organism_t)

In [13]:
# Number of bioactivity rows left after the assay type, confidence score and organism filters
target_organism_filter.count()

                                                                                

18860

#### Adding related targets (to find targets within complexes in the future)

In [14]:
target_relation_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_target_relation.jsonl"
target_relation = spark.read.json(target_relation_path)
target_relation.persist()
# target_relation.show()

list_target_relation = ["related_target_chembl_id"]

# Attach the related target ids to each bioactivity row. A target with several related
# targets yields one output row per relation (left join).
# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
target_relation_add = join_dataframes(target_organism_filter, target_relation, "target_chembl_id", "target_chembl_id", list_target_relation)
target_relation_add.show()
# # Aggregate the data
# aggregated_df = target_relation_add.groupBy('target_chembl_id').agg(
#     collect_list('related_target_chembl_id').alias('related_target_chembl_id_aggr')
# )

# # Merge the Aggregated Data with target_organism_filter
# target_with_relation = target_organism_filter.join(aggregated_df, on='target_chembl_id', how='left')
# target_with_relation.show()
# target_with_relation.count()

24/01/23 12:14:38 WARN CacheManager: Asked to cache already cached data.


+-------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+---------------------+-------------------------+----------------+----------------------+--------------+------------------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|     crossReferences|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|related_target_chembl_id|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+--------------

In [15]:
# show_unique_values_and_counts(target_relation_add, "relationship")

### Protein classification by uniprot from SwissProt

In [16]:
# Map target_chembl_id to UniProt accessions via the target table

target_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_target.jsonl"
target = spark.read.json(target_path)
target.persist()

# Columns of the target table to attach to each row
list_target = ["target_components",
            "target_type"]

# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
target_to_uniprot = join_dataframes(target_relation_add, target, "target_chembl_id", "target_chembl_id", list_target)

# Helper (wrapped as a UDF below) that pulls the 'accession' field out of target_components
def extract_accession(rows):
    """
    Return the ``accession`` of the first component in ``rows``.

    Only the first element is inspected; any further components are ignored.
    Returns None when ``rows`` is None or empty.
    """
    if not rows:
        return None
    first_component = rows[0]
    return first_component.accession

# Wrap the extractor as a Spark UDF returning a string
extract_accession_udf = udf(extract_accession, StringType())

# Derive the "accession" column from target_components, then discard the nested source column
target_to_uniprot_extr = (
    target_to_uniprot
    .withColumn("accession", extract_accession_udf(target_to_uniprot["target_components"]))
    .drop("target_components")
)

24/01/23 12:14:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/01/23 12:14:41 WARN CacheManager: Asked to cache already cached data.


In [17]:
# Preview the rows now that the "accession" column has replaced target_components
target_to_uniprot_extr.show()

[Stage 29:>                                                         (0 + 1) / 1]

+-------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+---------------------+-------------------------+----------------+----------------------+--------------+------------------------+--------------+---------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|     crossReferences|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|related_target_chembl_id|   target_type|accession|
+-------------+--------------+-------------------------+----------+--------------------+------

                                                                                

In [18]:
# Do the same for the related targets: look up their components in the target table
list_target_2 = ["target_components"]

# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
target_to_uniprot_extr_related = join_dataframes(target_to_uniprot_extr, target, "related_target_chembl_id", "target_chembl_id", list_target_2)

# Apply the UDF to create "accession_related" from the related target's components
related_extr = target_to_uniprot_extr_related.withColumn("accession_related", extract_accession_udf(target_to_uniprot_extr_related["target_components"])).drop("target_components")
related_extr.show()

24/01/23 12:14:45 WARN CacheManager: Asked to cache already cached data.


+-------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+---------------------+-------------------------+----------------+----------------------+--------------+------------------------+--------------+---------+-----------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|     crossReferences|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|related_target_chembl_id|   target_type|accession|accession_related|
+-------------+--------------+-------------------------+--

                                                                                

In [19]:
# target_to_uniprot_extr_related.columns

In [25]:
# Helper (wrapped as a UDF below) that removes duplicates from a list
def remove_duplicates(lst):
    """
    Return the distinct elements of ``lst`` in order of first appearance.

    Going through a set would make the output order vary between runs, so an
    insertion-ordered dict is used instead. Returns None when ``lst`` is None.
    """
    if lst is None:
        return None
    return list(dict.fromkeys(lst))

# Still defined so that any cell referring to this UDF keeps working; the aggregation
# below no longer needs it.
remove_duplicates_udf = udf(remove_duplicates, ArrayType(StringType()))

# One row per target holding its distinct related targets and their accessions.
# collect_set de-duplicates natively, so no Python UDF round-trip is needed.
aggregated_df = related_extr.groupBy('target_chembl_id').agg(
    collect_set('related_target_chembl_id').alias('related_target_chembl_id_aggr'),
    collect_set('accession_related').alias('accession_related_aggr')
)

# Replace the per-row related-target columns of related_extr with the aggregated lists,
# then drop the rows that became identical
targets_with_relation = related_extr\
    .drop("related_target_chembl_id", "accession_related")\
    .join(aggregated_df, on='target_chembl_id', how='left')\
    .dropDuplicates()
targets_with_relation.show()
targets_with_relation.count()

                                                                                

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+---------------+----------+-----------+-------------+-----------------+--------------+--------------+-----------------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------+---------+-----------------------------+----------------------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|     crossReferences|assay_chembl_id|assay_type|action_type|pchembl_value|    standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_type|accession|related_target_chembl_id_aggr|accession_related_aggr|
+----------------+------------

                                                                                

18847

In [21]:
# target_organism_filter.dropDuplicates().count()

                                                                                

18847

In [45]:
# assert target_organism_filter.dropDuplicates().count() == targets_with_relation.count()

                                                                                

AssertionError: 

In [19]:
# # Check that all accessions are present for CHEMBL3544932
# target_to_uniprot_filtered = target_to_uniprot.filter(target_to_uniprot["target_chembl_id"] == "CHEMBL3430907")
# # drug_to_moa_filtered.show()
# selected_column = target_to_uniprot_filtered.select("target_components").collect()
# print(selected_column)

[Row(target_components=[Row(accession='Q96GD4', component_description='Aurora kinase B', component_id=526, component_type='PROTEIN', relationship='PROTEIN SUBUNIT', target_component_synonyms=[Row(component_synonym='2.7.11.1', syn_type='EC_NUMBER'), Row(component_synonym='AIK2', syn_type='GENE_SYMBOL_OTHER'), Row(component_synonym='AIM1', syn_type='GENE_SYMBOL_OTHER'), Row(component_synonym='AIM-1', syn_type='UNIPROT'), Row(component_synonym='AIM1 GN  ', syn_type='GENE_SYMBOL_OTHER'), Row(component_synonym='AIRK2', syn_type='GENE_SYMBOL_OTHER'), Row(component_synonym='ARK2', syn_type='GENE_SYMBOL_OTHER'), Row(component_synonym='ARK-2', syn_type='UNIPROT'), Row(component_synonym='ARK2 GN  ', syn_type='GENE_SYMBOL_OTHER'), Row(component_synonym='AURKB', syn_type='GENE_SYMBOL'), Row(component_synonym='Aurora 1', syn_type='UNIPROT'), Row(component_synonym='Aurora- and IPL1-like midbody-associated protein 1', syn_type='UNIPROT'), Row(component_synonym='Aurora/IPL1-related kinase 2', syn_type

In [25]:
# # What does protein_classifications mean?

# target_component_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_target_component.jsonl"
# target_component = spark.read.json(target_component_path)
# target_component.persist()
# target_component.show()

In [27]:
# Protein class per UniProt accession (path is relative to the notebook's working directory).
# Despite its name, proteinclass_path holds the loaded pandas DataFrame, not a path.
proteinclass_path = pd.read_csv("../data/uniprot2family.csv")
# NOTE(review): astype(str) also turns missing values into literal strings; the "None"
# class visible in the proteinClass counts presumably comes from here - confirm before
# treating it as a real class.
proteinclass_str = proteinclass_path.astype(str).drop_duplicates()
proteinclass = spark.createDataFrame(proteinclass_str)

proteinclass_list = ["proteinClass"]

# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
uniprot_to_class = join_dataframes(targets_with_relation, proteinclass, "accession", "accession", proteinclass_list)

24/01/23 13:29:23 WARN CacheManager: Asked to cache already cached data.


In [29]:
# Distribution of rows over protein classes
show_unique_values_and_counts(uniprot_to_class, 'proteinClass')



+--------------+-----+
|  proteinClass|count|
+--------------+-----+
|        Enzyme| 5465|
|          GPCR| 1789|
|        Kinase| 7317|
|            TF|  208|
|          None| 1664|
|            IC|  359|
|            NR|  660|
|    Epigenetic|  768|
|   Transporter|  583|
|TF; Epigenetic|   29|
|          null|    5|
+--------------+-----+



                                                                                

##### TODO: targets with the "None" classification may need to be classified based on accession_related_aggr

In [30]:
# Row count after attaching the protein class
uniprot_to_class.count()

18847

In [29]:
# uniprot_to_class.write.parquet("gs://ot-team/polina/uniprot_to_class")

## Drug activity threshold based on pchembl_value and proteinClass

### Dataset with only pchembl values

In [31]:
# Keep only the measurements that report a pchembl_value
pchembl_value_only = uniprot_to_class.filter(col("pchembl_value").isNotNull())
# pchembl_value_only_n = pchembl_value_only.count()
# d_t_n = input.count()

# print("Number of drug-target pairs:", d_t_n)
# print("Number of drug-target pairs with pchembl_value:", pchembl_value_only_n)

### For each target–drug pair, add new columns:
####     1. max_pchembl_value
####     2. median_pchembl_value

In [32]:
# Per (target, drug) pair: maximum and approximate median of the pchembl values.
# The values are cast to double explicitly: spark.read.json may infer pchembl_value as a
# string, in which case max() would compare lexicographically (e.g. "9.5" > "10.2").
# The cast is a no-op when the column is already numeric.
pchembl_value_aggr = pchembl_value_only.groupBy("target_chembl_id", "drugId")\
                  .agg(f.max(col("pchembl_value").cast("double")).alias("max_pchembl_value"),
                       f.expr("percentile_approx(cast(pchembl_value as double), 0.5)").alias("median_pchembl_value"))

# Attach the per-pair aggregates to every row of the original data for that pair
pchembl_value_join = pchembl_value_aggr.join(uniprot_to_class, ["target_chembl_id", "drugId"], "left_outer")

In [33]:
# Leave only unique drug-target pairs with pchembl value

# Per (target, drug) pair: every pchembl value joined into one comma-separated string
pchembl_value_concat = (
    pchembl_value_join
    .groupBy("target_chembl_id", "drugId")
    .agg(concat_ws(", ", collect_list("pchembl_value")).alias("pchembl_value_aggr"))
)

# Remove the per-measurement columns, then keep a single row per pair.
# NOTE(review): the row kept for a pair determines the remaining per-assay columns
# (e.g. assay_chembl_id) - confirm that any row of the pair is acceptable.
pchembl_value_drop = pchembl_value_join.drop(
    "standard_type", "standard_units", "standard_value", "standard_relation", "pchembl_value"
)
pchembl_value_uniq = pchembl_value_drop.dropDuplicates(["target_chembl_id", "drugId"])

pchembl_value_concat_join = pchembl_value_concat.join(
    pchembl_value_uniq, ["target_chembl_id", "drugId"], "left_outer"
)

# pchembl_value_concat_join.show()

### Activity cutoff for max_pchembl_value and median_pchembl_value

In [34]:
max_pchembl_value = col("max_pchembl_value")
med_pchembl_value = col("median_pchembl_value")

# Minimum pchembl value at which a drug counts as active, per protein class
PCHEMBL_ACTIVITY_CUTOFFS = {
    "Kinase": 7.7,
    "GPCR": 6.5,
    "NR": 6.1,
    "Transporter": 6.1,
    "Enzyme": 5.2,
    "IC": 4.6,
    "Other": 6.3,
}
# Cutoff for every protein class that is not listed above
DEFAULT_PCHEMBL_ACTIVITY_CUTOFF = 5


def _activity_flag(pchembl_column):
    """
    Build the "TRUE"/"FALSE" activity flag for one pchembl column.

    A row is "TRUE" when ``pchembl_column`` reaches the cutoff of the row's
    proteinClass, or the default cutoff when the class is not listed in
    PCHEMBL_ACTIVITY_CUTOFFS. Rows whose proteinClass is null use the default
    cutoff too; without the explicit isNull() check the "not in list" test
    evaluates to null for them and they could never be flagged active.
    """
    protein_class = col("proteinClass")
    unlisted_class = protein_class.isNull() | ~protein_class.isin(list(PCHEMBL_ACTIVITY_CUTOFFS))

    is_active = unlisted_class & (pchembl_column >= DEFAULT_PCHEMBL_ACTIVITY_CUTOFF)
    for class_name, cutoff in PCHEMBL_ACTIVITY_CUTOFFS.items():
        is_active = is_active | ((protein_class == class_name) & (pchembl_column >= cutoff))

    # The flags are the strings "TRUE"/"FALSE", not booleans
    return when(is_active, "TRUE").otherwise("FALSE")


# Flag based on the maximum pchembl value of the pair
pchembl_max_activity = pchembl_value_concat_join.withColumn(
    "isActive_max", _activity_flag(max_pchembl_value)
)

# Flag based on the median pchembl value of the pair
pchembl_activity = pchembl_max_activity.withColumn(
    "isActive_med", _activity_flag(med_pchembl_value)
)

# max_active = pchembl_activity.filter(col("isActive_max") == True).count()
# med_active = pchembl_activity.filter(col("isActive_med") == True).count()
# all_pchembl = pchembl_activity.count()

# print("Number of drug-target pairs with pchembl values: ", all_pchembl)
# print("Number of drug-target pairs with active drugs based on max pchembl values: ", max_active)
# print("Number of drug-target pairs with active drugs based on med pchembl values: ", med_active)

## Non-pharmacological MoA search

### Target is in MoA of a drug?

In [35]:
mechanism_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_mechanism.jsonl"
mechanism = spark.read.json(mechanism_path)
mechanism.persist()
mechanism.show()

# Give the mechanism columns a "_moa" suffix: these action types and target ids come from
# the mechanism-of-action table and must not be confused with the assay-derived columns
mechanism_renamed = (
    mechanism
    .withColumnRenamed("action_type", "action_type_moa")
    .withColumnRenamed("target_chembl_id", "target_chembl_id_moa")
)

                                                                                

+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|       action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL2103825],...|         INHIBITOR|Pancreatic lipase...|[{16953261, PubMe...|     CHEMBL2103825|            CHEMBL2103825|  1699800|      CHEMBL1812|
|{[CHEMBL1200495, ...|           AGONIST|Glucocorticoid re...|[{setid=6d9bf1b0-...|     CHEMBL1200495|                CHEMBL977|  1344612|      CHEMBL2034|
|{[CHEMBL3544919],...|SEQUESTERING AGENT|Heparin sequester...|[{26937198, PubMe...|     CHEMBL3544919|            CHEMBL3544919|  2473107|   CHEMBL2364712|
|{[CHEMBL3989993],...|         INHIBITOR|microRNA-155 inhi...|[{

##### TODO: map drugs through both parent_molecule_chembl_id and molecule_chembl_id

In [36]:
# Columns of the (renamed) mechanism table to attach, matched on molecule_chembl_id
list_mechanism = ["action_type_moa",
            "target_chembl_id_moa"]

# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
drug_to_moa = join_dataframes(pchembl_activity, mechanism_renamed, "drugId", "molecule_chembl_id", list_mechanism)

24/01/23 13:33:10 WARN CacheManager: Asked to cache already cached data.


#### TODO: this code needs to be optimised

In [40]:
# Rename the mechanism columns again, this time for the match on parent_molecule_chembl_id
mechanism_renamed_parent = mechanism.withColumnRenamed("action_type", "action_type_moa_parent")\
                            .withColumnRenamed("target_chembl_id", "target_chembl_id_moa_parent")

list_mechanism_parent = ["action_type_moa_parent",
            "target_chembl_id_moa_parent"]

# join_dataframes() already persists its result; persisting it a second time here only
# produced Spark's "Asked to cache already cached data" warning.
drug_to_moa_parent = join_dataframes(drug_to_moa, mechanism_renamed_parent, "drugId", "parent_molecule_chembl_id", list_mechanism_parent)

24/01/23 13:58:58 WARN CacheManager: Asked to cache already cached data.


In [47]:
# Preview the rows with both the direct and the parent-molecule MoA columns attached
drug_to_moa_parent.show()

+----------------+-------------+------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------+---------+-----------------------------+----------------------+------------+------------+------------+---------------+--------------------+----------------------+---------------------------+
|target_chembl_id|       drugId|pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|     crossReferences|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   target_type|accession|r

In [67]:
# Merge the parent-based and direct MoA columns into one comma-separated column each
drug_to_moa_parent_merge = (
    drug_to_moa_parent
    .withColumn(
        "action_type_moa_merge",
        concat_ws(", ", "action_type_moa_parent", "action_type_moa"),
    )
    .withColumn(
        "target_chembl_id_moa_merge",
        concat_ws(", ", "target_chembl_id_moa_parent", "target_chembl_id_moa"),
    )
)

# Collect the merged values of all rows belonging to the same drug
drug_to_moa_parent_aggr = (
    drug_to_moa_parent_merge
    .groupBy("drugId")
    .agg(
        collect_list("action_type_moa_merge").alias("action_type_moa_aggr"),
        collect_list("target_chembl_id_moa_merge").alias("target_chembl_id_moa_aggr"),
    )
)

# Define a UDF to remove duplicates within a concatenated string
def remove_internal_duplicates(concatenated_str):
    """
    Removes repeated items from a ", "-separated string.

    Args:
    concatenated_str (str): Items joined with ", ". May be None.

    Returns:
    str: The distinct, non-empty items joined with ", " in order of first
    appearance. None when the input is None.
    """
    if concatenated_str is None:
        return None

    # dict.fromkeys de-duplicates while keeping first-seen order, so the same
    # input always yields the same string (a set has no defined order).
    # Empty tokens are skipped: they appear when one of the joined pieces was
    # an empty string and would otherwise leave stray ", " separators behind.
    unique_items = dict.fromkeys(
        item for item in concatenated_str.split(", ") if item
    )
    return ", ".join(unique_items)

# Wrap the de-duplication helper as a Spark UDF that returns a string
remove_internal_duplicates_udf = udf(remove_internal_duplicates, returnType=StringType())

# Flatten each collected list into one ", "-separated string, then strip the
# repeated items from it
drug_to_moa_parent_duplicates = (
    drug_to_moa_parent_aggr
    .withColumn(
        "action_type_moa_aggr",
        remove_internal_duplicates_udf(concat_ws(", ", "action_type_moa_aggr")),
    )
    .withColumn(
        "target_chembl_id_moa_aggr",
        remove_internal_duplicates_udf(concat_ws(", ", "target_chembl_id_moa_aggr")),
    )
)

# Attach the per-drug aggregates to every original row, discard the four
# row-level MoA columns they replace, and drop rows that are now identical
drug_to_moa_final = (
    drug_to_moa_parent
    .join(drug_to_moa_parent_duplicates, on=["drugId"], how="left")
    .drop(
        "action_type_moa_parent",
        "action_type_moa",
        "target_chembl_id_moa_parent",
        "target_chembl_id_moa",
    )
    .dropDuplicates()
)

drug_to_moa_final.show()
drug_to_moa_final.count()

+-------------+----------------+------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+---------------+---------+-----------------------------+----------------------+------------+------------+------------+--------------------+-------------------------+
|       drugId|target_chembl_id|pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|     crossReferences|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|    target_type|accession|related_target_chembl_id_aggr|accession_

                                                                                

4647

In [37]:
# # Concat moa to 1 row
# drug_to_moa_concat = drug_to_moa.groupBy("target_chembl_id", "drugId")\
#     .agg(concat_ws(", ", array_distinct(collect_list("target_chembl_id_moa"))).alias("target_chembl_id_moa_aggr"))

# drug_to_moa_drop = drug_to_moa.drop("target_chembl_id_moa")
# drug_to_moa_uniq = drug_to_moa_drop.dropDuplicates(["target_chembl_id", "drugId"])

# drug_to_moa_join = drug_to_moa_concat.join(drug_to_moa_uniq, ["target_chembl_id", "drugId"], "left_outer")

# drug_to_moa_join.count()

                                                                                

4647

In [None]:
# mechanism_filtered = mechanism.filter(mechanism["parent_molecule_chembl_id"] == "CHEMBL3545181")
# mechanism_filtered.show()

+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL1200710, ...|  INHIBITOR|Serotonin transpo...|[{setid=4074b555-...|     CHEMBL1200710|                CHEMBL415|  1343314|       CHEMBL228|
+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+



In [None]:
# drug_to_moa_filtered = drug_to_moa_join.filter(drug_to_moa_join["drugId"] == "CHEMBL715")
# # drug_to_moa_filtered.show()
# selected_column = drug_to_moa_filtered.select("target_chembl_id_moa_aggr").collect()
# print(selected_column)

[Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224'), Row(target_chembl_id_moa_aggr='CHEMBL2331075, CHEMBL225, CHEMBL224')]


In [None]:
# drug_to_moa_join.printSchema()

root
 |-- target_chembl_id: string (nullable = true)
 |-- drugId: string (nullable = true)
 |-- target_chembl_id_moa_aggr: string (nullable = false)
 |-- max_pchembl_value: string (nullable = true)
 |-- median_pchembl_value: double (nullable = true)
 |-- drugType: string (nullable = true)
 |-- maximumClinicalTrialPhase: double (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- linkedTargets: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- linkedDiseases: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- assay_chembl_id: string (nullable = true)
 |-- assay_type: string (nullable = true)
 |-- action_type: struct (nullable = true)
 |    |-- action_type: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- parent_type: st

In [39]:
# drug_to_moa_filtered = drug_to_moa_join.filter(drug_to_moa_join["drugId"] == "CHEMBL715")
# # drug_to_moa_filtered.show()
# selected_column = drug_to_moa_filtered.select("linkedTargets").collect()
# print(selected_column)

[Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246', 'ENSG00000102468', 'ENSG00000149295', 'ENSG00000151577', 'ENSG00000069696'])), Row(linkedTargets=Row(count=5, rows=['ENSG00000147246'

##### Note: linkedTargets is not used as a source of MoA because it contains only Ensembl IDs

In [44]:
# Add a new column 'isInMoA':
#   - null when linkedTargets equals the empty string
#   - otherwise true/false depending on whether the row's target_chembl_id is
#     one of the ", "-separated IDs in target_chembl_id_moa_aggr
# NOTE(review): drug_to_moa_join is only assigned in a commented-out cell above,
# so this cell fails on a fresh kernel - confirm which frame should be used.
# NOTE(review): the printed schema shows linkedTargets as a struct; confirm that
# comparing it with "" behaves as intended.
drug_to_moa_isInMoA = drug_to_moa_join.withColumn(
    "isInMoA",
    f.when(f.col("linkedTargets") == "", None).otherwise(
        f.array_contains(
            f.split(f.col("target_chembl_id_moa_aggr"), ", "),
            f.col("target_chembl_id"),
        )
    ),
)

# drug_to_moa_isInMoA.count()

In [None]:
# drug_to_moa_isInMoA_filtered = drug_to_moa_isInMoA.filter(drug_to_moa_isInMoA["drugId"] == "CHEMBL2105717")

# selected_column = drug_to_moa_isInMoA_filtered.select("isInMoA", "drugId", "target_chembl_id_moa_aggr", "target_chembl_id")
# selected_column.show()

+-------+-------------+-------------------------+----------------+
|isInMoA|       drugId|target_chembl_id_moa_aggr|target_chembl_id|
+-------+-------------+-------------------------+----------------+
|  false|CHEMBL2105717|     CHEMBL279, CHEMBL...|      CHEMBL1936|
|  false|CHEMBL2105717|     CHEMBL279, CHEMBL...|      CHEMBL4722|
|   true|CHEMBL2105717|     CHEMBL279, CHEMBL...|      CHEMBL3717|
|   true|CHEMBL2105717|     CHEMBL279, CHEMBL...|       CHEMBL279|
|  false|CHEMBL2105717|     CHEMBL279, CHEMBL...|       CHEMBL203|
+-------+-------------+-------------------------+----------------+



### Targets: Evidence type: sources + classification for GE, clinical_GE, probes (boolean)

In [45]:
# Irene's table: one row per drugId / uniprotId with evidence sources and probe flags
evidence_path = "gs://ot-team/irene/drug_to_target"
evidence = spark.read.parquet(evidence_path).persist()
evidence.show()

[Stage 109:>                                                        (0 + 1) / 1]

+----------+---------+---------------+--------------------+------------------+-------------------+
|    drugId|uniprotId|       targetId|             sources|isHighQualityProbe|isTherapeuticTarget|
+----------+---------+---------------+--------------------+------------------+-------------------+
|CHEMBL1000|   O00167|ENSG00000064655|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O00555|ENSG00000141837|[uniprot_literatu...|             false|              false|
|CHEMBL1000|   O14633|ENSG00000159455|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O60706|ENSG00000069431|            [chembl]|             false|               true|
|CHEMBL1000|   P00352|ENSG00000165092|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   P01567|ENSG00000214042|            [chembl]|             false|               true|
|CHEMBL1000|   P04155|ENSG00000160182|[ot_genetics_portal]|             false|              false|
|CHEMBL100

                                                                                

In [47]:
# Join by drugId and uniprotId = accession
# NOTE(review): the recorded run logged "Not enough space to cache broadcast_64
# in memory" right after this join - check how the helper joins the two frames.
evidence_list = [
    "targetId",
    "sources",
    "isHighQualityProbe",
    "isTherapeuticTarget",
]

target_evidence = join_dataframes_by_many_cols(
    drug_to_moa_isInMoA,
    evidence,
    ["drugId", "accession"],
    ["drugId", "uniprotId"],
    evidence_list,
).persist()
# target_evidence.show()

In [48]:
# count() is an action, so the persisted join is actually computed here.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt).
target_evidence.count()

23/12/20 13:16:01 WARN MemoryStore: Not enough space to cache broadcast_64 in memory! (computed 6.6 GiB so far)
23/12/20 13:16:01 WARN BlockManager: Persisting block broadcast_64 to disk instead.
ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/miniconda3/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/miniconda3/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/miniconda3/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [49]:
# Classification of targets by genetic evidence support
def is_ge(sources):
    """
    Tells whether a row has at least one evidence source other than
    'chembl' and 'chemicalProbes'.

    Args:
    sources (list): Evidence source names for the row. May be None.

    Returns:
    bool: True when some source besides 'chembl' / 'chemicalProbes' is present.
    False for None, for an empty list, and for any combination made up only
    of those two sources.
    """
    # No sources at all means there is nothing to support the target.
    if not sources:
        return False

    # Set comparison is independent of element order and of repeated entries,
    # unlike matching against a fixed enumeration of lists.
    excluded_sources = {'chembl', 'chemicalProbes'}
    return not set(sources) <= excluded_sources

def is_ge_clinical(sources):
    """
    Tells whether a row has at least one evidence source other than
    'chemicalProbes'.

    Args:
    sources (list): Evidence source names for the row. May be None.

    Returns:
    bool: True when some source besides 'chemicalProbes' is present.
    False for None, for an empty list, and for lists holding only
    'chemicalProbes' (however many times it is repeated).
    """
    # No sources at all means there is nothing to support the target.
    if not sources:
        return False

    return any(source != 'chemicalProbes' for source in sources)

def contains_chemical_probes(sources):
    """
    Tells whether 'chemicalProbes' is among the given sources.

    Args:
    sources (list): Evidence source names for the row. May be None or empty.

    Returns:
    bool: True when 'chemicalProbes' is present. False otherwise, including
    for None and empty input.
    """
    if not sources:
        return False
    return 'chemicalProbes' in sources

# Expose the three Python classifiers to Spark as boolean-returning UDFs
is_ge_udf = udf(is_ge, returnType=BooleanType())
is_ge_clinical_udf = udf(is_ge_clinical, returnType=BooleanType())
contains_chemical_probes_udf = udf(contains_chemical_probes, returnType=BooleanType())

# Derive one boolean flag column per classifier from the 'sources' column
target_evidence_bool = (
    target_evidence
    .withColumn("isGE", is_ge_udf("sources"))
    .withColumn("isGE_clinical", is_ge_clinical_udf("sources"))
    .withColumn("isProbe", contains_chemical_probes_udf("sources"))
)

# target_evidence_bool.show()
# target_evidence_bool.count()

23/12/20 13:16:38 WARN MemoryStore: Not enough space to cache broadcast_64 in memory! (computed 6.6 GiB so far)
                                                                                

In [51]:
# Save the classified table to GCS as parquet.
# NOTE(review): no save mode is set, so Spark's default applies and the write is
# expected to fail if this path already exists - confirm before re-running.
target_evidence_bool.write.parquet("gs://ot-team/polina/target_evidence_bool_4")

23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_116_70 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_160_143 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_237_141 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_160_105 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_237_162 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_283_129 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_143_137 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_237_45 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_283_144 !
23/12/20 13:24:07 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_137_160 !
23/12/20 13:24:07 WARN BlockManagerMasterE

### Counts of targets with non-pharmacological MoA

In [None]:
# How many targets:
#   have assays for active drugs (IsActive_max is true)
#   not in MoA of these drugs
#   supported by GE or clinical evidence
# NOTE(review): the MoA condition keeps rows whose isInMoA is NULL, not rows
# where it is false - confirm this is what "not in MoA" should mean.

target_evidence_no_moa = target_evidence_bool.filter(
    (target_evidence_bool["IsActive_max"] == True)
    & target_evidence_bool["isInMoA"].isNull()
    & (target_evidence_bool["isGE_clinical"] == True)
)

count_unique_values(
    target_evidence_no_moa.drop_duplicates(["target_chembl_id"]),
    "target_chembl_id",
)

                                                                                

135

In [None]:
# How many targets:
#   have assays for active drugs (IsActive_max is true)
#   not in MoA of these drugs
#   supported by GE
# NOTE(review): the MoA condition keeps rows whose isInMoA is NULL, not rows
# where it is false - confirm this is what "not in MoA" should mean.

target_evidence_no_moa = target_evidence_bool.filter(
    (target_evidence_bool["IsActive_max"] == True)
    & target_evidence_bool["isInMoA"].isNull()
    & (target_evidence_bool["isGE"] == True)
)

count_unique_values(
    target_evidence_no_moa.drop_duplicates(["target_chembl_id"]),
    "target_chembl_id",
)

                                                                                

83

In [None]:
# How many targets:
#   have assays for active drugs (pchembl median, IsActive_med is true)
#   not in MoA of these drugs
#   supported by GE or clinical evidence
# NOTE(review): the MoA condition keeps rows whose isInMoA is NULL, not rows
# where it is false - confirm this is what "not in MoA" should mean.

target_evidence_no_moa = target_evidence_bool.filter(
    (target_evidence_bool["IsActive_med"] == True)
    & target_evidence_bool["isInMoA"].isNull()
    & (target_evidence_bool["isGE_clinical"] == True)
)

count_unique_values(
    target_evidence_no_moa.drop_duplicates(["target_chembl_id"]),
    "target_chembl_id",
)

                                                                                

133

In [None]:
# Row count of target_evidence_no_moa. The name is reassigned by several cells
# above, so the number depends on which of them ran last.
target_evidence_no_moa.count()

                                                                                

239

In [None]:
# No targets in MoA for probes: rows with a null target_chembl_id_moa that are
# active and flagged as probe, counted by distinct drugId
target_evidence_no_moa_probes = target_evidence_bool.filter(
    target_evidence_bool["target_chembl_id_moa"].isNull()
    & (target_evidence_bool["IsActive"] == True)
    & (target_evidence_bool["isProbe"] == True)
)
count_unique_values(target_evidence_no_moa_probes, "drugId")

                                                                                

438

In [None]:
# Drugs with at least one active row that is flagged as a probe
drug_active_only_probes = target_evidence_bool.filter(
    (target_evidence_bool["isProbe"] == True)
    & (target_evidence_bool["isActive"] == True)
)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

483

In [None]:
# Drugs with at least one active row that is flagged as a high-quality probe
drug_active_only_probes = target_evidence_bool.filter(
    (target_evidence_bool["isHighQualityProbe"] == True)
    & (target_evidence_bool["isActive"] == True)
)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

131

In [None]:
# drugId summary over the whole target_evidence_bool table
# (count_unique_values is a helper defined outside this section).
count_unique_values(target_evidence_bool, "drugId")

                                                                                

2287

In [None]:
# Rows whose isApproved flag differs from "true", counted by drugId
target_evidence_bool_clinical = target_evidence_bool.where(
    target_evidence_bool["isApproved"] != "true"
)
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

596

In [None]:
# Rows of approved drugs (isApproved is true), counted by drugId
target_evidence_bool_clinical = target_evidence_bool.where(
    target_evidence_bool["isApproved"] == True
)
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

                                                                                

850

In [None]:
from pyspark.sql.functions import countDistinct

# Number of distinct drugId values for each isApproved value
drugId_count = (
    target_evidence_bool
    .groupBy("isApproved")
    .agg(countDistinct("drugId").alias("unique_drugId_count"))
)

# Print the per-status counts
drugId_count.show()




+----------+-------------------+
|isApproved|unique_drugId_count|
+----------+-------------------+
|      null|               1160|
|      true|               1318|
|     false|                895|
+----------+-------------------+



                                                                                

In [None]:
# Rows flagged as high-quality probe, counted by drugId
target_evidence_bool_probe_h = target_evidence_bool.where(
    target_evidence_bool["isHighQualityProbe"] == True
)
# target_evidence_bool_probe_h.show()
count_unique_values(target_evidence_bool_probe_h, "drugId")

187

In [None]:
# Rows flagged as probe (isProbe is true), counted by drugId
target_evidence_bool_probe = target_evidence_bool.where(
    target_evidence_bool["isProbe"] == True
)
# target_evidence_bool_probe.show()
count_unique_values(target_evidence_bool_probe, "drugId")

                                                                                

695

# Data coverage

### Drugs

In [None]:
# Filtering by:
#         1. max_phase ≠ 4 | max_phase = 4 | probes = TRUE
#         2. moa = NaN | moa ≠ NaN
#         3. drugActive = TRUE
#         4. GE = TRUE | clinical_GE = TRUE

In [None]:
# Number of clinical candidates/approved drugs/chemical probes for which:
#     1. there is no MoA and they are bioactive against some targets:
#         1. which have GE/GE+clinical evidence for any disease
#     2. there is MoA but they are bioactive against some other targets
#         1. which have GE/GE+clinical evidence for any disease

### Targets

In [None]:
# Dataset with only pchembl value activity
# (preview of target_evidence_bool, the table used for the target counts below)
target_evidence_bool.show()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

In [None]:
# Row counts per drugType value in target_evidence_bool
show_unique_values_and_counts(target_evidence_bool, "drugType")

+---------------+-----+
|       drugType|count|
+---------------+-----+
| Small molecule| 9218|
|Oligosaccharide|    2|
|           null|    1|
|        Protein|  113|
|       Antibody|    4|
|        Unknown|   14|
+---------------+-----+



In [None]:
# Number of targets supported by GE which have active bioassays

targets_GE = target_evidence_bool.filter(
    (target_evidence_bool["IsActive"] == True)
    & (target_evidence_bool["isGE"] == True)
)

count_unique_values(targets_GE, "target_chembl_id")

                                                                                

326

In [None]:
# Number of targets supported by GE_clinical which have active bioassays

targets_GE_clinical = target_evidence_bool.filter(
    (target_evidence_bool["IsActive"] == True)
    & (target_evidence_bool["isGE_clinical"] == True)
)

count_unique_values(targets_GE_clinical, "target_chembl_id")

                                                                                

431

In [None]:
# Number of targets supported by GE which have active bioassays and not in MoA of the drug
# NOTE(review): `targetInMoA != True` also discards rows where targetInMoA is
# null - confirm those rows are meant to be excluded.

targets_GE_noMoA = target_evidence_bool.filter(
    (target_evidence_bool["IsActive"] == True)
    & (target_evidence_bool["isGE"] == True)
    & (target_evidence_bool["targetInMoA"] != True)
)

count_unique_values(targets_GE_noMoA, "target_chembl_id")

                                                                                

115

In [None]:
# Number of targets supported by GE_clinical which have active bioassays and not in MoA of the drug
# NOTE(review): `targetInMoA != True` also discards rows where targetInMoA is
# null - confirm those rows are meant to be excluded.

targets_GE_clinical_noMoA = target_evidence_bool.filter(
    (target_evidence_bool["IsActive"] == True)
    & (target_evidence_bool["isGE_clinical"] == True)
    & (target_evidence_bool["targetInMoA"] != True)
)

count_unique_values(targets_GE_clinical_noMoA, "target_chembl_id")

                                                                                

145

## Action type

In [None]:
# Taking table target_organism_filter (after pchembl filter)
# NOTE(review): the frame actually shown and counted here is target_evidence_bool -
# confirm the comment above is still accurate.

target_evidence_bool.show()
target_evidence_bool.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

9352

In [None]:
# drugId summary over the whole target_evidence_bool table
# (count_unique_values is a helper defined outside this section).
count_unique_values(target_evidence_bool, "drugId")

2287

In [None]:
# Distribution of action_type_moa after keeping one row per (drugId, action_type_moa)
# pair, so each drug contributes at most once to every action type.
show_unique_values_and_counts(target_evidence_bool.drop_duplicates(["drugId", "action_type_moa"]), "action_type_moa")

+--------------------+-----+
|     action_type_moa|count|
+--------------------+-----+
|  NEGATIVE MODULATOR|    1|
|NEGATIVE ALLOSTER...|    4|
|          ANTAGONIST|  166|
|                null| 1442|
|       BINDING AGENT|    1|
|     PARTIAL AGONIST|    8|
|           ACTIVATOR|    1|
|             BLOCKER|   42|
|    DISRUPTING AGENT|    4|
|           INHIBITOR|  491|
|           SUBSTRATE|    1|
|POSITIVE ALLOSTER...|    8|
|     CHELATING AGENT|    1|
|  POSITIVE MODULATOR|    4|
|             AGONIST|  117|
|              OPENER|    5|
|     INVERSE AGONIST|    4|
|     RELEASING AGENT|    1|
|           MODULATOR|    8|
|      REDUCING AGENT|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
# Row counts per value of the struct-valued action_type column in drug_to_moa
show_unique_values_and_counts(drug_to_moa, "action_type")

+--------------------+------+
|         action_type| count|
+--------------------+------+
|                null|211437|
|{INHIBITOR, Negat...|   124|
|{SUBSTRATE, Carri...|    47|
|{ANTAGONIST, Bind...|    41|
|{INVERSE AGONIST,...|     2|
|{AGONIST, Binds t...|     4|
|{ACTIVATOR, Posit...|     1|
+--------------------+------+



### Drug-Target pairs