In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

In [2]:
# import os
# print(os.getcwd())

/home/polina/bioactivity/code


In [3]:
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/11 13:41:04 INFO SparkEnv: Registering MapOutputTracker
23/12/11 13:41:05 INFO SparkEnv: Registering BlockManagerMaster
23/12/11 13:41:05 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
23/12/11 13:41:05 INFO SparkEnv: Registering OutputCommitCoordinator


# Data parsing

In [4]:
def join_dataframes(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_column: str, 
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Selecting specified columns from the second DataFrame, including its key column
    second_df_selected = second_df.select([second_key_column] + columns_to_join)

    # Performing the left join
    joined_df = initial_df.join(second_df_selected, 
                                initial_df[initial_key_column] == second_df_selected[second_key_column], 
                                how='left')

    # Drop the second key column if not needed
    joined_df = joined_df.drop(second_df_selected[second_key_column])

    return joined_df


In [5]:
def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Count unique values in a specific column of a PySpark DataFrame.

    Args:
    df (DataFrame): The PySpark DataFrame.
    column_name (str): The name of the column to analyze.

    Returns:
    int: The number of unique values in the column.
    """
    # Get distinct values in the column and count them
    unique_count = df.select(column_name).distinct().count()

    return unique_count

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [6]:
def show_unique_values_and_counts(df: DataFrame, column_name: str):
    """
    Shows unique values and their counts for a specified column in a Spark DataFrame.

    Parameters:
    df (DataFrame): The Spark DataFrame to analyze.
    column_name (str): The name of the column for which to count unique values.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    unique_values_counts = df.groupBy(column_name).count()
    unique_values_counts.show()


In [7]:
# Take list of unique drugs (obtained from target with evidence in Platform or chemProbes)
import pandas as pd

unique_drugs_pd_df = pd.read_csv("../data/drug_to_target_unique_drugs.csv")
drug_list = spark.createDataFrame(unique_drugs_pd_df)
# drug_list = spark.read.csv(drug_list_dir, header=True, inferSchema=True)
drug_list.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------------+
|       drugId|
+-------------+
|CHEMBL1200632|
|   CHEMBL1231|
|CHEMBL1233511|
|   CHEMBL1637|
|CHEMBL1743017|
| CHEMBL185885|
|CHEMBL1949708|
|CHEMBL2105675|
|CHEMBL2107826|
|CHEMBL2109673|
|CHEMBL2346976|
| CHEMBL279115|
|CHEMBL3181832|
|CHEMBL3545096|
|CHEMBL3545103|
|CHEMBL3545145|
|CHEMBL3545312|
| CHEMBL363648|
|CHEMBL3707249|
|CHEMBL3989766|
+-------------+
only showing top 20 rows



                                                                                

In [8]:
count_unique_values(drug_list, "drugId")

                                                                                

12835

### For each drug find a max phase of clinical trial

In [9]:
molecule_path = "gs://open-targets-data-releases/23.12/output/etl/json/molecule"
molecule = spark.read.json(molecule_path)
molecule.persist()
molecule.show()

[Stage 9:>                                                          (0 + 1) / 1]

+---------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+-------------+--------------------+----------+--------------------+--------------------+-------------------------+--------------------+-------------+--------------------+--------------------+-------------------+
+---------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+-------------+--------------------+----------+--------------------+--------------------+-------------------------+--------------------+-------------+--------------------+--------------------+-------------------+
|          false|Cc1cc(CN2CCN(c3c(...|                null|                null|Small molecule drug.|Small molecule|           false|CHEMBL1086582|UUGWPYPNRZQDFO-UH...|      null|                null|                null|                     null|       CHEMBL1086582|         null|                  []|       

                                                                                

In [10]:
# List of columns from molecule table
list_molecule = ["drugType", 
                "maximumClinicalTrialPhase", 
                "isApproved", 
                "linkedTargets", 
                "linkedDiseases"]
                 
# Join list of drugs and max_phase from molecule table
drug_list_phase = join_dataframes(drug_list, molecule, "drugId", "id", list_molecule).persist()
drug_list_phase.show()

                                                                                

+-------------+--------------+-------------------------+----------+--------------------+--------------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+
|CHEMBL1200632|Small molecule|                      4.0|      true|             {0, []}|{3, [EFO_0003894,...|
|   CHEMBL1231|Small molecule|                      4.0|      true|{2, [ENSG00000133...|{15, [EFO_1000781...|
|CHEMBL1233511|Small molecule|                      3.0|     false|                null|  {1, [EFO_0002950]}|
|   CHEMBL1637|Small molecule|                      4.0|      true|{14, [ENSG0000010...|{54, [MONDO_00081...|
|CHEMBL1743017|      Antibody|                      2.0|     false|{1, [ENSG00000127...|{3, [EFO_0000676,...|
| CHEMBL185885|Small molecule|                      2.0|     false|                null|{2, [EFO_0009444,...|
|CHEMBL194

In [14]:
approved_drugs = drug_list_phase.filter(col("maximumClinicalTrialPhase") == 4.0)
count_unique_values(approved_drugs, "drugId")

3340

### For each unique drug find bioactivity data from chembl_33_activity

In [15]:
activity_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_activity.jsonl"
activity = spark.read.json(activity_path)
activity.persist()
activity.show()

[Stage 19:>                                                         (0 + 1) / 1]

+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+------------------+-------------+-------------+-----------------+-------------------+-------------+--------------+--------------+----------------+--------------------+--------------------+
|           _metadata|action_type|assay_chembl_id|assay_type|assay_variant_accession|assay_variant_mutation|data_validity_comment|data_validity_description|document_chembl_id|document_year|ligand_efficiency|molecule_chembl_id|pchembl_value|standard_flag|standard_relation|standard_text_value|standard_type|standard_units|standard_value|target_chembl_id|     target_organism|    target_pref_name|
+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+-

                                                                                

In [16]:
# List of columns from activity table
list_activity = ["assay_chembl_id",
                "assay_type",
                "action_type",
                "pchembl_value",
                "standard_type",
                "standard_units",
                "standard_value",
                "standard_relation",
                "target_organism",
                "target_pref_name",
                "target_chembl_id"]
                # "standard_flag",
                # "ligand_efficiency",
                # "assay_variant_mutation"
                # "assay_variant_accession",
                # "data_validity_comment",
                # "data_validity_description"]

In [17]:
# Join list of drugs and chembl_33_activity
drug_to_activity = join_dataframes(drug_list_phase, activity, "drugId", "molecule_chembl_id", list_activity).filter(col("assay_chembl_id").isNotNull()).persist()
drug_to_activity.show()

+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|     target_organism|    target_pref_name|target_chembl_id|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+
|    CHEMBL106|Small molecule|                      4.0|      true|             {0, []}|{36, [EFO_0007228...|   CHEMBL958797|         F|       null|

In [23]:
drug_to_activity.count()

168729

In [18]:
# Calculate for how many drugs and targets we have biodata

drug_list_count = count_unique_values(drug_list, 'drugId')
drug_to_activity_count = count_unique_values(drug_to_activity, 'drugId')
drug_to_activity_count_targets = count_unique_values(drug_to_activity, 'target_chembl_id')

print("Number of unique drugs from targets dataset: ", drug_list_count)
print("Number of unique drugs with any bioactivities: ", drug_to_activity_count)
print("Number of unique targets with any drug bioactivities: ", drug_to_activity_count_targets)

                                                                                

Number of unique drugs from targets dataset:  12835
Number of unique drugs with any bioactivities:  6215
Number of unique targets with any drug bioactivities:  4848


### For each bioactivity assay find parameters from chembl_33_assay

In [19]:
assay_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_assay.jsonl"
assay = spark.read.json(assay_path)
assay.persist()
assay.show()

[Stage 41:>                                                         (0 + 1) / 1]

+--------------------+--------------+---------------+--------------------+---------------+----------+----------------------+----------------+----------------+
|           _metadata|assay_category|assay_chembl_id|      assay_organism|assay_test_type|assay_type|confidence_description|confidence_score|variant_sequence|
+--------------------+--------------+---------------+--------------------+---------------+----------+----------------------+----------------+----------------+
|{{0 - Default val...|          null|   CHEMBL688540|                null|           null|         F|  Default value - T...|               0|            null|
|{{8 - Homologous ...|  confirmatory|  CHEMBL2114731|Mycobacterium tub...|           null|         F|  Homologous single...|               8|            null|
|{{8 - Homologous ...|          null|   CHEMBL643834|                null|           null|         B|  Homologous single...|               8|            null|
|{{8 - Homologous ...|  confirmatory|  CHEMBL2

                                                                                

In [20]:
# List of columns from assay table
list_assay = ["confidence_score",
            "confidence_description",
            "assay_category"]

In [21]:
drug_to_assay = join_dataframes(drug_to_activity, assay, "assay_chembl_id", "assay_chembl_id", list_assay).persist()
drug_to_assay.show()



+------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+----------------+----------------------+--------------+
|      drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|     target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|
+------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+----------------+----------------------+--

                                                                                

In [22]:
drug_to_assay.count()

                                                                                

168729

## Assay filters

In [26]:
assay_type_filter = drug_to_assay.filter(
    (col("assay_type") != "P") &
    (col("assay_type") != "U"))
assay_type_d = count_unique_values(assay_type_filter, "drugId")
assay_type_t = count_unique_values(assay_type_filter, "target_chembl_id")

confidence_score_filter = assay_type_filter.filter(col("confidence_score").isin([9, 7]))
confidence_score_d = count_unique_values(confidence_score_filter, "drugId")
confidence_score_t = count_unique_values(confidence_score_filter, "target_chembl_id")

target_organism_filter = confidence_score_filter.filter(col("target_organism") == "Homo sapiens")
target_organism_d = count_unique_values(target_organism_filter, "drugId")
target_organism_t = count_unique_values(target_organism_filter, "target_chembl_id")

print("Unique drugs with bioactivities for non P and U assays:", assay_type_d)
print("Unique targets with bioactivities for non P and U assays:", assay_type_t)

print("Unique drugs with bioactivities for single/homolog proteins:", confidence_score_d)
print("Unique targets with bioactivities for single/homolog proteins:", confidence_score_t)

print("Unique drugs with bioactivities for human targets:", target_organism_d)
print("Unique targets with bioactivities for human targets:", target_organism_t)


                                                                                

Unique drugs with bioactivities for non P and U assays: 6174
Unique targets with bioactivities for non P and U assays: 4848
Unique drugs with bioactivities for single/homolog proteins: 3663
Unique targets with bioactivities for single/homolog proteins: 2136
Unique drugs with bioactivities for human targets: 3373
Unique targets with bioactivities for human targets: 1369


In [42]:
target_organism_filter.count()

18860

### Protein classification

####  map target_chembl_id to uniprots via accession

In [44]:
target_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_target.jsonl"
target = spark.read.json(target_path)
target.persist()
target.show()



+--------------------+--------------------+----------------+--------------------+---------------+
|           _metadata|           pref_name|target_chembl_id|   target_components|    target_type|
+--------------------+--------------------+----------------+--------------------+---------------+
|{[{Sodium channel...|Sodium channel pr...|   CHEMBL4630763|[{Q14524, Sodium ...|PROTEIN COMPLEX|
|{[{A121, 100}, {H...|                A121|    CHEMBL613106|                  []|      CELL-LINE|
|{[{Spermatozoa, 1...|         Spermatozoa|    CHEMBL614870|                  []|      CELL-LINE|
|{[{Thioredoxin re...|Thioredoxin reduc...|      CHEMBL2403|[{Q9NNW7, Thiored...| SINGLE PROTEIN|
|{[{Fusarium oxysp...|  Fusarium oxysporum|    CHEMBL612648|                  []|       ORGANISM|
|{[{A375-SM, 100},...|             A375-SM|   CHEMBL4513121|                  []|      CELL-LINE|
|{[{Receptor-inter...|Receptor-interact...|   CHEMBL3784911|[{Q60855, Recepto...| SINGLE PROTEIN|
|{[{HCC44, 100}, {..

23/12/11 14:02:39 WARN CacheManager: Asked to cache already cached data.        


In [45]:
# List of columns from target table
list_target = ["target_components",
            "target_type"]

In [46]:
target_to_uniprot = join_dataframes(target_organism_filter, target, "target_chembl_id", "target_chembl_id", list_target).persist()
target_to_uniprot.show()

+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+------------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------+----------------------+--------------+--------------------+--------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|     standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|   target_components|   target_type|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+------------------+--------------+--------------+-----------------+---------------+-----

23/12/11 14:02:50 WARN CacheManager: Asked to cache already cached data.


In [47]:
# Explode target_components
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import Row

# Define a UDF to extract the 'accession' field
def extract_accession(rows):
    # Assuming you want to extract the 'accession' from the first Row object in the list
    return rows[0].accession if rows else None

# Register UDF
extract_accession_udf = udf(extract_accession, StringType())

In [48]:
# Apply UDF to create a new column with the 'accession' values
target_to_uniprot_extr = target_to_uniprot.withColumn("accession", extract_accession_udf(target_to_uniprot["target_components"]))

target_to_uniprot_extr.show()

+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+------------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------+----------------------+--------------+--------------------+--------------+---------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|     standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|   target_components|   target_type|accession|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+------------------+--------------+--------------+-----------------+-

In [49]:
target_to_uniprot_extr.count()

18860

####  temporal decision - get accession id from Uniprot web

In [24]:
# # Obtain the list of unique targets to map them to uniprots

# from pyspark.sql import SparkSession
# import pandas as pd

# def save_column_as_csv(df, column_name, output_path):
#     """
#     This function takes a Spark DataFrame, a column name, and an output path.
#     It will remove duplicates based on the specified column, convert the unique values of that column to a Pandas DataFrame,
#     and save it as a CSV file.

#     :param df: Spark DataFrame
#     :param column_name: The name of the column to process
#     :param output_path: The path to save the CSV file
#     """
#     # Ensure the column exists in the DataFrame
#     if column_name not in df.columns:
#         raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

#     # Select the specified column and drop duplicates
#     unique_values_df = df.select(column_name).distinct()

#     # Convert the Spark DataFrame to a Pandas DataFrame
#     unique_values_pd_df = unique_values_df.toPandas()

#     # Save the Pandas DataFrame as a CSV file
#     unique_values_pd_df.to_csv(output_path, index=False)


# save_column_as_csv(drug_to_assay, "target_chembl_id", "files/target_chembl_id.csv")

####  temporal decision - get accession id from Barabara's table

In [25]:
# # Get accession for target_chembl_id from Barbara's table

# barbara_table_path = pd.read_csv("../data/drug2target_bioactivities_chembl_33_grouped.csv")
# columns = ['accession', 'target_chembl_id']
# filtered_table = barbara_table_path[columns].astype(str).drop_duplicates()
# barbara_table = spark.createDataFrame(filtered_table)
# barbara_table.show()


In [26]:
# # Join target_chembl_id and accession from Barbara's table

# list_protein = ["accession"]
              
# target_to_uniprot = join_dataframes(drug_to_assay, barbara_table, "target_chembl_id", "target_chembl_id", list_protein).persist()

# # from pyspark.sql.functions import col
# # target_to_uniprot = target_to_uniprot.filter(col("accession") != "null")

# target_to_uniprot.show()

In [27]:
# target_to_uniprot.count()

### Protein classification by uniprot from SwissProt

In [50]:
# What does protein_classifications mean?

target_component_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_target_component.jsonl"
target_component = spark.read.json(target_component_path)
target_component.persist()
target_component.show()

+---------+------------+--------------+--------------------+-----------------------+-------------------------+
|accession|component_id|component_type|         description|protein_classifications|target_component_synonyms|
+---------+------------+--------------+--------------------+-----------------------+-------------------------+
|   P17047|       19427|       PROTEIN|Lysosome-associat...|                [{601}]|     [{CD107 antigen-l...|
|   Q05940|         207|       PROTEIN|Synaptic vesicula...|                [{706}]|     [{Monoamine trans...|
|   J7IFZ2|       17409|       PROTEIN|Dihydrofolate red...|                 [{10}]|     [{1.5.1.3, EC_NUM...|
|   Q9TU34|         848|       PROTEIN|Inositol 1,4,5-tr...|               [{1014}]|     [{Inositol 1,4,5-...|
|   Q12852|        5557|       PROTEIN|Mitogen-activated...|               [{1331}]|     [{2.7.11.25, EC_N...|
|   G3V673|       17415|       PROTEIN|A disintegrin-lik...|                  [{1}]|     [{ADAM metallopep...|
|

In [51]:
proteinclass_path = pd.read_csv("../data/uniprot2family.csv")
proteinclass_str = proteinclass_path.astype(str).drop_duplicates()
proteinclass = spark.createDataFrame(proteinclass_str)
proteinclass.show()

+---------+------------+
|accession|proteinClass|
+---------+------------+
|   P32929|      Enzyme|
|   A4D0Y5|        None|
|   Q49A92|        None|
|   Q9UFW8|        None|
|   Q96K31|        None|
|   O14646|  Epigenetic|
|   Q8IWX8|        None|
|   Q99653|        None|
|   O94983|          TF|
|   Q8NA66|        None|
|   Q96M20|        None|
|   Q86VU5|      Enzyme|
|   P42695|        None|
|   Q8IYT2|      Enzyme|
|   Q9NSA3|        None|
|   Q96KP4|      Enzyme|
|   Q13956|      Enzyme|
|   O95476|      Enzyme|
|   Q9BYD5|        None|
|   Q969H4|      Enzyme|
+---------+------------+
only showing top 20 rows



In [54]:
# Protein classification
proteinclass_list = ["proteinClass"]
uniprot_to_class = join_dataframes(target_to_uniprot_extr, proteinclass, "accession", "accession", proteinclass_list).persist()
uniprot_to_class.show()
uniprot_to_class.count()

23/12/11 14:05:06 WARN CacheManager: Asked to cache already cached data.


+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------+----------------------+--------------+--------------------+--------------+---------+------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|   target_components|   target_type|accession|proteinClass|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+--------

18860

In [53]:
show_unique_values_and_counts(uniprot_to_class, 'proteinClass')



+--------------+-----+
|  proteinClass|count|
+--------------+-----+
|        Enzyme| 5476|
|          GPCR| 1789|
|        Kinase| 7317|
|            TF|  209|
|            IC|  359|
|    Epigenetic|  769|
|          None| 1664|
|   Transporter|  583|
|            NR|  660|
|TF; Epigenetic|   29|
|          null|    5|
+--------------+-----+



                                                                                

In [33]:
# uniprot_to_class.write.parquet("data/analysis/v2/uniprot_to_class_temp_v1")

# Data processing

## Activity threshold

In [120]:
# Check how many assays have pchembl_value
pchembl_value_only = uniprot_to_class.filter(uniprot_to_class["pchembl_value"].isNotNull())
pchembl_value_only.show()
pchembl_value_only.count()

+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+------

6250

In [140]:
# Where pchembl_value is available for each T-D pairs make new columns:
#     1. max_pchembl_value
#     2. median_pchembl_value

from pyspark.sql.functions import col, max, expr

# Group by 'target_chembl_id' and 'drugId' and calculate max and median of 'pchembl_value'
pchembl_value_aggr = pchembl_value_only.groupBy("target_chembl_id", "drugId")\
                  .agg(max("pchembl_value").alias("max_pchembl_value"),
                       expr("percentile_approx(pchembl_value, 0.5)").alias("median_pchembl_value"))

pchembl_value_max_med = pchembl_value_only.join(pchembl_value_aggr, ["target_chembl_id", "drugId"])\
                    .filter(pchembl_value_aggr["max_pchembl_value"].isNotNull())

pchembl_value_max_med.show()
pchembl_value_max_med.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|
+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+--------

6250

In [141]:
pchembl_value_aggr.count()

4647

In [143]:
# Custom activity cutoff for max_pchembl_value
drug_active = pchembl_value_max_med.withColumn(
    "isActive",
    when(
        ((col("proteinClass") == "Kinase") & (col("max_pchembl_value") >= 7.7)) |
        ((col("proteinClass") == "GPCR") & (col("max_pchembl_value") >= 6.5)) |
        ((col("proteinClass") == "NR") & (col("max_pchembl_value") >= 6.1)) |
        ((col("proteinClass") == "Transporter") & (col("max_pchembl_value") >= 6.1)) |
        ((col("proteinClass") == "Enzyme") & (col("max_pchembl_value") >= 5.2)) |
        ((col("proteinClass") == "IC") & (col("max_pchembl_value") >= 4.6)) |
        ((col("proteinClass") == "Other") & (col("max_pchembl_value") >= 6.3)) |
        (~(col("proteinClass").isin(["Kinase", "GPCR", "NR", "Transporter", "Enzyme", "IC", "Other"])) & (col("max_pchembl_value") >= 5)),
        "TRUE"
    ).otherwise("FALSE")
)
drug_active.show()
drug_active.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|
+----------------+-------------+--------------+-------------------------+----------+--------------------+-----------

6250

In [144]:
drug_active_only = drug_active.filter(drug_active["isActive"] == True)
drug_active_only.show()
drug_active_only.count()

                                                                                

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|
+----------------+-------------+--------------+-------------------------+----------+--------------------+-----------

3871

In [149]:
show_unique_values_and_counts(target_organism_filter.drop_duplicates(["drugId", "isApproved"]), "isApproved")



+----------+-----+
|isApproved|count|
+----------+-----+
|      null| 1160|
|      true| 1318|
|     false|  895|
+----------+-----+



                                                                                

In [150]:
show_unique_values_and_counts(drug_active_only.drop_duplicates(["drugId", "isApproved"]), "isApproved")

[Stage 1808:>                                                       (0 + 1) / 1]

+----------+-----+
|isApproved|count|
+----------+-----+
|      null|  638|
|      true|  561|
|     false|  430|
+----------+-----+



                                                                                

In [123]:
# Where pchembl_value is not available for each T-D pairs make new columns:
#     1. Calculate how much data is this
#     2. Think about what to do with different standard_units
#     3. Ideally:
#         1. max_standard_value_n
#         2. median_standard_value_n
#         3. Cutoff for every n

no_pchembl_value = uniprot_to_class.filter(uniprot_to_class["pchembl_value"].isNull())
show_unique_values_and_counts(no_pchembl_value, "standard_type")

+--------------------+-----+
|       standard_type|count|
+--------------------+-----+
|     Drug metabolism|   86|
|                Emax|   68|
|          Ratio IC50|   38|
|                IC50| 1654|
|            Activity|  970|
|Thermal melting c...|  108|
|                  Ki| 1034|
|                EC50|   72|
|          Ratio EC50|    5|
|          Inhibition| 1449|
|                GI50|   29|
|                  Km|   35|
|                  Ke|   16|
|                  Kd| 4707|
|                Imax|    8|
|   Residual Activity|  980|
|                  FC|   82|
|               pIC50|    2|
|             Potency|  537|
|              Kinact|   56|
+--------------------+-----+
only showing top 20 rows



In [None]:
# Make column with activity of molecule: drugActive = TRUE/FALSE
#     1. based protein type and:
#         1. max_pchembl_value
#         2. median_pchembl_value
#     2. based on cutoffs for other experiment types

### Drug-target pairs: Target is in MoA of a drug (targetInMoA, boolean)

In [34]:
# # Targets dataset to map accession to Ensembl
# target_path = "gs://open-targets-data-releases/23.12/output/etl/json/targets"
# target = spark.read.json(target_path)
# target.persist()
# target.show()


In [152]:
mechanism_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_mechanism.jsonl"
mechanism = spark.read.json(mechanism_path)
mechanism.persist()
mechanism.show()

+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|       action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL2103825],...|         INHIBITOR|Pancreatic lipase...|[{16953261, PubMe...|     CHEMBL2103825|            CHEMBL2103825|  1699800|      CHEMBL1812|
|{[CHEMBL1200495, ...|           AGONIST|Glucocorticoid re...|[{setid=6d9bf1b0-...|     CHEMBL1200495|                CHEMBL977|  1344612|      CHEMBL2034|
|{[CHEMBL3544919],...|SEQUESTERING AGENT|Heparin sequester...|[{26937198, PubMe...|     CHEMBL3544919|            CHEMBL3544919|  2473107|   CHEMBL2364712|
|{[CHEMBL3989993],...|         INHIBITOR|microRNA-155 inhi...|[{

23/12/11 16:34:31 WARN CacheManager: Asked to cache already cached data.


In [61]:
# # Filtering just to check that molecule_chembl_id can correspond to several targets
# mechanism_filtered = mechanism.filter(mechanism["molecule_chembl_id"] == "CHEMBL1946170")
# mechanism_filtered.show()

+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|           _metadata|action_type| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id|
+--------------------+-----------+--------------------+--------------------+------------------+-------------------------+---------+----------------+
|{[CHEMBL1946170],...|  INHIBITOR|Platelet-derived ...|[{REGORAFENIB, Ex...|     CHEMBL1946170|            CHEMBL1946170|  1679381|   CHEMBL2095189|
|{[CHEMBL1946170],...|  INHIBITOR|Tyrosine-protein ...|[{REGORAFENIB, Ex...|     CHEMBL1946170|            CHEMBL1946170|  1679381|      CHEMBL4223|
|{[CHEMBL1946170],...|  INHIBITOR|Discoidin domain-...|[{REGORAFENIB, Ex...|     CHEMBL1946170|            CHEMBL1946170|  1679381|      CHEMBL5122|
|{[CHEMBL1946170],...|  INHIBITOR|Tyrosine-protein ...|[{REGORAFENIB, Ex...|     CHEMBL1946170|           

In [153]:
# Rename columns
mechanism_renamed = mechanism.withColumnRenamed("action_type", "action_type_moa").withColumnRenamed("target_chembl_id", "target_chembl_id_moa")

# Show the result
mechanism_renamed.show()

+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+--------------------+
|           _metadata|   action_type_moa| mechanism_of_action|      mechanism_refs|molecule_chembl_id|parent_molecule_chembl_id|record_id|target_chembl_id_moa|
+--------------------+------------------+--------------------+--------------------+------------------+-------------------------+---------+--------------------+
|{[CHEMBL2103825],...|         INHIBITOR|Pancreatic lipase...|[{16953261, PubMe...|     CHEMBL2103825|            CHEMBL2103825|  1699800|          CHEMBL1812|
|{[CHEMBL1200495, ...|           AGONIST|Glucocorticoid re...|[{setid=6d9bf1b0-...|     CHEMBL1200495|                CHEMBL977|  1344612|          CHEMBL2034|
|{[CHEMBL3544919],...|SEQUESTERING AGENT|Heparin sequester...|[{26937198, PubMe...|     CHEMBL3544919|            CHEMBL3544919|  2473107|       CHEMBL2364712|
|{[CHEMBL3989993],...|         INHIBITOR

In [154]:
# List of columns from target table
list_mechanism = ["action_type_moa",
            "target_chembl_id_moa"]

drug_to_moa = join_dataframes(drug_active, mechanism_renamed, "drugId", "molecule_chembl_id", list_mechanism).persist()
drug_to_moa.show()
drug_to_moa.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|action_type_moa|target_chembl_id_moa|
+----------------+-------------+----------

9341

In [39]:
# drug_to_moa_filtered = drug_to_moa.filter(mechanism["molecule_chembl_id"] == "CHEMBL1946170")
# drug_to_moa_filtered.show()

In [155]:
# Match target_chembl_id and target_chembl_id_moa within 1 drugId

from pyspark.sql.functions import col, broadcast, when

# Create a DataFrame of unique target_chembl_id_moa values
moa_df = drug_to_moa.select("target_chembl_id_moa").distinct().withColumnRenamed("target_chembl_id_moa", "moa_value")

# Join with the original DataFrame to check if target_chembl_id is in the list of moa_values
df_joined = drug_to_moa.join(broadcast(moa_df), drug_to_moa["target_chembl_id"] == moa_df["moa_value"], "left_outer")

# Add a new column targetInMoA based on the condition
drug_to_moa_targetInMoA = df_joined.withColumn("targetInMoA", 
                                           when(df_joined["target_chembl_id_moa"].isNull(), None)
                                           .otherwise(col("moa_value").isNotNull())
                                          ).select(drug_to_moa.columns + ["targetInMoA"])

# Show the result
drug_to_moa_targetInMoA.show()
drug_to_moa_targetInMoA.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|action_type_moa|target_chembl_id_moa|targetInMoA|
+----------------+

9341

In [42]:
# drug_to_moa_targetInMoA_filtered = drug_to_moa_targetInMoA.filter(drug_to_moa_targetInMoA["drugId"] == "CHEMBL487273")
# drug_to_moa_targetInMoA_filtered.show()

### Targets: Evidence type: sources + classification for GE, clinical_GE, probes (boolean)

In [168]:
# Irene's table
evidence_path = "gs://ot-team/irene/drug_to_target"
evidence = spark.read.parquet(evidence_path)
evidence.persist()
evidence.show()


+----------+---------+---------------+--------------------+------------------+-------------------+
|    drugId|uniprotId|       targetId|             sources|isHighQualityProbe|isTherapeuticTarget|
+----------+---------+---------------+--------------------+------------------+-------------------+
|CHEMBL1000|   O00167|ENSG00000064655|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O00555|ENSG00000141837|[uniprot_literatu...|             false|              false|
|CHEMBL1000|   O14633|ENSG00000159455|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   O60706|ENSG00000069431|            [chembl]|             false|               true|
|CHEMBL1000|   P00352|ENSG00000165092|[ot_genetics_portal]|             false|              false|
|CHEMBL1000|   P01567|ENSG00000214042|            [chembl]|             false|               true|
|CHEMBL1000|   P04155|ENSG00000160182|[ot_genetics_portal]|             false|              false|
|CHEMBL100

23/12/11 16:48:49 WARN CacheManager: Asked to cache already cached data.


In [158]:
def join_dataframes_by_many_cols(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_columns: list, 
                    second_key_columns: list,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_columns (list): The key column names in the initial DataFrame.
    second_key_columns (list): The key column names in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Ensure the key columns lists have the same length
    if len(initial_key_columns) != len(second_key_columns):
        raise ValueError("Key columns lists must be of the same length")

    # Selecting specified columns from the second DataFrame, including its key columns
    second_df_selected = second_df.select(second_key_columns + columns_to_join)

    # Build join condition
    join_condition = [initial_df[initial_col] == second_df_selected[second_col] 
                      for initial_col, second_col in zip(initial_key_columns, second_key_columns)]

    # Perform the left join
    joined_df = initial_df.join(second_df_selected, 
                                on=join_condition, 
                                how='left')

    # Drop the second key columns if not needed
    for col in second_key_columns:
        joined_df = joined_df.drop(second_df_selected[col])

    return joined_df


In [159]:
# Join by drugId and uniprotId = accession
evidence_list = ["targetId", "sources", "isHighQualityProbe", "isTherapeuticTarget"]

target_evidence = join_dataframes_by_many_cols(drug_to_moa_targetInMoA, 
                                            evidence, 
                                            ["drugId", "accession"], 
                                            ["drugId", "uniprotId"], 
                                             evidence_list).persist()
target_evidence.show()

23/12/11 16:35:49 WARN MemoryStore: Not enough space to cache broadcast_481 in memory! (computed 6.6 GiB so far)
23/12/11 16:35:49 WARN BlockManager: Persisting block broadcast_481 to disk instead.
23/12/11 16:36:25 WARN MemoryStore: Not enough space to cache broadcast_481 in memory! (computed 6.6 GiB so far)
[Stage 1927:>                                                       (0 + 1) / 1]

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median_pchembl_value|isActive|acti

                                                                                

In [160]:
# Classification of targets by genetic evidence support
from pyspark.sql.types import BooleanType

def is_ge(sources):
    undesired_lists = [
        ['chembl'],
        ['chemicalProbes'],
        ['chembl', 'chemicalProbes'],
        ['chemicalProbes', 'chembl']
    ]
    return not (sources in undesired_lists or sources is None)

def contains_chemical_probes(sources):
    return 'chemicalProbes' in sources if sources else False

# Register UDFs
is_ge_udf = udf(is_ge, BooleanType())
is_ge_clinical_udf = udf(is_ge_clinical, BooleanType())
contains_chemical_probes_udf = udf(contains_chemical_probes, BooleanType())

In [161]:
# Apply UDFs to create new columns
target_evidence_bool = target_evidence.withColumn("isGE", is_ge_udf("sources"))\
                        .withColumn("isGE_clinical", is_ge_clinical_udf("sources"))\
                        .withColumn("isProbe", contains_chemical_probes_udf("sources"))

target_evidence_bool.show()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

                                                                                

In [170]:
drug_active_only_probes = target_evidence_bool.filter(target_evidence_bool["isProbe"] == True).filter(target_evidence_bool["isActive"] == True)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

483

In [171]:
drug_active_only_probes = target_evidence_bool.filter(target_evidence_bool["isHighQualityProbe"] == True).filter(target_evidence_bool["isActive"] == True)
count_unique_values(drug_active_only_probes, "drugId")

                                                                                

131

In [164]:
count_unique_values(target_evidence_bool, "drugId")

                                                                                

2287

In [165]:
target_evidence_bool_clinical = target_evidence_bool.filter(target_evidence_bool["isApproved"] != "true")
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

596

In [166]:
target_evidence_bool_clinical = target_evidence_bool.filter(target_evidence_bool["isApproved"] == True)
# target_evidence_bool_clinical.show()
count_unique_values(target_evidence_bool_clinical, "drugId")

                                                                                

850

In [111]:
from pyspark.sql.functions import countDistinct

# Group by 'isApproved' and count distinct 'drugId's
drugId_count = target_evidence_bool.groupBy("isApproved").agg(countDistinct("drugId").alias("unique_drugId_count"))

# Show the result
drugId_count.show()




+----------+-------------------+
|isApproved|unique_drugId_count|
+----------+-------------------+
|      null|               1160|
|      true|               1318|
|     false|                895|
+----------+-------------------+



                                                                                

In [105]:
target_evidence_bool_probe_h = target_evidence_bool.filter(target_evidence_bool["isHighQualityProbe"] == True)
# target_evidence_bool_probe_h.show()
count_unique_values(target_evidence_bool_probe_h, "drugId")

187

In [106]:
target_evidence_bool_probe = target_evidence_bool.filter(target_evidence_bool["isProbe"] == True)
# target_evidence_bool_probe.show()
count_unique_values(target_evidence_bool_probe, "drugId")

                                                                                

695

In [175]:
# No targets in MoA
target_evidence_no_moa = target_evidence_bool\
                        .filter(target_evidence_bool["target_chembl_id_moa"].isNull())\
                        .filter(target_evidence_bool["IsActive"] == True)
show_unique_values_and_counts(target_evidence_no_moa.drop_duplicates(["drugId", "isApproved"]), "isApproved")

+----------+-----+
|isApproved|count|
+----------+-----+
|      null|  638|
|      true|  226|
|     false|  186|
+----------+-----+



In [176]:
# No targets in MoA for probes
target_evidence_no_moa_probes = target_evidence_bool\
                        .filter(target_evidence_bool["target_chembl_id_moa"].isNull())\
                        .filter(target_evidence_bool["IsActive"] == True)\
                        .filter(target_evidence_bool["isProbe"] == True)
count_unique_values(target_evidence_no_moa_probes, "drugId")

                                                                                

438

# Data coverage

### Drugs

In [None]:
# Filtering by:
#         1. max_phase ≠ 4 | max_phase = 4 | probes = TRUE
#         2. moa = NaN | moa ≠ NaN
#         3. drugActive = TRUE
#         4. GE = TRUE | clinical_GE = TRUE

In [None]:
# Number of clinical candidates/approved drugs/chemical probes for which:
#     1. there is no MoA and they are bioactive against some targets:
#         1. which have GE/GE+clinical evidence for any disease
#     2. there is MoA but they are bioactive against some other targets
#         1. which have GE/GE+clinical evidence for any disease

### Targets

In [177]:
# Dataset with only pchembl value activity
target_evidence_bool.show()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

In [179]:
show_unique_values_and_counts(target_evidence_bool, "drugType")

+---------------+-----+
|       drugType|count|
+---------------+-----+
| Small molecule| 9218|
|Oligosaccharide|    2|
|           null|    1|
|        Protein|  113|
|       Antibody|    4|
|        Unknown|   14|
+---------------+-----+



In [182]:
# Number of targets supported by GE which have active bioassays

targets_GE = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE"] == True)

count_unique_values(targets_GE, "target_chembl_id")

                                                                                

326

In [184]:
# Number of targets supported by GE_clinical which have active bioassays

targets_GE_clinical = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE_clinical"] == True)

count_unique_values(targets_GE_clinical, "target_chembl_id")

                                                                                

431

In [185]:
# Number of targets supported by GE which have active bioassays and not in MoA of the drug

targets_GE_noMoA = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE"] == True)\
                .filter(target_evidence_bool["targetInMoA"] != True)

count_unique_values(targets_GE_noMoA, "target_chembl_id")

                                                                                

115

In [186]:
# Number of targets supported by GE_clinical which have active bioassays and not in MoA of the drug

targets_GE_clinical_noMoA = target_evidence_bool\
                .filter(target_evidence_bool["IsActive"] == True)\
                .filter(target_evidence_bool["isGE_clinical"] == True)\
                .filter(target_evidence_bool["targetInMoA"] != True)

count_unique_values(targets_GE_clinical_noMoA, "target_chembl_id")

                                                                                

145

## Action type

In [190]:
# Taking table target_organism_filter (after pchembl filter)

target_evidence_bool.show()
target_evidence_bool.count()

+----------------+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+-----------------+--------------------+--------+---------------+--------------------+-----------+---------------+--------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|confidence_score|confidence_description|assay_category|   target_components|    target_type|accession|proteinClass|max_pchembl_value|median

9352

In [192]:
count_unique_values(target_evidence_bool, "drugId")

2287

In [191]:
show_unique_values_and_counts(target_evidence_bool.drop_duplicates(["drugId", "action_type_moa"]), "action_type_moa")

+--------------------+-----+
|     action_type_moa|count|
+--------------------+-----+
|  NEGATIVE MODULATOR|    1|
|NEGATIVE ALLOSTER...|    4|
|          ANTAGONIST|  166|
|                null| 1442|
|       BINDING AGENT|    1|
|     PARTIAL AGONIST|    8|
|           ACTIVATOR|    1|
|             BLOCKER|   42|
|    DISRUPTING AGENT|    4|
|           INHIBITOR|  491|
|           SUBSTRATE|    1|
|POSITIVE ALLOSTER...|    8|
|     CHELATING AGENT|    1|
|  POSITIVE MODULATOR|    4|
|             AGONIST|  117|
|              OPENER|    5|
|     INVERSE AGONIST|    4|
|     RELEASING AGENT|    1|
|           MODULATOR|    8|
|      REDUCING AGENT|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
show_unique_values_and_counts(drug_to_moa, "action_type")

+--------------------+------+
|         action_type| count|
+--------------------+------+
|                null|211437|
|{INHIBITOR, Negat...|   124|
|{SUBSTRATE, Carri...|    47|
|{ANTAGONIST, Bind...|    41|
|{INVERSE AGONIST,...|     2|
|{AGONIST, Binds t...|     4|
|{ACTIVATOR, Posit...|     1|
+--------------------+------+



### Drug-Target pairs

In [None]:
# With non pharmacological action
