In [105]:
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [106]:
# Reuse the active Spark session if there is one, otherwise create it
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Data parsing

In [107]:
def join_dataframes(initial_df: DataFrame,
                    second_df: DataFrame,
                    initial_key_column: str,
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Left-joins selected columns of a second PySpark DataFrame onto an initial one.

    Every row of ``initial_df`` is kept. Rows without a match in ``second_df``
    get nulls in the joined columns, and a row of ``initial_df`` is repeated once
    per matching row of ``second_df``. The key column of ``second_df`` is removed
    from the result, so only the key of ``initial_df`` remains.

    Args:
    initial_df (DataFrame): The initial (left) PySpark DataFrame.
    second_df (DataFrame): The second (right) PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): Column names from the second DataFrame to add. A tuple
        or a single column name is accepted as well; repeated names and the key
        column itself are only selected once.

    Returns:
    DataFrame: ``initial_df`` with the requested columns of ``second_df`` appended.

    Note:
    Names listed in ``columns_to_join`` that already exist in ``initial_df`` are
    not renamed, so the result then contains two columns with the same name.
    """
    # A bare string would otherwise be unpacked character by character below
    if isinstance(columns_to_join, str):
        columns_to_join = [columns_to_join]

    # dict.fromkeys drops repeats while keeping order; selecting the key twice
    # would make the key reference used in the join condition ambiguous
    selected_columns = list(dict.fromkeys([second_key_column, *columns_to_join]))
    second_df_selected = second_df.select(selected_columns)

    # Left join: keep every row of the initial DataFrame
    join_condition = initial_df[initial_key_column] == second_df_selected[second_key_column]
    joined_df = initial_df.join(second_df_selected, join_condition, how='left')

    # Drop the right-hand key by column reference so an identically named key
    # on the left-hand side is left in place
    return joined_df.drop(second_df_selected[second_key_column])


In [108]:
def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Return how many distinct values one column of a PySpark DataFrame holds.

    Args:
    df (DataFrame): The PySpark DataFrame to inspect.
    column_name (str): The column whose distinct values are counted.

    Returns:
    int: Number of distinct values found in ``column_name``.
    """
    # Project down to the single column first, then de-duplicate it
    distinct_values = df.select(column_name).distinct()

    # count() is the action that actually runs the job
    return distinct_values.count()

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [109]:
def show_unique_values_and_counts(df: DataFrame, column_name: str):
    """
    Print each distinct value of a column together with its row count.

    Parameters:
    df (DataFrame): The Spark DataFrame to analyze.
    column_name (str): The column to group by and count.

    Raises:
    ValueError: If ``column_name`` is not one of the DataFrame's columns.
    """
    column_exists = column_name in df.columns
    if not column_exists:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    # One output row per distinct value; show() prints the table and returns nothing
    df.groupBy(column_name).count().show()


In [110]:
# Take list of unique drugs (obtained from target with evidence in Platform or chemProbes)
# The CSV is read with pandas (imported in the notebook's import cell) and then
# converted into a Spark DataFrame.
unique_drugs_pd_df = pd.read_csv("../data/drug_to_target_unique_drugs.csv")
drug_list = spark.createDataFrame(unique_drugs_pd_df)
drug_list.show()

+-------------+
|       drugId|
+-------------+
|CHEMBL1200632|
|   CHEMBL1231|
|CHEMBL1233511|
|   CHEMBL1637|
|CHEMBL1743017|
| CHEMBL185885|
|CHEMBL1949708|
|CHEMBL2105675|
|CHEMBL2107826|
|CHEMBL2109673|
|CHEMBL2346976|
| CHEMBL279115|
|CHEMBL3181832|
|CHEMBL3545096|
|CHEMBL3545103|
|CHEMBL3545145|
|CHEMBL3545312|
| CHEMBL363648|
|CHEMBL3707249|
|CHEMBL3989766|
+-------------+
only showing top 20 rows



### For each drug find a max phase of clinical trial

In [111]:
# Open Targets molecule dataset (JSON); cached because a later cell joins against it
molecule_path = "gs://open-targets-data-releases/23.12/output/etl/json/molecule"
molecule = spark.read.json(molecule_path).persist()
molecule.show()



+---------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+-------------+--------------------+----------+--------------------+--------------------+-------------------------+--------------------+-------------+--------------------+--------------------+-------------------+
+---------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+-------------+--------------------+----------+--------------------+--------------------+-------------------------+--------------------+-------------+--------------------+--------------------+-------------------+
|          false|Cc1cc(CN2CCN(c3c(...|                null|                null|Small molecule drug.|Small molecule|           false|CHEMBL1086582|UUGWPYPNRZQDFO-UH...|      null|                null|                null|                     null|       CHEMBL1086582|         null|                  []|       

23/12/08 14:32:08 WARN CacheManager: Asked to cache already cached data.        


In [112]:
# Molecule-table columns to attach to every drug
list_molecule = [
    "drugType",
    "maximumClinicalTrialPhase",
    "isApproved",
    "linkedTargets",
    "linkedDiseases",
]

# Left-join the drug list to the molecule table (drugId -> id) and cache the result
drug_list_phase = join_dataframes(
    drug_list,
    molecule,
    "drugId",
    "id",
    list_molecule,
).persist()
drug_list_phase.show()

+-------------+--------------+-------------------------+----------+--------------------+--------------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+
|CHEMBL1200632|Small molecule|                      4.0|      true|             {0, []}|{3, [EFO_0003894,...|
|   CHEMBL1231|Small molecule|                      4.0|      true|{2, [ENSG00000133...|{15, [EFO_1000781...|
|CHEMBL1233511|Small molecule|                      3.0|     false|                null|  {1, [EFO_0002950]}|
|   CHEMBL1637|Small molecule|                      4.0|      true|{14, [ENSG0000010...|{54, [MONDO_00081...|
|CHEMBL1743017|      Antibody|                      2.0|     false|{1, [ENSG00000127...|{3, [EFO_0000676,...|
| CHEMBL185885|Small molecule|                      2.0|     false|                null|{2, [EFO_0009444,...|
|CHEMBL194

### For each unique drug find bioactivity data from chembl_33_activity

In [113]:
# ChEMBL 33 activity records (JSON lines); cached because a later cell joins against them
activity_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_activity.jsonl"
activity = spark.read.json(activity_path).persist()
activity.show()



+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+------------------+-------------+-------------+-----------------+-------------------+-------------+--------------+--------------+----------------+--------------------+--------------------+
|           _metadata|action_type|assay_chembl_id|assay_type|assay_variant_accession|assay_variant_mutation|data_validity_comment|data_validity_description|document_chembl_id|document_year|ligand_efficiency|molecule_chembl_id|pchembl_value|standard_flag|standard_relation|standard_text_value|standard_type|standard_units|standard_value|target_chembl_id|     target_organism|    target_pref_name|
+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+-

23/12/08 14:32:20 WARN CacheManager: Asked to cache already cached data.        


In [114]:
# Activity-table columns carried over into the drug-level table
list_activity = [
    "assay_chembl_id",
    "assay_type",
    "action_type",
    "pchembl_value",
    "standard_type",
    "standard_units",
    "standard_value",
    "standard_relation",
    "target_organism",
    "target_pref_name",
    "target_chembl_id",
]
# Activity columns currently left out:
#   standard_flag, ligand_efficiency, assay_variant_mutation,
#   assay_variant_accession, data_validity_comment, data_validity_description

In [115]:
# Attach chembl_33_activity records to each drug, then keep only the rows
# that ended up with a non-null assay_chembl_id
drug_to_activity = (
    join_dataframes(
        drug_list_phase,
        activity,
        "drugId",
        "molecule_chembl_id",
        list_activity,
    )
    .filter(col("assay_chembl_id").isNotNull())
    .persist()
)
drug_to_activity.show()



+------------+--------------+-------------------------+----------+-------------+------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+
|      drugId|      drugType|maximumClinicalTrialPhase|isApproved|linkedTargets|    linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|     target_organism|    target_pref_name|target_chembl_id|
+------------+--------------+-------------------------+----------+-------------+------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+
|CHEMBL110739|Small molecule|                      3.0|     false|         null|{1, [EFO_1000786]}|  CHEMBL1613803|         F|       null|         7.20|      Potency|            

                                                                                

In [116]:
# Compare the number of distinct drugs we started with against the number
# that still have at least one bioactivity row
drug_list_count = count_unique_values(df=drug_list, column_name='drugId')
drug_to_activity_count = count_unique_values(df=drug_to_activity, column_name='drugId')

print("Number of unique drugs from targets dataset: ", drug_list_count)
print("Number of unique drugs with any bioactivities: ", drug_to_activity_count)



Number of unique drugs from targets dataset:  12835
Number of unique drugs with any bioactivities:  6215


                                                                                

### For each bioactivity assay find parameters from chembl_33_assay

In [117]:
# ChEMBL 33 assay records (JSON lines); cached because a later cell joins against them
assay_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_assay.jsonl"
assay = spark.read.json(assay_path).persist()
assay.show()



+--------------------+--------------+---------------+--------------------+---------------+----------+----------------------+----------------+----------------+
|           _metadata|assay_category|assay_chembl_id|      assay_organism|assay_test_type|assay_type|confidence_description|confidence_score|variant_sequence|
+--------------------+--------------+---------------+--------------------+---------------+----------+----------------------+----------------+----------------+
|{{0 - Default val...|          null|   CHEMBL688540|                null|           null|         F|  Default value - T...|               0|            null|
|{{8 - Homologous ...|  confirmatory|  CHEMBL2114731|Mycobacterium tub...|           null|         F|  Homologous single...|               8|            null|
|{{8 - Homologous ...|          null|   CHEMBL643834|                null|           null|         B|  Homologous single...|               8|            null|
|{{8 - Homologous ...|  confirmatory|  CHEMBL2

23/12/08 14:32:29 WARN CacheManager: Asked to cache already cached data.        


In [118]:
# Assay-table columns to attach to every bioactivity row
list_assay = [
    "confidence_score",
    "confidence_description",
    "assay_category",
]

In [119]:
# Add the assay-level columns to every drug-activity row (matched on assay_chembl_id)
drug_to_assay = join_dataframes(
    initial_df=drug_to_activity,
    second_df=assay,
    initial_key_column="assay_chembl_id",
    second_key_column="assay_chembl_id",
    columns_to_join=list_assay,
).persist()
drug_to_assay.show()

[Stage 256:>                                                        (0 + 1) / 1]

+------------+--------------+-------------------------+----------+-------------+------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+----------------+----------------------+--------------+
|      drugId|      drugType|maximumClinicalTrialPhase|isApproved|linkedTargets|    linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|     target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|
+------------+--------------+-------------------------+----------+-------------+------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+----------------+----------------------+--------------+
|CHEMBL110739

                                                                                

In [120]:
# Row count of the drug-activity table after the assay columns were joined on
drug_to_assay.count()

                                                                                

168729

### Protein classification

#### Temporary solution: get accession ID from Barbara's table

In [122]:
# Get accession for target_chembl_id from Barbara's table

barbara_table_path = pd.read_csv("../data/drug2target_bioactivities_chembl_33_grouped.csv")
columns = ['accession', 'target_chembl_id']
selected_columns_pd = barbara_table_path[columns]
# Stringify the values but keep missing cells as real nulls: a plain .astype(str)
# turns NaN into the literal string "nan", which Spark would treat as a value.
filtered_table = (
    selected_columns_pd.astype(str)
    .astype(object)
    .where(selected_columns_pd.notna(), None)
    .drop_duplicates()
)
# Explicit string schema so type inference cannot fail on a column that is entirely null
barbara_table = spark.createDataFrame(
    filtered_table,
    schema="accession string, target_chembl_id string",
)
barbara_table.show()


+---------+----------------+
|accession|target_chembl_id|
+---------+----------------+
|   Q96FL8|   CHEMBL1743126|
|   P08684|       CHEMBL340|
|   Q12809|       CHEMBL240|
|   Q9Y6L6|   CHEMBL1697668|
|   P35367|       CHEMBL231|
|   P02763|      CHEMBL4285|
|   P02768|      CHEMBL3253|
|   Q02763|      CHEMBL4128|
|   O94956|   CHEMBL1743124|
|   Q9NPD5|   CHEMBL1743121|
|   O15245|      CHEMBL5685|
|   P07550|       CHEMBL210|
|   P03372|       CHEMBL206|
|   Q92731|       CHEMBL242|
|   Q9HA47|      CHEMBL5682|
|   O75762|      CHEMBL6007|
|   O43868|      CHEMBL5780|
|   Q96SW2|   CHEMBL3763008|
|   P32320|      CHEMBL4502|
|   P27708|      CHEMBL3093|
+---------+----------------+
only showing top 20 rows



In [123]:
# Map every target_chembl_id to its accession using Barbara's table
list_protein = ["accession"]

target_to_uniprot = join_dataframes(
    initial_df=drug_to_assay,
    second_df=barbara_table,
    initial_key_column="target_chembl_id",
    second_key_column="target_chembl_id",
    columns_to_join=list_protein,
).persist()

# Rows without an accession are not filtered out here (a filter on the
# accession column used to sit at this point and is disabled).
target_to_uniprot.show()



+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+----------------+----------------------+--------------+---------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|     target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|accession|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+--------------------+--------------------+----------------+----------------+--

                                                                                

In [124]:
# Row count after the accession join; the left join adds rows whenever a
# target_chembl_id matches more than one row of barbara_table
target_to_uniprot.count()

                                                                                

171911

In [125]:
# Protein classification by uniprot from SwissProt

proteinclass_path = pd.read_csv("../data/uniprot2family.csv")
# Stringify the values but keep missing cells as real nulls: a plain .astype(str)
# turns NaN into the literal string "nan", which Spark would treat as a value.
# Text that is present in the file is left unchanged.
proteinclass_str = (
    proteinclass_path.astype(str)
    .astype(object)
    .where(proteinclass_path.notna(), None)
    .drop_duplicates()
)
# Explicit all-string schema so type inference cannot fail on a column that is entirely null
proteinclass = spark.createDataFrame(
    proteinclass_str,
    schema=", ".join(f"`{name}` string" for name in proteinclass_str.columns),
)
proteinclass.show()

+---------+------------+
|accession|proteinClass|
+---------+------------+
|   P32929|      Enzyme|
|   A4D0Y5|        None|
|   Q49A92|        None|
|   Q9UFW8|        None|
|   Q96K31|        None|
|   O14646|  Epigenetic|
|   Q8IWX8|        None|
|   Q99653|        None|
|   O94983|          TF|
|   Q8NA66|        None|
|   Q96M20|        None|
|   Q86VU5|      Enzyme|
|   P42695|        None|
|   Q8IYT2|      Enzyme|
|   Q9NSA3|        None|
|   Q96KP4|      Enzyme|
|   Q13956|      Enzyme|
|   O95476|      Enzyme|
|   Q9BYD5|        None|
|   Q969H4|      Enzyme|
+---------+------------+
only showing top 20 rows



In [126]:
# Attach the protein class of each accession
proteinclass_list = ["proteinClass"]
uniprot_to_class = join_dataframes(
    initial_df=target_to_uniprot,
    second_df=proteinclass,
    initial_key_column="accession",
    second_key_column="accession",
    columns_to_join=proteinclass_list,
).persist()
uniprot_to_class.show()



+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-----------------+--------------+--------------+-----------------+---------------+--------------------+----------------+----------------+----------------------+--------------+---------+------------+
|       drugId|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|pchembl_value|    standard_type|standard_units|standard_value|standard_relation|target_organism|    target_pref_name|target_chembl_id|confidence_score|confidence_description|assay_category|accession|proteinClass|
+-------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+-------------+-----------------+--------------+--------------+-----------------+---------------+--------------------+-------------

                                                                                

In [127]:
# Row count per proteinClass value; rows left without a class by the left join
# are grouped under null
show_unique_values_and_counts(uniprot_to_class, 'proteinClass')



+--------------+------+
|  proteinClass| count|
+--------------+------+
|        Enzyme|  8636|
|          None|  3936|
|          GPCR| 13056|
|        Kinase| 14621|
|            TF|   226|
|            IC|  1295|
|            NR|  1898|
|    Epigenetic|  1417|
|          null|125266|
|   Transporter|  1528|
|TF; Epigenetic|    32|
+--------------+------+



                                                                                

In [128]:
# Row count after the protein-class join
uniprot_to_class.count()

171911

In [129]:
# uniprot_to_class.write.parquet("data/analysis/v2/uniprot_to_class_temp_v1")

# Data processing

### Drug-target pairs: Target is in MoA of a drug (targetInMoA, boolean)

In [140]:
# Open Targets targets dataset (JSON), loaded to map accession to Ensembl
target_path = "gs://open-targets-data-releases/23.12/output/etl/json/targets"
target = spark.read.json(target_path).persist()
target.show()




+----------------+--------------------+--------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+--------------------+--------------------+
|alternativeGenes|        approvedName|approvedSymbol|       biotype|      canonicalExons| canonicalTranscript|      chemicalProbes|          constraint|             dbXrefs|functionDescriptions|     genomicLocation|                  go|           hallmarks|          homologues|             id|        nameSynonyms|       obsoleteNames|     obsoleteSymbols|            pathways|          proteinIds|   safetyLiabilities|subcellularLocations|     

23/12/08 14:36:37 WARN CacheManager: Asked to cache already cached data.        


### Targets

In [131]:
# Evidence type: sources + classification for GE, clinical_GE, probes (boolean)



## Assay type filtering

In [132]:
# assay_type ≠ P, U

# confidence_score = 9,7

# assay_organism = human


## Activity threshold

In [133]:
# Where pchembl_value is available for a T-D pair, make new columns:
#     1. max_pchembl_value
#     2. median_pchembl_value

In [134]:
# Where pchembl_value is not available for a T-D pair:
#     1. Calculate how much data is this
#     2. Think about what to do with different standard_units
#     3. Ideally:
#         1. max_standard_value_n
#         2. median_standard_value_n
#         3. Cutoff for every n

In [135]:
# Make column with activity of molecule: drugActive = TRUE/FALSE
#     1. based on protein type and:
#         1. max_pchembl_value
#         2. median_pchembl_value
#     2. based on cutoffs for other experiment types

# Data coverage

### Drugs

In [136]:
# Filtering by:
#         1. max_phase ≠ 4 | max_phase = 4 | probes = TRUE
#         2. moa = NaN | moa ≠ NaN
#         3. drugActive = TRUE
#         4. GE = TRUE | clinical_GE = TRUE

In [137]:
# Number of clinical candidates/approved drugs/chemical probes for which:
#     1. there is no MoA and they are bioactive against some targets:
#         1. which have GE/GE+clinical evidence for any disease
#     2. there is MoA but they are bioactive against some other targets
#         1. which have GE/GE+clinical evidence for any disease

### Targets

In [138]:
# For which action type is available

### Drug-Target pairs

In [139]:
# With non pharmacological action
