In [15]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

In [None]:
spark = SparkSession.builder.getOrCreate()

# Data parsing

In [6]:
# Take list of unique drugs (obtained from target with evidence in Platform or chemProbes)

drug_list_dir = "/Users/polina/Documents/Bioactivity/bioactivity-1/data/drug_to_target_unique_drugs.csv"
drug_list = spark.read.csv(drug_list_dir, header=True, inferSchema=True)
drug_list.show()

+-------------+
|       drugId|
+-------------+
|CHEMBL1200632|
|   CHEMBL1231|
|CHEMBL1233511|
|   CHEMBL1637|
|CHEMBL1743017|
| CHEMBL185885|
|CHEMBL1949708|
|CHEMBL2105675|
|CHEMBL2107826|
|CHEMBL2109673|
|CHEMBL2346976|
| CHEMBL279115|
|CHEMBL3181832|
|CHEMBL3545096|
|CHEMBL3545103|
|CHEMBL3545145|
|CHEMBL3545312|
| CHEMBL363648|
|CHEMBL3707249|
|CHEMBL3989766|
+-------------+
only showing top 20 rows



### For each unique drug find bioactivity data from chembl_33_activity

In [7]:
activity_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_activity.jsonl"
activity = spark.read.json(activity_path)

                                                                                

In [42]:
activity.persist()

23/12/06 17:11:01 WARN CacheManager: Asked to cache already cached data.


DataFrame[_metadata: struct<assay_data:struct<assay_cell_type:string,assay_organism:string,assay_parameters:array<struct<active:bigint,comments:string,relation:string,standard_relation:string,standard_text_value:string,standard_type:string,standard_type_fixed:bigint,standard_units:string,standard_value:string,text_value:string,type:string,units:string,value:string>>,assay_strain:string,assay_subcellular_fraction:string,assay_tax_id:bigint,assay_tissue:string,assay_type:string,cell_chembl_id:string,src_desc:string,src_id:bigint,tissue_chembl_id:string,type_label:string>,document_data:struct<first_page:string,pubmed_id:bigint,volume:string,year:bigint>,organism_taxonomy:struct<l1:string,l2:string,l3:string,oc_id:bigint,tax_id:bigint>,parent_molecule_data:struct<alogp:string,compound_key:string,full_mwt:string,image_file:string,max_phase:string,max_phase_label:string,num_ro5_violations:bigint,psa:string>,protein_classification:array<struct<l1:string,l1_definition:string,l1_desc:string,l1_

In [18]:
activity.show()

+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+------------------+-------------+-------------+-----------------+-------------------+-------------+--------------+--------------+----------------+--------------------+--------------------+
|           _metadata|action_type|assay_chembl_id|assay_type|assay_variant_accession|assay_variant_mutation|data_validity_comment|data_validity_description|document_chembl_id|document_year|ligand_efficiency|molecule_chembl_id|pchembl_value|standard_flag|standard_relation|standard_text_value|standard_type|standard_units|standard_value|target_chembl_id|     target_organism|    target_pref_name|
+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+-

In [11]:
# Universal function which joins 2 spark dataframes

def join_dataframes(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_column: str, 
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Selecting specified columns from the second DataFrame, including its key column
    second_df_selected = second_df.select([second_key_column] + columns_to_join)

    # Performing the left join
    joined_df = initial_df.join(second_df_selected, 
                                initial_df[initial_key_column] == second_df_selected[second_key_column], 
                                how='left')

    # Drop the second key column if not needed
    joined_df = joined_df.drop(second_df_selected[second_key_column])

    return joined_df


In [30]:
# List of columns needed
list_activity = ["assay_chembl_id",
                "assay_type",
                "action_type",
                "pchembl_value",
                "standard_type",
                "standard_units",
                "standard_value",
                "standard_relation",
                "target_organism",
                "target_pref_name",
                "target_chembl_id"]
                # "standard_flag",
                # "ligand_efficiency",
                # "assay_variant_mutation"
                # "assay_variant_accession",
                # "data_validity_comment",
                # "data_validity_description"]

In [34]:
# Join list of drugs and chembl_33_activity
drug_to_activity = join_dataframes(drug_list, activity, "drugId", "molecule_chembl_id", list_activity).filter(col("assay_chembl_id").isNotNull())
drug_to_activity.show()

+-------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+
|       drugId|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|
+-------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+
|    CHEMBL106|   CHEMBL958797|         F|       NULL|         NULL|        MIC80|       ug.mL-1|           2.0|                =|
|   CHEMBL1043|   CHEMBL926687|         A|       NULL|         NULL|            F|             %|          93.0|                =|
|    CHEMBL184|   CHEMBL700641|         A|       NULL|         NULL|         IC50|            nM|      100000.0|                >|
|   CHEMBL1294|   CHEMBL860948|         A|       NULL|         7.70|         IC50|            nM|          20.0|                =|
| CHEMBL288441|  CHEMBL1244343|         B|       NULL|         7.04|         IC50| 

In [49]:
drug_to_activity.persist()

DataFrame[drugId: string, assay_chembl_id: string, assay_type: string, action_type: struct<action_type:string,description:string,parent_type:string>, pchembl_value: string, standard_type: string, standard_units: string, standard_value: string, standard_relation: string]

In [35]:
# Function which calculates the number of unique values in the column and shows them

def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Count unique values in a specific column of a PySpark DataFrame.

    Args:
    df (DataFrame): The PySpark DataFrame.
    column_name (str): The name of the column to analyze.

    Returns:
    int: The number of unique values in the column.
    """
    # Get distinct values in the column and count them
    unique_count = df.select(column_name).distinct().count()

    return unique_count

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [36]:
# Calculate for how many drugs we have biodata
drug_list_count = count_unique_values(drug_list, 'drugId')
drug_to_activity_count = count_unique_values(drug_to_activity, 'drugId')

print("Number of unique drugs from targets dataset: ", drug_list_count)
print("Number of unique drugs with any bioactivities: ", drug_to_activity_count)



Number of unique drugs from targets dataset:  12835
Number of unique drugs with any bioactivities:  6688


                                                                                

### For each bioactivity assay find parameters from chembl_33_assay

In [43]:
assay_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_assay.jsonl"
assay = spark.read.json(assay_path)

                                                                                

In [44]:
assay.persist()

DataFrame[_metadata: struct<assay_generated:struct<confidence_label:string,relationship_label:string,type_label:string>,document_data:struct<doi:string,first_page:string,journal:string,last_page:string,pubmed_id:bigint,title:string,volume:string,year:bigint>,es_completion:array<struct<input:string,weight:bigint>>,organism_taxonomy:struct<l1:string,l2:string,l3:string,oc_id:bigint,tax_id:bigint>,related_activities:struct<count:bigint>,related_compounds:struct<all_chembl_ids:string,count:bigint>,related_documents:struct<all_chembl_ids:string,count:bigint>,related_targets:struct<all_chembl_ids:string,count:bigint>,source:struct<src_description:string,src_id:bigint,src_short_name:string>>, assay_category: string, assay_organism: string, assay_test_type: string, assay_type: string, confidence_description: string, confidence_score: bigint, variant_sequence: struct<accession:string,isoform:bigint,mutation:string,organism:string,sequence:string,tax_id:bigint,version:bigint>]

In [47]:
assay.show()

+--------------------+--------------+--------------------+---------------+----------+----------------------+----------------+----------------+
|           _metadata|assay_category|      assay_organism|assay_test_type|assay_type|confidence_description|confidence_score|variant_sequence|
+--------------------+--------------+--------------------+---------------+----------+----------------------+----------------+----------------+
|{{0 - Default val...|          NULL|                NULL|           NULL|         F|  Default value - T...|               0|            NULL|
|{{8 - Homologous ...|  confirmatory|Mycobacterium tub...|           NULL|         F|  Homologous single...|               8|            NULL|
|{{8 - Homologous ...|          NULL|                NULL|           NULL|         B|  Homologous single...|               8|            NULL|
|{{8 - Homologous ...|  confirmatory|                NULL|           NULL|         F|  Homologous single...|               8|            NULL|

In [46]:
list_assay = ["confidence_score",
            "confidence_description",
            "assay_category"]

In [48]:
drug_to_assay = join_dataframes(drug_to_activity, assay, "assay_chembl_id", "assay_chembl_id", list_activity)
drug_to_assay.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `assay_chembl_id` cannot be resolved. Did you mean one of the following? [`assay_category`, `assay_organism`, `assay_type`, `assay_test_type`, `_metadata`].;
'Project ['assay_chembl_id, 'assay_chembl_id, assay_type#1964, 'action_type, 'pchembl_value, 'standard_type, 'standard_units, 'standard_value, 'standard_relation]
+- Relation [_metadata#1960,assay_category#1961,assay_organism#1962,assay_test_type#1963,assay_type#1964,confidence_description#1965,confidence_score#1966L,variant_sequence#1967] json


In [None]:
molecule_path = "gs://open-targets-data-releases/23.12/output/etl/json/molecule"
molecule = spark.read.json(molecule_path)

                                                                                

+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+------------------+-------------+-------------+-----------------+-------------------+-------------+--------------+--------------+----------------+--------------------+--------------------+
|           _metadata|action_type|assay_chembl_id|assay_type|assay_variant_accession|assay_variant_mutation|data_validity_comment|data_validity_description|document_chembl_id|document_year|ligand_efficiency|molecule_chembl_id|pchembl_value|standard_flag|standard_relation|standard_text_value|standard_type|standard_units|standard_value|target_chembl_id|     target_organism|    target_pref_name|
+--------------------+-----------+---------------+----------+-----------------------+----------------------+---------------------+-------------------------+------------------+-------------+-----------------+-