In [15]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

In [None]:
spark = SparkSession.builder.getOrCreate()

# Data parsing

In [11]:
# Universal function which joins 2 spark dataframes

def join_dataframes(initial_df: DataFrame, 
                    second_df: DataFrame, 
                    initial_key_column: str, 
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Joins two PySpark DataFrames on specified key columns.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame.
    second_df (DataFrame): The second PySpark DataFrame to join with.
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): List of column names from the second DataFrame to include in the join.

    Returns:
    DataFrame: The resulting DataFrame after the join.
    """

    # Selecting specified columns from the second DataFrame, including its key column
    second_df_selected = second_df.select([second_key_column] + columns_to_join)

    # Performing the left join
    joined_df = initial_df.join(second_df_selected, 
                                initial_df[initial_key_column] == second_df_selected[second_key_column], 
                                how='left')

    # Drop the second key column if not needed
    joined_df = joined_df.drop(second_df_selected[second_key_column])

    return joined_df


In [35]:
# Function which calculates the number of unique values in the column and shows them

def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Count unique values in a specific column of a PySpark DataFrame.

    Args:
    df (DataFrame): The PySpark DataFrame.
    column_name (str): The name of the column to analyze.

    Returns:
    int: The number of unique values in the column.
    """
    # Get distinct values in the column and count them
    unique_count = df.select(column_name).distinct().count()

    return unique_count

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [6]:
# Take list of unique drugs (obtained from target with evidence in Platform or chemProbes)

drug_list_dir = "/Users/polina/Documents/Bioactivity/bioactivity-1/data/drug_to_target_unique_drugs.csv"
drug_list = spark.read.csv(drug_list_dir, header=True, inferSchema=True)
drug_list.show()

+-------------+
|       drugId|
+-------------+
|CHEMBL1200632|
|   CHEMBL1231|
|CHEMBL1233511|
|   CHEMBL1637|
|CHEMBL1743017|
| CHEMBL185885|
|CHEMBL1949708|
|CHEMBL2105675|
|CHEMBL2107826|
|CHEMBL2109673|
|CHEMBL2346976|
| CHEMBL279115|
|CHEMBL3181832|
|CHEMBL3545096|
|CHEMBL3545103|
|CHEMBL3545145|
|CHEMBL3545312|
| CHEMBL363648|
|CHEMBL3707249|
|CHEMBL3989766|
+-------------+
only showing top 20 rows



### For each drug find a max phase of clinical trial

In [None]:
molecule_path = "gs://open-targets-data-releases/23.12/output/etl/json/molecule"
molecule = spark.read.json(molecule_path)
molecule.persist()

In [None]:
# List of columns from molecule table
list_activity = ["max_phase", ???]
                 
# Join list of drugs and max_phase from molecule table
drug_list_phase = join_dataframes(drug_list, molecule, "drugId", "???", list_molecule).persist()
drug_list_phase.show()

### For each unique drug find bioactivity data from chembl_33_activity

In [7]:
activity_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_activity.jsonl"
activity = spark.read.json(activity_path)
activity.persist()
activity.show()

                                                                                

In [30]:
# List of columns from activity table
list_activity = ["assay_chembl_id",
                "assay_type",
                "action_type",
                "pchembl_value",
                "standard_type",
                "standard_units",
                "standard_value",
                "standard_relation",
                "target_organism",
                "target_pref_name",
                "target_chembl_id"]
                # "standard_flag",
                # "ligand_efficiency",
                # "assay_variant_mutation"
                # "assay_variant_accession",
                # "data_validity_comment",
                # "data_validity_description"]

In [34]:
# Join list of drugs and chembl_33_activity
drug_to_activity = join_dataframes(drug_list_phase, activity, "drugId", "molecule_chembl_id", list_activity).filter(col("assay_chembl_id").isNotNull()).persist()
drug_to_activity.show()

+-------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+
|       drugId|assay_chembl_id|assay_type|action_type|pchembl_value|standard_type|standard_units|standard_value|standard_relation|
+-------------+---------------+----------+-----------+-------------+-------------+--------------+--------------+-----------------+
|    CHEMBL106|   CHEMBL958797|         F|       NULL|         NULL|        MIC80|       ug.mL-1|           2.0|                =|
|   CHEMBL1043|   CHEMBL926687|         A|       NULL|         NULL|            F|             %|          93.0|                =|
|    CHEMBL184|   CHEMBL700641|         A|       NULL|         NULL|         IC50|            nM|      100000.0|                >|
|   CHEMBL1294|   CHEMBL860948|         A|       NULL|         7.70|         IC50|            nM|          20.0|                =|
| CHEMBL288441|  CHEMBL1244343|         B|       NULL|         7.04|         IC50| 

In [36]:
# Calculate for how many drugs we have biodata
drug_list_count = count_unique_values(drug_list, 'drugId')
drug_to_activity_count = count_unique_values(drug_to_activity, 'drugId')

print("Number of unique drugs from targets dataset: ", drug_list_count)
print("Number of unique drugs with any bioactivities: ", drug_to_activity_count)



Number of unique drugs from targets dataset:  12835
Number of unique drugs with any bioactivities:  6688


                                                                                

### For each bioactivity assay find parameters from chembl_33_assay

In [43]:
assay_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_assay.jsonl"
assay = spark.read.json(assay_path)
assay.persist()
assay.show()

                                                                                

In [46]:
# List of columns from assay table
list_assay = ["confidence_score",
            "confidence_description",
            "assay_category"]

In [48]:
drug_to_assay = join_dataframes(drug_to_activity, assay, "assay_chembl_id", "assay_chembl_id", list_activity).persist()
drug_to_assay.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `assay_chembl_id` cannot be resolved. Did you mean one of the following? [`assay_category`, `assay_organism`, `assay_type`, `assay_test_type`, `_metadata`].;
'Project ['assay_chembl_id, 'assay_chembl_id, assay_type#1964, 'action_type, 'pchembl_value, 'standard_type, 'standard_units, 'standard_value, 'standard_relation]
+- Relation [_metadata#1960,assay_category#1961,assay_organism#1962,assay_test_type#1963,assay_type#1964,confidence_description#1965,confidence_score#1966L,variant_sequence#1967] json


### Protein classification

In [None]:
# Target chembl_id to accession (uniprot_id)
target_path = "gs://open-targets-data-releases/23.12/output/etl/json/target"
target = spark.read.json(target_path)
target.persist()
target.show()

In [None]:
# List of columns from target table
list_target = ["???"]

In [None]:
target_to_uniprot = join_dataframes(drug_to_assay, target, "???", "???", list_target).persist()
target_to_uniprot.show()

In [None]:
# Mapping protein calssification from chembl_33_target_component
component_path = "gs://open-targets-pre-data-releases/chembl-columns/chembl-inputs/chembl_33_target_component.jsonl"
component = spark.read.json(component_path)
component.persist()
component.show()

In [None]:
# List of columns from component table
list_component = ["protein_classifications",
            "component_type",
            "component_id",
            "description"]

In [None]:
uniprot_to_class = join_dataframes(target_to_uniprot, component, "???", "???", list_component).persist()
uniprot_to_class.show()

# Data processing

### Drug-target pairs

In [None]:
# Target is in MoA of a drug (targetInMoA, boolean)

### Drugs

In [None]:
# List of targets for drugs from curated MoA (MoA, list)

### Targets

In [None]:
# Evidence type: sources + classification for GE, clinical_GE, probes (boolean)

## Assay type filtering

In [None]:
# assay_type ≠ P, U

# confidence_score = 9,7

# assay_organism = human


## Activity threshold

In [None]:
# Where pchembl_value is available for each T-D pairs make new columns:
#     1. max_pchembl_value
#     2. median_pchembl_value

In [None]:
# Where pchembl_value is not available for each T-D pairs make new columns:
#     1. Calculate how much data is this
#     2. Think about what to do with different standard_units
#     3. Ideally:
#         1. max_standard_value_n
#         2. median_standard_value_n
#         3. Cutoff for every n

In [None]:
# Make column with activity of molecule: drugActive = TRUE/FALSE
#     1. based protein type and:
#         1. max_pchembl_value
#         2. median_pchembl_value
#     2. based on cutoffs for other experiment types

# Data coverage

### Drugs

In [None]:
# Filtering by:
#         1. max_phase ≠ 4 | max_phase = 4 | probes = TRUE
#         2. moa = NaN | moa ≠ NaN
#         3. drugActive = TRUE
#         4. GE = TRUE | clinical_GE = TRUE

In [None]:
# Number of clinical candidates/approved drugs/chemical probes for which:
#     1. there is no MoA and they are bioactive against some targets:
#         1. which have GE/GE+clinical evidence for any disease
#     2. there is MoA but they are bioactive against some other targets
#         1. which have GE/GE+clinical evidence for any disease

### Targets

In [None]:
# For which action type is avaliable

### Drug-Target pairs

In [None]:
# With non pharmacological action
