In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import Row
from pyspark.sql.functions import col, broadcast, when, max, expr, collect_list, concat_ws, array_contains, split
from pyspark.sql.types import BooleanType

In [3]:
# Reuse the active SparkSession if one already exists; otherwise start a new one
# with the builder's default configuration.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/18 14:26:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def count_unique_values(df: DataFrame, column_name: str) -> int:
    """
    Return how many distinct values one column of a PySpark DataFrame holds.

    Args:
    df (DataFrame): The PySpark DataFrame to inspect.
    column_name (str): Name of the column whose distinct values are counted.

    Returns:
    int: Number of distinct values found in ``column_name``.
    """
    # Project down to the single column first so only its values are de-duplicated.
    single_column = df.select(column_name)
    distinct_values = single_column.distinct()

    # count() is the action that actually produces the number.
    return distinct_values.count()

# Example usage
# unique_count = count_unique_values(your_dataframe, 'your_column_name')
# print(f"Number of unique values: {unique_count}")


In [5]:
def show_unique_values_and_counts(df: DataFrame, column_name: str):
    """
    Print every distinct value of a column together with its row count.

    Parameters:
    df (DataFrame): The Spark DataFrame to analyze.
    column_name (str): The column whose values are grouped and counted.

    Raises:
    ValueError: If ``column_name`` is not listed in ``df.columns``.
    """
    column_exists = column_name in df.columns
    if not column_exists:
        raise ValueError(f"Column {column_name} not found in DataFrame")

    # One result row per distinct value; show() prints the table, nothing is returned.
    df.groupBy(column_name).count().show()


In [6]:
# Bioactivity data from ChEMBL filtered (exact protein/homolog, assay type != P or U, human targets)
# NOTE(review): the variable name `input` shadows Python's built-in input() for the rest of
# the notebook. Later cells reference this name, so it is left unchanged here.
input_path = "gs://ot-team/polina/target_evidence_bool"
input = spark.read.parquet(input_path)

                                                                                

In [4]:
# Number of rows in the loaded dataset (count() is an action, so this triggers a Spark job).
input.count()

                                                                                

4656

In [7]:
# Mark the DataFrame for caching so the filters below reuse it instead of re-reading the
# parquet files. persist() is lazy: the cache is filled by the next action that runs on it.
input.persist()

23/12/18 14:26:09 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[target_chembl_id: string, drugId: string, target_chembl_id_moa_aggr: string, pchembl_value_aggr: string, max_pchembl_value: string, median_pchembl_value: double, drugType: string, maximumClinicalTrialPhase: double, isApproved: boolean, linkedTargets: struct<count:bigint,rows:array<string>>, linkedDiseases: struct<count:bigint,rows:array<string>>, assay_chembl_id: string, assay_type: string, action_type: struct<action_type:string,description:string,parent_type:string>, target_organism: string, target_pref_name: string, data_validity_comment: string, data_validity_description: string, confidence_score: bigint, confidence_description: string, assay_category: string, target_components: array<struct<accession:string,component_description:string,component_id:bigint,component_type:string,relationship:string,target_component_synonyms:array<struct<component_synonym:string,syn_type:string>>,target_component_xrefs:array<struct<xref_id:string,xref_name:string,xref_src_db:string,xref_src_

In [41]:
# Preview the data; show() with no arguments prints the first 20 rows with long values truncated.
input.show()

+----------------+-------------+-------------------------+--------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+--------------+---------+------------+------------+------------+---------------+-------+---------------+--------------------+------------------+-------------------+-----+-------------+-------+
|target_chembl_id|       drugId|target_chembl_id_moa_aggr|  pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   t

In [7]:
# How many targets:
#   have assays for active drugs (pchembl median)
#   not in MoA of these drugs (this cell keeps only rows where isInMoA is null)
#   supported by GE or clinical evidence

# Parenthesised chain instead of trailing backslashes: the original ended with a dangling
# "\" before a blank line, which only parses by accident and breaks as soon as a line is added.
# Column names use the exact case from the schema (isActive_med), so the filter does not
# depend on Spark's case-insensitive column resolution.
med_notmoa_ge_clin = (
    input
    .filter(input["isActive_med"] == True)
    .filter(input["isInMoA"].isNull())
    .filter(input["isGE_clinical"] == True)
)

# count_unique_values() already applies distinct() to the column, so a preceding
# drop_duplicates() would only add an extra shuffle without changing the result.
count_unique_values(med_notmoa_ge_clin, "target_chembl_id")

                                                                                

133

In [46]:
# How many targets:
#   have assays for active drugs (pchembl median)
#   not in MoA of these drugs (or MoA is unknown)
#   supported by GE

# Parenthesised chain instead of trailing backslashes (the original ended with a dangling
# "\" before a blank line). Column names use the exact case from the schema (isActive_med).
med_notmoa_ge = (
    input
    .filter(input["isActive_med"] == True)
    .filter(col("isInMoA").isNull() | (col("isInMoA") == False))
    .filter(input["isGE"] == True)
)

# count_unique_values() already applies distinct() to the column, so no drop_duplicates()
# is needed before calling it.
med_notmoa_ge_t = count_unique_values(med_notmoa_ge, "target_chembl_id")
med_notmoa_ge_d = count_unique_values(med_notmoa_ge, "drugId")

# "Assays" is the number of rows that pass the filters.
print("Assays: ", med_notmoa_ge.count())
print("Targets: ", med_notmoa_ge_t)
print("Drugs: ", med_notmoa_ge_d)

Assays:  636
Targets:  270
Drugs:  436


In [51]:
# How many targets:
#   have assays for active drugs (pchembl max)
#   not in MoA of these drugs (or MoA is unknown)
#   supported by GE

# Parenthesised chain instead of trailing backslashes (the original ended with a dangling
# "\" before a blank line). Column names use the exact case from the schema (isActive_max).
max_notmoa_ge = (
    input
    .filter(input["isActive_max"] == True)
    .filter(col("isInMoA").isNull() | (col("isInMoA") == False))
    .filter(input["isGE"] == True)
)

# count_unique_values() already applies distinct() to the column, so no drop_duplicates()
# is needed before calling it.
max_notmoa_ge_t = count_unique_values(max_notmoa_ge, "target_chembl_id")
max_notmoa_ge_d = count_unique_values(max_notmoa_ge, "drugId")

# "Assays" is the number of rows that pass the filters.
print("Assays: ", max_notmoa_ge.count())
print("Targets: ", max_notmoa_ge_t)
print("Drugs: ", max_notmoa_ge_d)

Assays:  680
Targets:  277
Drugs:  450


In [9]:
# How many targets:
#   have assays for active drugs (pchembl max)
#   not in MoA of these drugs (or MoA is unknown)
#   supported by GE and clin (chembl) evidence (isGE_clinical flag)

# The header previously said "pchembl median", but this cell filters on the max-based flag.
# Parenthesised chain instead of trailing backslashes (the original ended with a dangling
# "\" before a blank line). Column names use the exact case from the schema (isActive_max).
max_notmoa_ge_clin = (
    input
    .filter(input["isActive_max"] == True)
    .filter(col("isInMoA").isNull() | (col("isInMoA") == False))
    .filter(input["isGE_clinical"] == True)
)

# count_unique_values() already applies distinct() to the column, so no drop_duplicates()
# is needed before calling it.
max_notmoa_ge_t_clin = count_unique_values(max_notmoa_ge_clin, "target_chembl_id")
max_notmoa_ge_d_clin = count_unique_values(max_notmoa_ge_clin, "drugId")

# "Assays" is the number of rows that pass the filters.
print("Assays: ", max_notmoa_ge_clin.count())
print("Targets: ", max_notmoa_ge_t_clin)
print("Drugs: ", max_notmoa_ge_d_clin)

Assays:  1051
Targets:  354
Drugs:  605


In [53]:
# How many targets:
#   have assays for active drugs (pchembl median)
#   not in MoA of these drugs (or MoA is unknown)
#   supported by GE and clin (chembl) evidence (isGE_clinical flag)

# NOTE: this re-binds med_notmoa_ge_clin, which an earlier cell defines with a narrower
# isInMoA filter (null only); after this cell runs the name refers to the broader selection.
# Parenthesised chain instead of trailing backslashes (the original ended with a dangling
# "\" before a blank line). Column names use the exact case from the schema (isActive_med).
med_notmoa_ge_clin = (
    input
    .filter(input["isActive_med"] == True)
    .filter(col("isInMoA").isNull() | (col("isInMoA") == False))
    .filter(input["isGE_clinical"] == True)
)

# count_unique_values() already applies distinct() to the column, so no drop_duplicates()
# is needed before calling it.
med_notmoa_ge_t_clin = count_unique_values(med_notmoa_ge_clin, "target_chembl_id")
med_notmoa_ge_d_clin = count_unique_values(med_notmoa_ge_clin, "drugId")

# "Assays" is the number of rows that pass the filters.
print("Assays: ", med_notmoa_ge_clin.count())
print("Targets: ", med_notmoa_ge_t_clin)
print("Drugs: ", med_notmoa_ge_d_clin)

Assays:  995
Targets:  347
Drugs:  588


In [52]:
# Protein-class breakdown counted per target rather than per row: drop_duplicates keeps one
# row for each target_chembl_id before grouping by proteinClass.
# NOTE(review): which row is kept per target is not specified, so this assumes proteinClass
# is the same on every row of a given target — TODO confirm.
show_unique_values_and_counts(med_notmoa_ge.drop_duplicates(["target_chembl_id"]), "proteinClass")


+------------+-----+
|proteinClass|count|
+------------+-----+
|          IC|   22|
|      Enzyme|   72|
|        None|   37|
|        GPCR|   42|
|      Kinase|   54|
|  Epigenetic|   18|
|          NR|   11|
| Transporter|    9|
|          TF|    5|
+------------+-----+



In [16]:
# Inspect the column types; the nested struct/array columns listed here are the ones the
# flattening cell below has to unpack before the CSV export.
med_notmoa_ge.printSchema()

root
 |-- target_chembl_id: string (nullable = true)
 |-- drugId: string (nullable = true)
 |-- target_chembl_id_moa_aggr: string (nullable = true)
 |-- pchembl_value_aggr: string (nullable = true)
 |-- max_pchembl_value: string (nullable = true)
 |-- median_pchembl_value: double (nullable = true)
 |-- drugType: string (nullable = true)
 |-- maximumClinicalTrialPhase: double (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- linkedTargets: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- linkedDiseases: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- assay_chembl_id: string (nullable = true)
 |-- assay_type: string (nullable = true)
 |-- action_type: struct (nullable = true)
 |    |-- action_type: string (nullable = true)
 |    |-- description: s

In [43]:
# Preview the filtered rows; show() with no arguments prints the first 20 rows, truncated.
med_notmoa_ge.show()

+----------------+-------------+-------------------------+--------------------+-----------------+--------------------+--------------+-------------------------+----------+--------------------+--------------------+---------------+----------+-----------+---------------+--------------------+---------------------+-------------------------+----------------+----------------------+--------------+--------------------+---------------+---------+------------+------------+------------+---------------+-------+---------------+--------------------+------------------+-------------------+----+-------------+-------+
|target_chembl_id|       drugId|target_chembl_id_moa_aggr|  pchembl_value_aggr|max_pchembl_value|median_pchembl_value|      drugType|maximumClinicalTrialPhase|isApproved|       linkedTargets|      linkedDiseases|assay_chembl_id|assay_type|action_type|target_organism|    target_pref_name|data_validity_comment|data_validity_description|confidence_score|confidence_description|assay_category|   t

In [44]:
# Flatten the structure
# Struct fields are promoted to top-level columns and array columns are joined into
# ", "-separated strings, so that no nested column is left when the frame is written as CSV.
med_notmoa_ge_flattened = (
    med_notmoa_ge
    # Struct fields -> plain columns.
    .withColumn("linkedTargets_count", col("linkedTargets.count"))
    .withColumn("linkedDiseases_count", col("linkedDiseases.count"))
    .withColumn("action_type_action_type", col("action_type.action_type"))
    .withColumn("action_type_description", col("action_type.description"))
    .withColumn("action_type_parent_type", col("action_type.parent_type"))
    # Array columns -> one delimited string each. The nested arrays are read straight from
    # their struct, so no temporary *_rows column has to be created and dropped again.
    # NOTE(review): "sources" is presumably an array of strings — confirm against the schema.
    .withColumn("sources_concated", concat_ws(", ", col("sources")))
    .withColumn("linkedTargets_rows_concated", concat_ws(", ", col("linkedTargets.rows")))
    .withColumn("linkedDiseases_rows_concated", concat_ws(", ", col("linkedDiseases.rows")))
    # Remove the original nested columns; target_components is dropped without being flattened.
    .drop("sources", "linkedTargets", "linkedDiseases", "action_type", "target_components")
)

# med_notmoa_ge_flattened.printSchema()
# coalesce(1) produces a single part file inside the output directory.
# mode("overwrite") keeps the cell re-runnable: with the default save mode the write raises
# as soon as the output path already exists from a previous run.
med_notmoa_ge_flattened.coalesce(1).write.mode("overwrite").csv("files/med_notmoa_ge_v2.csv", header=True)

In [39]:
# List the column names after flattening to check that only flat columns remain.
med_notmoa_ge_flattened.columns

['target_chembl_id',
 'drugId',
 'target_chembl_id_moa_aggr',
 'pchembl_value_aggr',
 'max_pchembl_value',
 'median_pchembl_value',
 'drugType',
 'maximumClinicalTrialPhase',
 'isApproved',
 'assay_chembl_id',
 'assay_type',
 'target_organism',
 'target_pref_name',
 'data_validity_comment',
 'data_validity_description',
 'confidence_score',
 'confidence_description',
 'assay_category',
 'target_type',
 'accession',
 'proteinClass',
 'isActive_max',
 'isActive_med',
 'action_type_moa',
 'isInMoA',
 'targetId',
 'isHighQualityProbe',
 'isTherapeuticTarget',
 'isGE',
 'isGE_clinical',
 'isProbe',
 'linkedTargets_count',
 'linkedDiseases_count',
 'action_type_action_type',
 'action_type_description',
 'action_type_parent_type',
 'sources_concated',
 'linkedTargets_rows_concated',
 'linkedDiseases_rows_concated']

In [37]:
# Row count before flattening, for comparison with the flattened frame.
med_notmoa_ge.count()

144

In [36]:
# Row count after flattening; the flattening step only adds and drops columns, so this is
# expected to equal med_notmoa_ge.count().
med_notmoa_ge_flattened.count()

144

In [36]:
def join_dataframes(initial_df: DataFrame,
                    second_df: DataFrame,
                    initial_key_column: str,
                    second_key_column: str,
                    columns_to_join: list) -> DataFrame:
    """
    Left-join selected columns of a second PySpark DataFrame onto an initial one.

    Every row of ``initial_df`` is kept. The key column of ``second_df`` is used only for
    matching and is dropped from the result.

    Args:
    initial_df (DataFrame): The initial PySpark DataFrame (left side of the join).
    second_df (DataFrame): The second PySpark DataFrame to join with (right side).
    initial_key_column (str): The key column name in the initial DataFrame.
    second_key_column (str): The key column name in the second DataFrame.
    columns_to_join (list): Column names from the second DataFrame to include in the join.
        Any iterable of names is accepted, and a single name may be given as a plain string.
        Repeated names and the key column itself are selected only once.

    Returns:
    DataFrame: The resulting DataFrame after the left join, without the second key column.

    NOTE(review): names in ``columns_to_join`` that also exist in ``initial_df`` are not
    renamed here, so the result would then contain two columns with the same name.
    """
    # A bare string is a single column name, not an iterable of one-character names.
    if isinstance(columns_to_join, str):
        columns_to_join = [columns_to_join]

    # Key column first, then the requested columns in order. Each name is selected once:
    # selecting the key twice would leave two identically named columns, and the key
    # lookups used for the join condition and the drop would no longer be unambiguous.
    selected_columns = [second_key_column]
    for column_name in columns_to_join:
        if column_name not in selected_columns:
            selected_columns.append(column_name)
    second_df_selected = second_df.select(selected_columns)

    # Performing the left join
    join_condition = initial_df[initial_key_column] == second_df_selected[second_key_column]
    joined_df = initial_df.join(second_df_selected, join_condition, how='left')

    # The right-hand key is dropped by column reference, so a left-hand column with the
    # same name is not the one targeted.
    return joined_df.drop(second_df_selected[second_key_column])


                                                                                