In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws
from pyspark.sql import functions as F

In [2]:
# Build the Spark session for this analysis (getOrCreate reuses a running
# session if one exists). The "spark.jars" entry points at the
# gcs-connector-hadoop3 jar hosted on Google Cloud Storage.
spark = (
    SparkSession.builder
    .appName("drug-to_target_biodata_analysis")
    .config(
        "spark.jars",
        "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar",
    )
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/06 14:28:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
# Read the parquet dataset that combines all data sources, then print a
# preview of it (show() prints the first rows to stdout).
biodata_all = spark.read.format("parquet").load("data/analysis/biodata_all_v2")
biodata_all.show()

+---------+------------+--------+---------------+------+--------------+--------------------+-----------+--------------------+------+--------------------+----------------+------------+--------------+--------------+-----------------+-------------+---------------+-----+------------+---------+--------------------+------------------+-------------------+------------+--------+
|accession|   chembl_id|molregno|assay_chembl_id|src_id|src_short_name|             journal|  pubmed_id|                 doi|   tid|           pref_name|target_chembl_id|    organism|standard_value|standard_units|standard_relation|pchembl_value|       targetID|isMoA|      drugId|uniprotId|             sources|isHighQualityProbe|isTherapeuticTarget|proteinClass|isActive|
+---------+------------+--------+---------------+------+--------------+--------------------+-----------+--------------------+------+--------------------+----------------+------------+--------------+--------------+-----------------+-------------+---------

### Create contingency table for biodata_all regardless of source of evidence

In [4]:
# isActive x isMoA contingency table over every row of biodata_all.
# groupBy(...).count() puts the per-group row count in a column named "count".
contingency_table_all = (
    biodata_all
    .groupBy("isActive", "isMoA")
    .count()
    .sort("isActive", "isMoA")
)

print("Contingency table regardless source of evidence:")
contingency_table_all.show()
# Bring the grouped rows back to the driver as a list of Row objects.
contingency_table_all_list = contingency_table_all.collect()

Contingency table regardless source of evidence:
+--------+-----+-----+
|isActive|isMoA|count|
+--------+-----+-----+
|   false|false|14123|
|   false| true|  426|
|    true|false|12439|
|    true| true| 2720|
+--------+-----+-----+



### Create contingency table for biodata_all for source of evidence = GE_plus




In [27]:
# Flag each row with 'isGE_plus': True when 'sources' is present and holds at
# least one entry, False when it is null or an empty array.
has_any_source = F.col('sources').isNotNull() & (F.size(F.col('sources')) > 0)
biodata_GE_plus = biodata_all.withColumn('isGE_plus', has_any_source)
biodata_GE_plus.show()

+---------+------------+--------+---------------+------+--------------+--------------------+-----------+--------------------+------+--------------------+----------------+------------+--------------+--------------+-----------------+-------------+---------------+-----+------------+---------+--------------------+------------------+-------------------+------------+--------+---------+
|accession|   chembl_id|molregno|assay_chembl_id|src_id|src_short_name|             journal|  pubmed_id|                 doi|   tid|           pref_name|target_chembl_id|    organism|standard_value|standard_units|standard_relation|pchembl_value|       targetID|isMoA|      drugId|uniprotId|             sources|isHighQualityProbe|isTherapeuticTarget|proteinClass|isActive|isGE_plus|
+---------+------------+--------+---------------+------+--------------+--------------------+-----------+--------------------+------+--------------------+----------------+------------+--------------+--------------+-----------------+---

In [28]:
# Keep only the rows whose 'isGE_plus' flag is True.
biodata_GE_plus_t = biodata_GE_plus.where(F.col('isGE_plus'))

# isActive x isMoA contingency table restricted to that subset.
# groupBy(...).count() puts the per-group row count in a column named "count".
contingency_table_GE_plus_t = (
    biodata_GE_plus_t
    .groupBy("isActive", "isMoA")
    .count()
    .sort("isActive", "isMoA")
)

print("Contingency table for source of evidence = GE_plus:")
contingency_table_GE_plus_t.show()
# Bring the grouped rows back to the driver as a list of Row objects.
contingency_table_GE_plus_t_list = contingency_table_GE_plus_t.collect()

Contingency table for source of evidence = GE_plus:
+--------+-----+-----+
|isActive|isMoA|count|
+--------+-----+-----+
|   false|false| 6714|
|   false| true|  425|
|    true|false| 7630|
|    true| true| 2718|
+--------+-----+-----+



In [29]:
# Keep only the rows whose 'isGE_plus' flag is False.
biodata_GE_plus_f = biodata_GE_plus.where(F.col('isGE_plus') == F.lit(False))

# isActive x isMoA contingency table for the subset where 'isGE_plus' is False.
# groupBy(...).count() puts the per-group row count in a column named "count".
contingency_table_GE_plus_f = (
    biodata_GE_plus_f
    .groupBy("isActive", "isMoA")
    .count()
    .sort("isActive", "isMoA")
)

print("Contingency table for source of evidence != GE_plus:")
contingency_table_GE_plus_f.show()
# Bring the grouped rows back to the driver as a list of Row objects.
contingency_table_GE_plus_f_list = contingency_table_GE_plus_f.collect()

Contingency table for source of evidence != GE_plus:
+--------+-----+-----+
|isActive|isMoA|count|
+--------+-----+-----+
|   false|false| 7409|
|   false| true|    1|
|    true|false| 4809|
|    true| true|    2|
+--------+-----+-----+



In [31]:
# Three-way table: isActive x isMoA with 'isGE_plus' as the third dimension.
# groupBy(...).count() puts the per-group row count in a column named "count".
three_way_table_GE_plus = (
    biodata_GE_plus
    .groupBy("isActive", "isMoA", "isGE_plus")
    .count()
    .sort("isActive", "isMoA", "isGE_plus")
)

# Print the three-way table.
three_way_table_GE_plus.show()

+--------+-----+---------+-----+
|isActive|isMoA|isGE_plus|count|
+--------+-----+---------+-----+
|   false|false|    false| 7409|
|   false|false|     true| 6714|
|   false| true|    false|    1|
|   false| true|     true|  425|
|    true|false|    false| 4809|
|    true|false|     true| 7630|
|    true| true|    false|    2|
|    true| true|     true| 2718|
+--------+-----+---------+-----+



### Create contingency table for biodata_all for source of evidence = GE

In [22]:
# Sources that do not count as GE evidence, as array column expressions.
# The isGE expression below only uses `undesired_both`; the other three names
# are left defined in case other cells reference them.
undesired_chembl = F.array([F.lit('chembl')])
undesired_chemicalProbes = F.array([F.lit('chemicalProbes')])
undesired_both = F.array([F.lit('chembl'), F.lit('chemicalProbes')])
undesired_both_alt = F.array([F.lit('chemicalProbes'), F.lit('chembl')])

# Add the 'isGE' column: True when 'sources' contains at least one entry other
# than 'chembl' / 'chemicalProbes'.
#
# array_except() drops every element of 'sources' that appears in
# `undesired_both`, so the check does not depend on element order or on
# repeated entries. The previous version compared sorted arrays for exact
# equality, which needed one comparison per combination (two of them were
# identical after sorting) and treated e.g. ['chembl', 'chembl'] as GE.
#
# A null 'sources' makes array_except() null; size() of null is -1 or null
# depending on Spark settings, so coalesce() pins that case to False.
biodata_GE = biodata_all.withColumn(
    'isGE',
    F.coalesce(
        F.size(F.array_except(F.col('sources'), undesired_both)) > 0,
        F.lit(False),
    )
)
biodata_GE.show()

+---------+------------+--------+---------------+------+--------------+--------------------+-----------+--------------------+------+--------------------+----------------+------------+--------------+--------------+-----------------+-------------+---------------+-----+------------+---------+--------------------+------------------+-------------------+------------+--------+-----+
|accession|   chembl_id|molregno|assay_chembl_id|src_id|src_short_name|             journal|  pubmed_id|                 doi|   tid|           pref_name|target_chembl_id|    organism|standard_value|standard_units|standard_relation|pchembl_value|       targetID|isMoA|      drugId|uniprotId|             sources|isHighQualityProbe|isTherapeuticTarget|proteinClass|isActive| isGE|
+---------+------------+--------+---------------+------+--------------+--------------------+-----------+--------------------+------+--------------------+----------------+------------+--------------+--------------+-----------------+-----------

In [23]:
# Keep only the rows whose 'isGE' flag is True.
biodata_GE_t = biodata_GE.where(F.col('isGE'))

# isActive x isMoA contingency table restricted to that subset.
# groupBy(...).count() puts the per-group row count in a column named "count".
contingency_table_GE_t = (
    biodata_GE_t
    .groupBy("isActive", "isMoA")
    .count()
    .sort("isActive", "isMoA")
)

print("Contingency table for source of evidence = GE:")
contingency_table_GE_t.show()
# Bring the grouped rows back to the driver as a list of Row objects.
contingency_table_GE_t_list = contingency_table_GE_t.collect()

Contingency table for source of evidence = GE:
+--------+-----+-----+
|isActive|isMoA|count|
+--------+-----+-----+
|   false|false| 4667|
|   false| true|  286|
|    true|false| 2493|
|    true| true| 1696|
+--------+-----+-----+



In [24]:
# Keep only the rows whose 'isGE' flag is False.
biodata_GE_f = biodata_GE.where(F.col('isGE') == F.lit(False))

# isActive x isMoA contingency table for the subset where 'isGE' is False.
# groupBy(...).count() puts the per-group row count in a column named "count".
contingency_table_GE_f = (
    biodata_GE_f
    .groupBy("isActive", "isMoA")
    .count()
    .sort("isActive", "isMoA")
)

print("Contingency table for source of evidence != GE:")
contingency_table_GE_f.show()
# Bring the grouped rows back to the driver as a list of Row objects.
contingency_table_GE_f_list = contingency_table_GE_f.collect()

Contingency table for source of evidence != GE:
+--------+-----+-----+
|isActive|isMoA|count|
+--------+-----+-----+
|   false|false| 9456|
|   false| true|  140|
|    true|false| 9946|
|    true| true| 1024|
+--------+-----+-----+



In [32]:
# Three-way table: isActive x isMoA with 'isGE' as the third dimension.
# groupBy(...).count() puts the per-group row count in a column named "count".
three_way_table_GE = (
    biodata_GE
    .groupBy("isActive", "isMoA", "isGE")
    .count()
    .sort("isActive", "isMoA", "isGE")
)

# Print the three-way table.
three_way_table_GE.show()

+--------+-----+-----+-----+
|isActive|isMoA| isGE|count|
+--------+-----+-----+-----+
|   false|false|false| 9456|
|   false|false| true| 4667|
|   false| true|false|  140|
|   false| true| true|  286|
|    true|false|false| 9946|
|    true|false| true| 2493|
|    true| true|false| 1024|
|    true| true| true| 1696|
+--------+-----+-----+-----+

