In [7]:
# One-time workspace setup, kept commented out so that running the notebook end to end does
# not reinstall or re-download anything. Uncomment and run once in a fresh environment.
#   - openpyxl: presumably required by pd.read_excel for the OMOP2OBO .xlsx workbook (confirm)
#   - /resources on HDFS: where the .tsv / .RRF files read with spark.read.csv are stored
#   - the OMOP2OBO workbook is only downloaded locally (it is read with pandas, not Spark)
# NOTE(review): RXNCONSO.RRF is only uploaded below, never downloaded here; it must already
# be present in the working directory. Record where it is obtained from.
# ! pip install openpyxl
# ! hadoop fs -mkdir /resources
# ! wget  -nd  biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/omop_concept.tsv  && hadoop fs -put omop_concept.tsv /resources/
# ! wget  -nd  biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/omop_drug_strength.tsv && hadoop fs -put omop_drug_strength.tsv /resources/
# ! wget https://zenodo.org/records/6949696/files/OMOP2OBO_V1_Drug_Exposure_Mapping_Oct2020.xlsx
# ! hadoop fs -put RXNCONSO.RRF /resources/

[0mCollecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5
[0m

In [2]:
# NOTES
# https://www.nlm.nih.gov/research/umls/rxnorm/docs/appendix1.html
# https://www.nlm.nih.gov/research/umls/rxnorm/docs/appendix5.html
# https://www.nlm.nih.gov/research/umls/rxnorm/docs/techdoc.html#conso

In [1]:
import subprocess
from pathlib import Path

import dxdata
import dxpy
import pandas as pd
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [2]:
# Start Spark, or reattach to the context that is already running in this kernel.
# getOrCreate() keeps the cell re-runnable: calling pyspark.SparkContext() a second time
# in the same kernel raises "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# Look up the database and the dataset record by glob on their names in the root folder.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

# Make that database the default for subsequent Spark SQL statements.
spark.sql("USE " + dispensed_database_name)

# Dataset handle used by the following cells to pick an OMOP table.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [3]:
# Load one OMOP table of the dataset into a Spark DataFrame
# (Spark itself is started in the session-setup cell, not here.)
OMOP_TABLE = "omop_dose_era" # 'omop_drug_era', 'omop_drug_exposure'

# Entity for the selected table (dose era by default)
ode = dataset[OMOP_TABLE]

# Retrieve every field the entity declares
field_names = [f.name for f in ode.fields]

# The "eid" column is dropped before anything is displayed or joined.
# cache(): df is counted and joined again by most of the later cells; without it each of
# those actions re-runs the retrieval from the source tables.
df = ode.retrieve_fields(names=field_names, engine=dxdata.connect()).drop("eid").cache()

print(f"Number of entries {df.count()}")
df.show(5)

Number of entries 34321
+-------------+---------------+---------------+----------+-------------------+-----------------+
|  dose_era_id|drug_concept_id|unit_concept_id|dose_value|dose_era_start_date|dose_era_end_date|
+-------------+---------------+---------------+----------+-------------------+-----------------+
|1425929142274|        1549786|             -1|        -1|         25/02/1998|       26/03/1998|
| 764504178721|        1589505|             -1|        -1|         16/07/1999|       14/08/1999|
|1460288880666|        1549786|             -1|        -1|         04/11/1996|       03/12/1996|
| 841813590107|        1589505|             -1|        -1|         21/12/1999|       19/01/2000|
| 738734375019|        1549786|             -1|        -1|         21/12/1999|       19/01/2000|
+-------------+---------------+---------------+----------+-------------------+-----------------+
only showing top 5 rows



In [6]:
# Read the OMOP concept table and keep only the RxNorm "Ingredient" concepts that carry
# no invalid_reason; their concept_code is what later cells match against RXNCONSO.
conc = spark.read.csv("/resources/omop_concept.tsv", sep="\t", header=True, inferSchema=True)

rx_ing = (
    conc
    .where(conc.vocabulary_id == "RxNorm")
    .where(conc.concept_class_id == "Ingredient")
    .where(conc.invalid_reason.isNull())
)

print(f"RxNorm Ingredients: {rx_ing.count()}")
print(rx_ing.columns)
rx_ing.show(n=3, truncate=False, vertical=False)

RxNorm Ingredients: 13137
['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code', 'valid_start_date', 'valid_end_date', 'invalid_reason']
+----------+------------------------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|concept_id|concept_name                        |domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|
+----------+------------------------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|501343    |hepatitis B immune globulin         |Drug     |RxNorm       |Ingredient      |S               |26744       |1970-01-01      |2099-12-31    |null          |
|507832    |herpesvirus 3, human                |Drug     |RxNorm       |Ingredient      |S               |11131       |1970-01-0

In [7]:
# Distinct drug concepts used in the loaded OMOP table, each annotated with its
# RxNorm ingredient row from the concept table.

distinct_drug_ids = df.select("drug_concept_id").distinct()
print(f"Number of records before join: {distinct_drug_ids.count()}")

# Inner join: a drug_concept_id that is not a valid RxNorm ingredient would be dropped here.
dci = distinct_drug_ids.join(
    rx_ing,
    on=distinct_drug_ids["drug_concept_id"] == rx_ing["concept_id"],
    how="inner",
)
print(f"Number of records after join: {dci.count()}")

dci.show(n=5, truncate=False, vertical=False)

Number of records before join: 618
Number of records after join: 618
+---------------+----------+----------------------------------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|drug_concept_id|concept_id|concept_name                                  |domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|
+---------------+----------+----------------------------------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|528323         |528323    |hepatitis B surface antigen vaccine           |Drug     |RxNorm       |Ingredient      |S               |797752      |2008-06-29      |2099-12-31    |null          |
|529660         |529660    |hepatitis A vaccine (inactivated) strain HM175|Drug     |RxNorm       |Ingredient      |S               |798361      |2008-06-2

In [8]:
###
### RxNorm Ingredient
###

# Documentation here https://www.nlm.nih.gov/research/umls/rxnorm/docs/techdoc.html#conso
# See https://www.nlm.nih.gov/research/umls/rxnorm/docs/appendix5.html for TTY
# RXNCONSO.RRF is read without a header, so Spark names the columns _c0, _c1, ... by position:
#   _c11 = SAB (source abbreviation), _c12 = TTY (term type in source),
#   _c16 = SUPPRESS (N = not suppressed)

conso = spark.read.csv("/resources/RXNCONSO.RRF", sep = "|", header=False, inferSchema=True)

# Keep the non-suppressed RxNorm ingredient (IN) rows
concept_code_to_drug = conso.filter(
    (conso["_c11"] == "RXNORM") & (conso["_c12"] == "IN") & (conso["_c16"] == "N")
)

# Identify columns where all values are null, using ONE aggregation job.
# F.count(column) ignores nulls, so a count of 0 means the column is entirely null in this
# subset. (Filtering and counting each column separately launches one Spark job per column.)
non_null_counts = concept_code_to_drug.agg(
    *[F.count(col(col_name)).alias(col_name) for col_name in concept_code_to_drug.columns]
).first()
non_null_columns = [
    col_name for col_name in concept_code_to_drug.columns if non_null_counts[col_name] > 0
]

# Select only the columns that are not entirely null
concept_code_to_drug = concept_code_to_drug.select(*non_null_columns)

print(f"Number of RxNorm IN concepts: {concept_code_to_drug.count()}")
concept_code_to_drug.show(3, truncate = False, vertical = False)

Number of RxNorm Ingredients: 14395
+---+---+--------+--------+---+------+----+----+----------------+----+----+
|_c0|_c1|_c7     |_c8     |_c9|_c11  |_c12|_c13|_c14            |_c16|_c17|
+---+---+--------+--------+---+------+----+----+----------------+----+----+
|44 |ENG|12251526|12251526|44 |RXNORM|IN  |44  |mesna           |N   |4096|
|61 |ENG|12254378|12254378|61 |RXNORM|IN  |61  |beta-alanine    |N   |4096|
|73 |ENG|12252051|12252051|73 |RXNORM|IN  |73  |docosahexaenoate|N   |4096|
+---+---+--------+--------+---+------+----+----+----------------+----+----+
only showing top 3 rows



In [9]:
# Attach the RXNCONSO ingredient row to every dose-era ingredient by matching the OMOP
# concept_code against the first RXNCONSO column (_c0).
code_matches_rxnconso = dci["concept_code"] == concept_code_to_drug["_c0"]
dci2 = dci.join(concept_code_to_drug, on=code_matches_rxnconso, how="inner")

print(dci2.count()) # We lose Senna Leaves which is obsolete
dci2.show(n=3, truncate=False, vertical=False)

617
+---------------+----------+------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+---+---+--------+--------+---+------+----+----+------------------+----+----+
|drug_concept_id|concept_id|concept_name      |domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|_c0|_c1|_c7     |_c8     |_c9|_c11  |_c12|_c13|_c14              |_c16|_c17|
+---------------+----------+------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+---+---+--------+--------+---+------+----+----+------------------+----+----+
|929549         |929549    |acetic acid       |Drug     |RxNorm       |Ingredient      |S               |168         |1970-01-01      |2099-12-31    |null          |168|ENG|12250948|12250948|168|RXNORM|IN  |168 |acetic acid       |N   |4096|
|1154343        |1154343   |

In [10]:
# Row-level coverage: how many rows of df survive the inner join with the
# RXNCONSO-matched ingredients in dci2.
n_entries_original = df.count()
print(f"Number of entries original: {n_entries_original}")

n_entries_finished = df.join(dci2, on="drug_concept_id", how="inner").count()
print(f"Number of entries finished: {n_entries_finished}")

Number of entries original: 34321
Number of entries finished: 34303


In [11]:
# Ingredients present in dci that found no RXNCONSO match in dci2.
dci.join(
    dci2,
    on="drug_concept_id",
    how="left_anti",
).show()

+---------------+----------+------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|drug_concept_id|concept_id|concept_name|domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|
+---------------+----------+------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|         992409|    992409|Senna leaves|     Drug|       RxNorm|      Ingredient|               S|      237929|      1970-01-01|    2099-12-31|          null|
+---------------+----------+------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+



In [12]:
# Collect the ingredient mapping to the driver as a pandas frame and write it as a
# tab-separated file. No index= argument is passed, so pandas' default index handling applies.
dci2.toPandas().to_csv(
    "dose_era_ingredients.tsv",
    sep="\t",
)

In [13]:
###
### ATC codes
###
# Reads the same RXNCONSO.RRF file as the RxNorm ingredient section; columns are
# positional (_c11 = SAB, _c12 = TTY, _c16 = SUPPRESS).
conso = spark.read.csv("/resources/RXNCONSO.RRF", sep = "|", header=False, inferSchema=True)

# Keep the non-suppressed ingredient-level (IN) rows contributed by the ATC source
concept_code_to_atc = conso.filter(
    (conso["_c11"] == "ATC") & (conso["_c12"] == "IN") & (conso["_c16"] == "N")
)

# Identify columns where all values are null, using ONE aggregation job.
# F.count(column) ignores nulls, so a count of 0 means the column is entirely null in this
# subset. (Filtering and counting each column separately launches one Spark job per column.)
non_null_counts = concept_code_to_atc.agg(
    *[F.count(col(col_name)).alias(col_name) for col_name in concept_code_to_atc.columns]
).first()
non_null_columns = [
    col_name for col_name in concept_code_to_atc.columns if non_null_counts[col_name] > 0
]

# Select only the columns that are not entirely null
concept_code_to_atc = concept_code_to_atc.select(*non_null_columns)

print(f"Number of ATC codes: {concept_code_to_atc.count()}")
concept_code_to_atc.show(3, truncate = False, vertical = False)

Number of ATC codes: 5495
+---+---+-------+----+----+-------+--------+----+
|_c0|_c1|_c7    |_c11|_c12|_c13   |_c14    |_c16|
+---+---+-------+----+----+-------+--------+----+
|44 |ENG|5481074|ATC |IN  |R05CB05|mesna   |N   |
|44 |ENG|5481075|ATC |IN  |V03AF01|mesna   |N   |
|63 |ENG|5481013|ATC |IN  |C10AX05|meglutol|N   |
+---+---+-------+----+----+-------+--------+----+
only showing top 3 rows



In [21]:
# Attach ATC rows to every dose-era ingredient by matching the OMOP concept_code against
# the first RXNCONSO column (_c0). One ingredient can match several ATC rows, so dci3 may
# hold more rows than dci.
code_matches_atc = dci["concept_code"] == concept_code_to_atc["_c0"]
dci3 = dci.join(concept_code_to_atc, on=code_matches_atc, how="inner")

print(dci3.count())
dci3.show(n=3, truncate=False, vertical=False)

832
+---------------+----------+------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+---+---+-------+----+----+-------+-----------+----+
|drug_concept_id|concept_id|concept_name|domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|_c0|_c1|_c7    |_c11|_c12|_c13   |_c14       |_c16|
+---------------+----------+------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+---+---+-------+----+----+-------+-----------+----+
|929549         |929549    |acetic acid |Drug     |RxNorm       |Ingredient      |S               |168         |1970-01-01      |2099-12-31    |null          |168|ENG|5478476|ATC |IN  |G01AD02|acetic acid|N   |
|929549         |929549    |acetic acid |Drug     |RxNorm       |Ingredient      |S               |168         |1970-01-01      |2099-12-31    |null    

In [15]:
# Row-level and ingredient-level coverage of the ATC mapping.
# The inner join repeats a df row once per matching ATC row, so the "finished" row count
# can exceed the original; the distinct count is the number of ingredients with any ATC code.
print(f"Number of entries original: {df.count()}")

dose_era_with_atc = df.join(dci3, on="drug_concept_id", how="inner")
print(f"Number of entries finished: {dose_era_with_atc.count()}")

n_distinct_with_atc = dose_era_with_atc.select("drug_concept_id").distinct().count()
print(f"Number of distinct finished: {n_distinct_with_atc}")

Number of entries original: 34321
Number of entries finished: 72093
Number of distinct finished: 509


In [16]:
# Ingredients in dci that received no ATC row in dci3
atc_missing = dci.join(dci3, on = 'drug_concept_id', how = 'left_anti')

print(f"Missing ATC: {atc_missing.count()}")
# Rows of df whose ingredient is NOT among the ones missing an ATC code
print(f"Records with ATC codes: {df.join(atc_missing, on = 'drug_concept_id', how = 'left_anti').count()}")


# Toggle and row limit for listing the unmapped ingredients
SHOW_MISSING = False
N_SHOW = 10

if SHOW_MISSING:
    # DataFrame.show() only prints and returns None, so its result is not stored.
    atc_missing.show(N_SHOW)

Missing ATC: 109
Records with ATC codes: 31409


In [18]:
# Collect the ATC mapping to the driver as a pandas frame and write it as a
# tab-separated file. No index= argument is passed, so pandas' default index handling applies.
dci3.toPandas().to_csv(
    "dose_era_atc.tsv",
    sep="\t",
)

In [20]:
###
### DrugBank
###
# Reads the same RXNCONSO.RRF file as the RxNorm ingredient section; columns are
# positional (_c11 = SAB, _c12 = TTY, _c16 = SUPPRESS).
conso = spark.read.csv("/resources/RXNCONSO.RRF", sep = "|", header=False, inferSchema=True)

# Keep the non-suppressed ingredient-level (IN) rows contributed by the DRUGBANK source
concept_code_to_db = conso.filter(
    (conso["_c11"] == "DRUGBANK") & (conso["_c12"] == "IN") & (conso["_c16"] == "N")
)

# Identify columns where all values are null, using ONE aggregation job.
# F.count(column) ignores nulls, so a count of 0 means the column is entirely null in this
# subset. (Filtering and counting each column separately launches one Spark job per column.)
non_null_counts = concept_code_to_db.agg(
    *[F.count(col(col_name)).alias(col_name) for col_name in concept_code_to_db.columns]
).first()
non_null_columns = [
    col_name for col_name in concept_code_to_db.columns if non_null_counts[col_name] > 0
]

# Select only the columns that are not entirely null
concept_code_to_db = concept_code_to_db.select(*non_null_columns)

print(f"Number of DrugBank identifier: {concept_code_to_db.count()}")
concept_code_to_db.show(3, truncate = False, vertical = False)


Number of DrugBank identifier: 10715
+---+---+--------+-------+--------+----+-------+------------------------------+----+
|_c0|_c1|_c7     |_c9    |_c11    |_c12|_c13   |_c14                          |_c16|
+---+---+--------+-------+--------+----+-------+------------------------------+----+
|48 |ENG|12855631|DB18029|DRUGBANK|IN  |DB18029|24,25-Dihydroxycholecalciferol|N   |
|60 |ENG|10894803|DB01509|DRUGBANK|IN  |DB01509|Tenamfetamine                 |N   |
|74 |ENG|9733930 |DB02362|DRUGBANK|IN  |DB02362|Aminobenzoic acid             |N   |
+---+---+--------+-------+--------+----+-------+------------------------------+----+
only showing top 3 rows



In [22]:
# Attach DrugBank rows to every dose-era ingredient by matching the OMOP concept_code
# against the first RXNCONSO column (_c0).
code_matches_drugbank = dci["concept_code"] == concept_code_to_db["_c0"]
dci4 = dci.join(concept_code_to_db, on=code_matches_drugbank, how="inner")

print(dci4.count())
dci4.show(n=3, truncate=False, vertical=False)

552
+---------------+----------+------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+---+---+--------+-------+--------+----+-------+------------------+----+
|drug_concept_id|concept_id|concept_name      |domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|_c0|_c1|_c7     |_c9    |_c11    |_c12|_c13   |_c14              |_c16|
+---------------+----------+------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+---+---+--------+-------+--------+----+-------+------------------+----+
|929549         |929549    |acetic acid       |Drug     |RxNorm       |Ingredient      |S               |168         |1970-01-01      |2099-12-31    |null          |168|ENG|8393346 |DB03166|DRUGBANK|IN  |DB03166|Acetic acid       |N   |
|1154343        |1154343   |albuterol         |D

In [24]:
# Row-level and ingredient-level coverage of the DrugBank mapping.
# The inner join repeats a df row once per matching DrugBank row, so "entries finished" can
# differ from the anti-join based "Records with DrugBank codes" count further down.
print(f"Number of entries original: {df.count()}")
print(f"Number of entries finished: {df.join(dci4, on = 'drug_concept_id', how = 'inner').count()}")
print(f"Number of distinct finished: {df.join(dci4, on = 'drug_concept_id', how = 'inner').select('drug_concept_id').distinct().count()}")

# Ingredients in dci that received no DrugBank row in dci4
drugbank_missing = dci.join(dci4, on = 'drug_concept_id', how = 'left_anti')

print(f"Missing DrugBank ID: {drugbank_missing.count()}")
# Rows of df whose ingredient is NOT among the ones missing a DrugBank identifier
print(f"Records with DrugBank codes: {df.join(drugbank_missing, on = 'drug_concept_id', how = 'left_anti').count()}")


# Toggle and row limit for listing the unmapped ingredients
SHOW_MISSING = True
N_SHOW = 10

if SHOW_MISSING:
    # DataFrame.show() only prints and returns None, so its result is not stored.
    drugbank_missing.show(N_SHOW)

Number of entries original: 34321
Number of entries finished: 31022
Number of distinct finished: 551
Missing DrugBank ID: 67
Records with DrugBank codes: 31018
+---------------+----------+--------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|drug_concept_id|concept_id|        concept_name|domain_id|vocabulary_id|concept_class_id|standard_concept|concept_code|valid_start_date|valid_end_date|invalid_reason|
+---------------+----------+--------------------+---------+-------------+----------------+----------------+------------+----------------+--------------+--------------+
|         528323|    528323|hepatitis B surfa...|     Drug|       RxNorm|      Ingredient|               S|      797752|      2008-06-29|    2099-12-31|          null|
|         529660|    529660|hepatitis A vacci...|     Drug|       RxNorm|      Ingredient|               S|      798361|      2008-06-29|    2099-12-31|          null|


In [25]:
# Collect the DrugBank mapping to the driver as a pandas frame and write it as a
# tab-separated file. No index= argument is passed, so pandas' default index handling applies.
dci4.toPandas().to_csv(
    "dose_era_drugbank.tsv",
    sep="\t",
)

In [5]:
###
### Chebi / OMOP2OBO
###

# Load the ChEBI sheet of the OMOP2OBO drug-exposure workbook with pandas, force the
# ONTOLOGY_LOGIC column to strings (missing values become the text "nan"), then hand the
# frame to Spark.
# NOTE(review): the cast is presumably there so Spark sees a single type in that column - confirm.
chebi = spark.createDataFrame(
    pd.read_excel(
        "OMOP2OBO_V1_Drug_Exposure_Mapping_Oct2020.xlsx",
        sheet_name="OMOP2OBO_ChEBI_Mapping_Results",
    ).assign(ONTOLOGY_LOGIC=lambda sheet: sheet["ONTOLOGY_LOGIC"].astype(str))
)

  for column, series in pdf.iteritems():


In [11]:
# Map every distinct drug concept in df to its OMOP2OBO ChEBI row(s) via the OMOP concept id.
# NOTE(review): this rebinds `dci` to the bare list of distinct ids, whereas the RxNorm
# section defines `dci` as those ids joined to rx_ing. Re-running an earlier cell that reads
# dci["concept_code"] after this one will fail; consider a separate name.
dci = df.select("drug_concept_id").distinct()

concept_id_matches = dci["drug_concept_id"] == chebi["CONCEPT_ID"]
dci5 = dci.join(chebi, on=concept_id_matches, how="inner")

print(dci5.count())
dci5.show()

618
+---------------+----------+------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|drug_concept_id|CONCEPT_ID|CONCEPT_CODE|        CONCEPT_NAME|ONTOLOGY_LOGIC|        ONTOLOGY_URI|      ONTOLOGY_LABEL|    MAPPING_CATEGORY|    MAPPING_EVIDENCE|
+---------------+----------+------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|         528323|    528323|    797752.0|Hepatitis B Surfa...|           nan|         CHEBI_60816|           immunogen|Manual One-to-One...|Source:DrugBank_D...|
|         529660|    529660|    798361.0|Hepatitis A Vacci...|           nan|         CHEBI_60816|           immunogen|Manual One-to-One...|Source:DrugBank_D...|
|         700299|    700299|     10502.0|        Thioridazine|            OR|CHEBI_48566 | CHE...|thioridazine hydr...|Automatic One-to-...|OBO_DbXref-OMOP_A...|
|         703547|    703

In [7]:
# Preview the loaded OMOP table (show()'s defaults, spelled out: 20 rows, long values truncated).
df.show(n=20, truncate=True)

+-------------+---------------+---------------+----------+-------------------+-----------------+
|  dose_era_id|drug_concept_id|unit_concept_id|dose_value|dose_era_start_date|dose_era_end_date|
+-------------+---------------+---------------+----------+-------------------+-----------------+
|1425929142274|        1549786|             -1|        -1|         25/02/1998|       26/03/1998|
| 764504178721|        1589505|             -1|        -1|         16/07/1999|       14/08/1999|
|1460288880666|        1549786|             -1|        -1|         04/11/1996|       03/12/1996|
| 841813590107|        1589505|             -1|        -1|         21/12/1999|       19/01/2000|
| 738734375019|        1549786|             -1|        -1|         21/12/1999|       19/01/2000|
| 137438953635|        1549786|             -1|        -1|         21/12/1994|       19/01/1995|
|1340029796492|        1589505|             -1|        -1|         25/01/1994|       23/02/1994|
|1571958030456|        1589505