In [1]:
import pyspark
import dxpy
import dxdata
import pandas as pd
from pyspark.sql.functions import col, to_date

In [2]:
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

spark.sql("USE " + dispensed_database_name)

dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [3]:
# Init pyspark and load dataframe
# Load the `omop_drug_era` and `omop_drug_exposure` tables
OMOP_TABLE_ERA = "omop_drug_era"
OMOP_TABLE_EXPOSURE = "omop_drug_exposure"

# Load both tables
ode_era = dataset[OMOP_TABLE_ERA]
ode_exposure = dataset[OMOP_TABLE_EXPOSURE]

# Extract all field names from both tables
field_names_era = [f.name for f in ode_era.fields]
field_names_exposure = [f.name for f in ode_exposure.fields]

# Retrieve the fields for both tables
df_era = ode_era.retrieve_fields(names=field_names_era, engine=dxdata.connect())
df_exposure = ode_exposure.retrieve_fields(
    names=field_names_exposure, engine=dxdata.connect()
)

# df = df.drop("eid")
print(df_era.columns)
print(df_exposure.columns)
print(f"Number of entries era {df_era.count()}")
print(f"Number of entries exposure {df_exposure.count()}")
df_era.drop("eid").show(3)
df_exposure.drop("eid").show(3)

['eid', 'drug_era_id', 'drug_concept_id', 'drug_era_start_date', 'drug_era_end_date', 'drug_exposure_count', 'gap_days']
['eid', 'drug_exposure_id', 'drug_concept_id', 'drug_exposure_start_date', 'drug_exposure_start_datetime', 'drug_exposure_end_date', 'drug_exposure_end_datetime', 'verbatim_end_date', 'drug_type_concept_id', 'stop_reason', 'refills', 'quantity', 'days_supply', 'sig', 'route_concept_id', 'lot_number', 'provider_id', 'visit_occurrence_id', 'visit_detail_id', 'drug_source_value', 'drug_source_concept_id', 'route_source_value', 'dose_unit_source_value']
Number of entries era 19959413
Number of entries exposure 55091531
+-------------+---------------+-------------------+-----------------+-------------------+--------+
|  drug_era_id|drug_concept_id|drug_era_start_date|drug_era_end_date|drug_exposure_count|gap_days|
+-------------+---------------+-------------------+-----------------+-------------------+--------+
|1236950609195|       19005129|         12/05/2014|       10/

In [5]:
# Load the `filtered_drug_atc.tsv` file as a Pandas DataFrame
drug_mapping_df = pd.read_csv("filtered_drug_atc.tsv", sep="\t")
# Extract `drug_concept_id` values as a list
drug_concept_ids = drug_mapping_df["drug_concept_id"].dropna().unique().tolist()
drug_concept_ids

[1154029, 1103640, 991876, 1110410]

In [6]:
# Filter the OMOP dataset for relevant drug_concept_id
filtered_df_era = df_era.filter(col("drug_concept_id").isin(drug_concept_ids))
# Format date columns
filtered_df_era = filtered_df_era.withColumn(
    "drug_era_start_date", to_date(col("drug_era_start_date"), "MM/dd/yyyy")
).withColumn("drug_era_end_date", to_date(col("drug_era_end_date"), "MM/dd/yyyy"))

# Show the result of the filter
filtered_df_era.drop("eid").show()
filtered_df_era.count()

+-------------+---------------+-------------------+-----------------+-------------------+--------+
|  drug_era_id|drug_concept_id|drug_era_start_date|drug_era_end_date|drug_exposure_count|gap_days|
+-------------+---------------+-------------------+-----------------+-------------------+--------+
| 730144538282|        1110410|               null|             null|                  2|       0|
|1176821107656|         991876|               null|       2014-08-08|                  1|       0|
|  17179961333|         991876|               null|             null|                  1|       0|
|1400159383219|         991876|         2011-03-06|             null|                  2|       0|
|1700807107514|         991876|         2007-07-11|       2007-12-11|                  1|       0|
|1065151963004|         991876|         2015-11-03|             null|                  1|       0|
|1597727861389|        1110410|               null|             null|                  1|       0|
|163208757

72395

In [7]:
# Filter the OMOP drug exposure dataset for relevant drug_concept_id
filtered_df_exposure = df_exposure.filter(col("drug_concept_id").isin(drug_concept_ids))
# Define the relevant columns for EDA
relevant_columns = [
    "drug_concept_id",
    "drug_exposure_start_date",
    "drug_exposure_end_date",
    "stop_reason",
    "refills",
    "days_supply",  # you can keep this to analyze duration or adherence
    "quantity",  # optional, can help to understand quantity per exposure
]

# Filter out the columns that are relevant for the EDA
filtered_df_exposure = filtered_df_exposure.select(relevant_columns)

# Format the date columns (if necessary)
filtered_df_exposure = filtered_df_exposure.withColumn(
    "drug_exposure_start_date", to_date(col("drug_exposure_start_date"), "MM/dd/yyyy")
).withColumn(
    "drug_exposure_end_date", to_date(col("drug_exposure_end_date"), "MM/dd/yyyy")
)
# Show the result of the filter
filtered_df_exposure.drop("eid").show()
filtered_df_exposure.count()

+---------------+------------------------+----------------------+-----------+-------+-----------+--------+
|drug_concept_id|drug_exposure_start_date|drug_exposure_end_date|stop_reason|refills|days_supply|quantity|
+---------------+------------------------+----------------------+-----------+-------+-----------+--------+
|        1110410|                    null|                  null|       null|   null|       null|       1|
|        1110410|                    null|                  null|       null|   null|       null|       1|
|        1110410|              2014-08-04|            2014-07-05|       null|   null|       null|    null|
|        1110410|              2002-05-09|            2002-04-10|       null|   null|       null|    null|
|        1110410|                    null|                  null|       null|   null|       null|       1|
|        1110410|                    null|                  null|       null|   null|       null|       1|
|         991876|                    

22

In [16]:
# Convert the filtered Spark DataFrame to a Pandas DataFrame
pandas_df_era = filtered_df_era.toPandas()
pandas_df_era.loc[:, "drug_concept_id"] = pandas_df_era.loc[
    :, "drug_concept_id"
].astype(int)
# Merge pandas_df_era with drug_mapping_df on `drug_concept_id`
pandas_df_era = pandas_df_era.merge(
    drug_mapping_df[["drug_concept_id", "concept_name"]],  # Only keep necessary columns
    on="drug_concept_id",
    how="left",
)
# Save the Pandas DataFrame to a CSV file
pandas_df_era.to_csv("filtered_data_era.csv", index=False)

pandas_df_exposure = filtered_df_exposure.toPandas()
pandas_df_exposure.loc[:, "drug_concept_id"] = pandas_df_exposure.loc[
    :, "drug_concept_id"
].astype(int)
pandas_df_exposure = pandas_df_exposure.merge(
    drug_mapping_df[["drug_concept_id", "concept_name"]],  # Only keep necessary columns
    on="drug_concept_id",
    how="left",
)
# Save the Pandas DataFrame to a CSV file
pandas_df_exposure.to_csv("filtered_data_exposure.csv", index=False)
print(pandas_df_era.drop("eid", axis=1))
print(pandas_df_exposure)

  pandas_df_era.loc[:, "drug_concept_id"] = pandas_df_era.loc[


         drug_era_id  drug_concept_id drug_era_start_date drug_era_end_date  \
0       730144538282          1110410                None              None   
1      1176821107656           991876                None        2014-08-08   
2        17179961333           991876                None              None   
3      1400159383219           991876          2011-03-06              None   
4      1700807107514           991876          2007-07-11        2007-12-11   
...              ...              ...                 ...               ...   
75517   601295453580           991876                None        2010-12-01   
75518   618475340648           991876                None              None   
75519  1554778227782           991876                None              None   
75520  1571958032822           991876          2002-05-12        2003-10-06   
75521   438086665084          1110410          2015-03-02        2015-05-12   

      drug_exposure_count gap_days concept_name  
0

  pandas_df_exposure.loc[:, "drug_concept_id"] = pandas_df_exposure.loc[
