In [1]:
import pyspark
import dxpy
import dxdata
import pandas as pd
from pyspark.sql.functions import col, to_date

In [2]:
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

spark.sql("USE " + dispensed_database_name)

dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [26]:
# Init pyspark and load dataframe
# Load the 'omop_drug_era' 'omop_drug_exposure' and 'omop_person' tables
omop_drug_era = "omop_drug_era"
omop_drug_exposure = "omop_drug_exposure"
omop_person = "omop_person"
# Load tables
drug_era = dataset[omop_drug_era]
drug_exposure = dataset[omop_drug_exposure]
person = dataset[omop_person]

# Extract all field names from the tables
field_names_era = [f.name for f in drug_era.fields]
field_names_exposure = [f.name for f in drug_exposure.fields]
field_names_person = [f.name for f in person.fields]

# Retrieve the fields for both tables
df_era = drug_era.retrieve_fields(names=field_names_era, engine=dxdata.connect())
df_exposure = drug_exposure.retrieve_fields(
    names=field_names_exposure, engine=dxdata.connect()
)
df_person = person.retrieve_fields(names=field_names_person, engine=dxdata.connect())

# df = df.drop("eid")
print(df_era.columns)
print(df_exposure.columns)
print(df_person.columns)

print(f"Number of entries era {df_era.count()}")
print(f"Number of entries exposure {df_exposure.count()}")
print(f"Number of entries person {df_person.count()}")
df_era.drop("eid").show(3)
df_exposure.drop("eid").show(3)
df_person.drop("eid").show(3)

['eid', 'drug_era_id', 'drug_concept_id', 'drug_era_start_date', 'drug_era_end_date', 'drug_exposure_count', 'gap_days']
['eid', 'drug_exposure_id', 'drug_concept_id', 'drug_exposure_start_date', 'drug_exposure_start_datetime', 'drug_exposure_end_date', 'drug_exposure_end_datetime', 'verbatim_end_date', 'drug_type_concept_id', 'stop_reason', 'refills', 'quantity', 'days_supply', 'sig', 'route_concept_id', 'lot_number', 'provider_id', 'visit_occurrence_id', 'visit_detail_id', 'drug_source_value', 'drug_source_concept_id', 'route_source_value', 'dose_unit_source_value']
['eid', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']
Number of entries era 19959413
Number of entries exposure 55

In [27]:
# Load the `filtered_drug_atc.tsv` file as a Pandas DataFrame
drug_mapping_df = pd.read_csv("filtered_drug_atc.tsv", sep="\t")
# Extract `drug_concept_id` values as a list
drug_concept_ids = drug_mapping_df["drug_concept_id"].dropna().unique().tolist()
drug_concept_ids

[1154029, 1103640, 991876, 1110410]

In [28]:
# Filter the OMOP dataset for relevant drug_concept_id
filtered_df_era = df_era.filter(col("drug_concept_id").isin(drug_concept_ids))
# Format date columns
filtered_df_era = filtered_df_era.withColumn(
    "drug_era_start_date", to_date(col("drug_era_start_date"), "MM/dd/yyyy")
).withColumn("drug_era_end_date", to_date(col("drug_era_end_date"), "MM/dd/yyyy"))

# Merge with person_df on the 'eid' column to keep information about the person
# Select only the required columns from df_person
df_person_selected = df_person.select("eid", "gender_concept_id", "year_of_birth")

# Perform a left join with filtered_df_era on the 'eid' column
filtered_df_era = filtered_df_era.join(
    df_person_selected,
    on="eid",
    how="left",  # Perform a left join
)
# Show the result of the filter
filtered_df_era.drop("eid").show()
filtered_df_era.count()

+-------------+---------------+-------------------+-----------------+-------------------+--------+-----------------+-------------+
|  drug_era_id|drug_concept_id|drug_era_start_date|drug_era_end_date|drug_exposure_count|gap_days|gender_concept_id|year_of_birth|
+-------------+---------------+-------------------+-----------------+-------------------+--------+-----------------+-------------+
| 730144538282|        1110410|               null|             null|                  2|       0|             8532|         1951|
|1176821107656|         991876|               null|       2014-08-08|                  1|       0|             8532|         1949|
|  17179961333|         991876|               null|             null|                  1|       0|             8507|         1954|
|1400159383219|         991876|         2011-03-06|             null|                  2|       0|             8532|         1959|
|1700807107514|         991876|         2007-07-11|       2007-12-11|              

72395

In [29]:
# Filter the OMOP drug exposure dataset for relevant drug_concept_id
filtered_df_exposure = df_exposure.filter(col("drug_concept_id").isin(drug_concept_ids))
# Define the relevant columns for EDA
relevant_columns = [
    "eid",
    "drug_concept_id",
    "drug_exposure_start_date",
    "drug_exposure_end_date",
    "stop_reason",
    "refills",
    "days_supply",  # you can keep this to analyze duration or adherence
    "quantity",  # optional, can help to understand quantity per exposure
]

# Filter out the columns that are relevant for the EDA
filtered_df_exposure = filtered_df_exposure.select(relevant_columns)

# Format the date columns (if necessary)
filtered_df_exposure = filtered_df_exposure.withColumn(
    "drug_exposure_start_date", to_date(col("drug_exposure_start_date"), "MM/dd/yyyy")
).withColumn(
    "drug_exposure_end_date", to_date(col("drug_exposure_end_date"), "MM/dd/yyyy")
)

# Merge with person_df on the 'eid' column to keep information about the person
# Perform a left join with filtered_df_exposure on the 'eid' column
filtered_df_exposure = filtered_df_exposure.join(
    df_person_selected,
    on="eid",
    how="left",  # Perform a left join
)
# Show the result of the filter
filtered_df_exposure.drop("eid").show()
filtered_df_exposure.count()

+---------------+------------------------+----------------------+-----------+-------+-----------+--------+-----------------+-------------+
|drug_concept_id|drug_exposure_start_date|drug_exposure_end_date|stop_reason|refills|days_supply|quantity|gender_concept_id|year_of_birth|
+---------------+------------------------+----------------------+-----------+-------+-----------+--------+-----------------+-------------+
|        1110410|                    null|                  null|       null|   null|       null|       1|             8507|         1966|
|        1110410|                    null|                  null|       null|   null|       null|       1|             8507|         1966|
|        1110410|              2014-08-04|            2014-07-05|       null|   null|       null|    null|             8532|         1949|
|        1110410|              2002-05-09|            2002-04-10|       null|   null|       null|    null|             8532|         1946|
|        1110410|          

22

In [33]:
# Convert the filtered Spark DataFrame to a Pandas DataFrame
pandas_df_era = filtered_df_era.toPandas()
pandas_df_era.loc[:, "drug_concept_id"] = pandas_df_era.loc[
    :, "drug_concept_id"
].astype(int)
# Merge pandas_df_era with drug_mapping_df on `drug_concept_id`
pandas_df_era = pandas_df_era.merge(
    drug_mapping_df[["drug_concept_id", "concept_name"]],  # Only keep necessary columns
    on="drug_concept_id",
    how="left",
)
# Rename the 'concept_name' column to 'drug'
pandas_df_era.rename(columns={"concept_name": "drug"}, inplace=True)
# Read the omop_concept.tsv table
omop_concept_df = pd.read_csv("omop_concept.tsv", sep="\t")
# Ensure both columns have the same data type
pandas_df_era.loc[:, "gender_concept_id"] = pandas_df_era.loc[
    :, "gender_concept_id"
].astype(int)

# Merge dataframes on the appropriate columns
pandas_df_era = pandas_df_era.merge(
    omop_concept_df[["concept_id", "concept_name"]],
    left_on="gender_concept_id",
    right_on="concept_id",
    how="left",
)
# Rename the 'concept_name' column to 'gender'
pandas_df_era.rename(columns={"concept_name": "gender"}, inplace=True)
# Drop the extra 'concept_id' column, if not needed
pandas_df_era.drop(columns="concept_id", inplace=True)

# Save the Pandas DataFrame to a CSV file
pandas_df_era.to_csv("filtered_data_era.csv", index=False)

  pandas_df_era.loc[:, "drug_concept_id"] = pandas_df_era.loc[
  omop_concept_df = pd.read_csv('omop_concept.tsv', sep='\t')
  pandas_df_era.loc[:, "gender_concept_id"] = pandas_df_era.loc[


In [34]:
pandas_df_exposure = filtered_df_exposure.toPandas()
pandas_df_exposure.loc[:, "drug_concept_id"] = pandas_df_exposure.loc[
    :, "drug_concept_id"
].astype(int)
pandas_df_exposure = pandas_df_exposure.merge(
    drug_mapping_df[["drug_concept_id", "concept_name"]],  # Only keep necessary columns
    on="drug_concept_id",
    how="left",
)
# Rename the 'concept_name' column to 'drug'
pandas_df_exposure.rename(columns={"concept_name": "drug"}, inplace=True)
# Ensure both columns have the same data type
pandas_df_exposure.loc[:, "gender_concept_id"] = pandas_df_exposure.loc[
    :, "gender_concept_id"
].astype(int)

# Merge dataframes on the appropriate columns
pandas_df_exposure = pandas_df_exposure.merge(
    omop_concept_df[["concept_id", "concept_name"]],
    left_on="gender_concept_id",
    right_on="concept_id",
    how="left",
)
# Rename the 'concept_name' column to 'gender'
pandas_df_exposure.rename(columns={"concept_name": "gender"}, inplace=True)
# Drop the extra 'concept_id' column, if not needed
pandas_df_exposure.drop(columns="concept_id", inplace=True)

# Save the Pandas DataFrame to a CSV file
pandas_df_exposure.to_csv("filtered_data_exposure.csv", index=False)
print(pandas_df_era.drop("eid", axis=1))
print(pandas_df_exposure.drop("eid", axis=1))

  pandas_df_exposure.loc[:, "drug_concept_id"] = pandas_df_exposure.loc[
  pandas_df_exposure.loc[:, "gender_concept_id"] = pandas_df_exposure.loc[


         drug_era_id  drug_concept_id drug_era_start_date drug_era_end_date  \
0       730144538282          1110410                None              None   
1      1176821107656           991876                None        2014-08-08   
2        17179961333           991876                None              None   
3      1400159383219           991876          2011-03-06              None   
4      1700807107514           991876          2007-07-11        2007-12-11   
...              ...              ...                 ...               ...   
75517   601295453580           991876                None        2010-12-01   
75518   618475340648           991876                None              None   
75519  1554778227782           991876                None              None   
75520  1571958032822           991876          2002-05-12        2003-10-06   
75521   438086665084          1110410          2015-03-02        2015-05-12   

      drug_exposure_count gap_days  gender_concept_