In [None]:
import pyspark
import dxpy
import dxdata
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.functions import countDistinct

# load the dataset

In [None]:
# Reuse the active SparkContext when one already exists so this cell can be
# re-run; constructing a second SparkContext in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# Find the database name and the dataset id by glob pattern in the root folder.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

# Select that database for subsequent Spark SQL statements.
spark.sql("USE " + dispensed_database_name)

dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [None]:
# Table to analyse: 'omop_drug_era' or 'omop_drug_exposure'
OMOP_TABLE = "omop_drug_era"

# Look up the selected table in the dataset
ode = dataset[OMOP_TABLE]

# Retrieve every field of the table through a dxdata engine connection
field_names = [field.name for field in ode.fields]
engine = dxdata.connect()
df = ode.retrieve_fields(names=field_names, engine=engine)

print(f"Number of entries {df.count()}")
df.show(5)

In [None]:
# Load the ATC annotation table and rename its "_c13" column to "atc_code".
drug_era_atc_pandas = pd.read_csv("drug_era_atc.tsv", sep="\t").rename(
    columns={"_c13": "atc_code"}
)

In [None]:
# Keep only the lookup columns that are used and drop repeated rows, so the
# join below cannot duplicate drug-era rows (repeated lookup entries) or hit
# ambiguous column names (extra TSV columns that also exist in `df`).
drug_era_atc_df = (
    spark.createDataFrame(drug_era_atc_pandas)
    .select("drug_concept_id", "concept_name", "atc_code")
    .dropDuplicates()
)

# Left join: rows whose drug has no lookup entry are kept (with null
# concept_name / atc_code); a drug with several distinct ATC codes yields one
# row per code.
df_with_atc = df.join(drug_era_atc_df, on="drug_concept_id", how="left")

# Original columns followed by the concept name and the ATC code
df_with_atc = df_with_atc.select(df.columns + ["concept_name", "atc_code"])
print(f"Number of entries {df_with_atc.count()}")
df_with_atc.show(5)

# Filter drugs by number of times taken

In [None]:
# Load the per-drug counts from CSV and convert them to a Spark DataFrame so
# they can be joined with the Spark data. The filter cell that follows reads
# the "drug_concept_id" and "times_taken" columns from this table.
drug_times_df_pandas = pd.read_csv("drug_times_count.csv")
drug_times_df = spark.createDataFrame(drug_times_df_pandas)
drug_times_df.show(5)

In [None]:
# Attach each drug's "times_taken" value to every row
df_combined = df_with_atc.join(drug_times_df, on="drug_concept_id", how="left")

# Keep rows whose drug has 313 <= times_taken <= 4012 (both bounds inclusive),
# then discard the helper column again.
times_taken_col = df_combined["times_taken"]
within_times_range = (times_taken_col >= 313) & (times_taken_col <= 4012)
filtered_df_1 = df_combined.filter(within_times_range).drop("times_taken")

print(f"Number of entries {filtered_df_1.count()}")
filtered_df_1.show(5)

In [None]:
# Number of distinct drug_concept_id values left after the times_taken filter
unique_drug_concepts = filtered_df_1.select("drug_concept_id").distinct().count()
unique_drug_concepts

# Filter drugs by number of unique people

In [None]:
# Load the per-drug people counts from CSV and convert them to a Spark
# DataFrame for joining. The filter cell that follows reads the
# "drug_concept_id" and "unique_people_count" columns from this table.
drug_people_df_pandas = pd.read_csv("drug_people_count.csv")
drug_people_df = spark.createDataFrame(drug_people_df_pandas)
drug_people_df.show(5)

In [None]:
# Attach each drug's "unique_people_count" value to every row
df_combined_2 = filtered_df_1.join(drug_people_df, on="drug_concept_id", how="left")

# Keep rows whose drug has 121 <= unique_people_count <= 1411 (inclusive),
# then discard the helper column again.
people_count_col = df_combined_2["unique_people_count"]
within_people_range = (people_count_col >= 121) & (people_count_col <= 1411)
filtered_df_2 = df_combined_2.filter(within_people_range).drop("unique_people_count")

print(f"Number of entries {filtered_df_2.count()}")
filtered_df_2.show(5)

In [None]:
# Number of distinct drug_concept_id values left after the people-count filter
unique_drug_concepts = filtered_df_2.select("drug_concept_id").distinct().count()
unique_drug_concepts

# filter through ATC Codes

In [None]:
# ATC code prefixes to keep
atc_codes_of_interest = ["A10", "J01", "L01", "N05", "N06", "S01"]

# One start-anchored alternative per prefix, i.e. "^A10|^J01|..."
atc_prefix_pattern = "|".join("^" + code for code in atc_codes_of_interest)

# Keep rows whose ATC code starts with one of the prefixes
matches_prefix = filtered_df_2["atc_code"].rlike(atc_prefix_pattern)
filtered_df_3 = filtered_df_2.filter(matches_prefix)

print(f"Number of entries {filtered_df_3.count()}")
filtered_df_3.show(5)

In [None]:
# Number of distinct drug_concept_id values left after the ATC prefix filter
unique_drug_concepts = filtered_df_3.select("drug_concept_id").distinct().count()
unique_drug_concepts

In [None]:
# Distinct (drug_concept_id, atc_code) pairs among the remaining rows
filtered_df_3_atc = filtered_df_3.select("drug_concept_id", "atc_code").dropDuplicates()
filtered_df_3_atc.show(5)

In [None]:
# Convert the distinct pairs to a pandas DataFrame; the sampling cell that
# follows works on this pandas copy.
filtered_df_3_atc_pandas = filtered_df_3_atc.toPandas()

In [None]:
# Map each ATC prefix to the list of drug_concept_ids sampled for it
sampled_drugs = {}

# Sample up to 3 distinct drug_concept_ids for each ATC prefix
for atc_code in atc_codes_of_interest:
    # Rows whose ATC code starts with the current prefix; na=False treats a
    # missing ATC code as "no match" instead of producing an invalid mask.
    filtered = filtered_df_3_atc_pandas[
        filtered_df_3_atc_pandas["atc_code"].str.startswith(atc_code, na=False)
    ]

    # A drug appears once per matching ATC code, so de-duplicate to sample
    # distinct drugs. Sort as well: the row order returned by Spark is not
    # guaranteed, and random_state only gives a repeatable sample when the
    # input order is stable.
    candidates = filtered["drug_concept_id"].drop_duplicates().sort_values()

    # Sample 3 drug_concept_ids (or all of them if fewer than 3 are available)
    sampled = candidates.sample(n=min(3, len(candidates)), random_state=42)

    # Store the sampled drug_concept_ids in the dictionary
    sampled_drugs[atc_code] = sampled.tolist()

# Print the results
for atc_code, drugs in sampled_drugs.items():
    print(f"ATC code {atc_code}: {drugs}")

# One row per (prefix, sampled drug). Despite the variable name, this holds
# fewer than 18 rows whenever a prefix has fewer than 3 candidate drugs.
the_18_sampled_drug_concept_id = pd.DataFrame(
    [(atc, drug) for atc, drugs in sampled_drugs.items() for drug in drugs],
    columns=["atc_code", "drug_concept_id"],
)

# Display the new dataframe
print("\nthe_18_sampled_drug_concept_id:")
print(the_18_sampled_drug_concept_id)

In [None]:
# Restrict the Spark data to the sampled drugs
sampled_ids = the_18_sampled_drug_concept_id["drug_concept_id"].tolist()
is_sampled_drug = filtered_df_3["drug_concept_id"].isin(sampled_ids)
filtered_df_4 = filtered_df_3.filter(is_sampled_drug)

print(f"Number of entries {filtered_df_4.count()}")
filtered_df_4.show(5)

In [None]:
# Number of distinct drug_concept_id values left after restricting to the sample
unique_drug_concepts = filtered_df_4.select("drug_concept_id").distinct().count()
unique_drug_concepts

In [None]:
# Convert the sampled subset to pandas and write it to CSV (without the index);
# the pandas-only section further down reloads this file.
filtered_df_4_pandas = filtered_df_4.toPandas()
filtered_df_4_pandas.to_csv("filtered_df_4.csv", index=False)

# Drug variety per individual

In [None]:
# For every person (eid), count how many different drugs appear in their rows
distinct_drug_count = countDistinct("drug_concept_id").alias("distinct_drugs_taken")
people_drug_variety_unsorted = filtered_df_4.groupBy("eid").agg(distinct_drug_count)

# Largest drug variety first
people_drug_variety = people_drug_variety_unsorted.orderBy(
    "distinct_drugs_taken", ascending=False
)
people_drug_variety.show()

In [None]:
# Convert the per-person counts to pandas and summarise their distribution
people_drug_variety_pandas = people_drug_variety.toPandas()
drug_variety_column = people_drug_variety_pandas["distinct_drugs_taken"]
statistics_people_drug_variety = drug_variety_column.describe()
print(statistics_people_drug_variety)

In [None]:
# Histogram of the number of distinct drugs per person, with a log-scaled y axis
drugs_per_person = people_drug_variety_pandas["distinct_drugs_taken"]
hist_style = {"bins": 200, "color": "blue", "edgecolor": "black"}

plt.figure(figsize=(10, 6))
plt.hist(drugs_per_person, **hist_style)
plt.xlabel("Number of Drug Types per Individual", fontsize=12)
plt.ylabel("Frequency of Individuals", fontsize=12)
plt.title("Distribution of Drug Types Taken by Individuals", fontsize=14)
plt.grid(True)
plt.yscale("log")
plt.show()

# check on the focused dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Reload the sampled subset from the CSV written earlier.
# NOTE: from this cell on, `filtered_df_4` is a pandas DataFrame; the earlier
# cells bound the same name to a Spark DataFrame, so Spark-style calls on it
# (e.g. .show(), .count() as a row count) no longer apply.
filtered_df_4 = pd.read_csv("filtered_df_4.csv")
filtered_df_4.head()

In [None]:
# Collapse rows that differ only in 'atc_code' into a single row, joining the
# distinct ATC codes into one sorted, comma-separated string.
# dropna=False keeps rows that have a missing value in any grouping column;
# by default pandas groupby silently drops those rows.
group_columns = [col for col in filtered_df_4.columns if col != "atc_code"]
filtered_df_4_grouped = (
    filtered_df_4.groupby(group_columns, dropna=False)
    .agg({"atc_code": lambda x: ", ".join(sorted(set(x)))})
    .reset_index()
)

print("Shape before grouping:", filtered_df_4.shape)
print("Shape after grouping:", filtered_df_4_grouped.shape)

filtered_df_4_grouped.head()

In [None]:
# Save the grouped dataframe to CSV, omitting the index column
filtered_df_4_grouped.to_csv("filtered_df_4_grouped.csv", index=False)

In [None]:
# Number of grouped rows per drug, as a two-column table
rows_per_drug = filtered_df_4_grouped["drug_concept_id"].value_counts()
drug_counts = rows_per_drug.reset_index()
drug_counts.columns = ["drug_concept_id", "count"]

# Most frequent drug first
drug_counts = drug_counts.sort_values("count", ascending=False)

# Name and ATC code(s) of each drug
info_columns = ["drug_concept_id", "concept_name", "atc_code"]
drug_info = filtered_df_4_grouped[info_columns].drop_duplicates()

# Attach the drug details and put the count last
output_columns = ["drug_concept_id", "concept_name", "atc_code", "count"]
drug_counts = drug_counts.merge(drug_info, on="drug_concept_id", how="left")[
    output_columns
]

print("Number of unique drugs:", len(drug_counts))
print(drug_counts)

In [None]:
# Histogram of the per-drug row counts
counts_per_drug = drug_counts["count"]
hist_style = {"bins": 100, "color": "blue", "edgecolor": "black"}

plt.figure(figsize=(10, 6))
plt.hist(counts_per_drug, **hist_style)
plt.xlabel("Number of Times a Drug is Taken")
plt.ylabel("Frequency")
plt.title("Distribution of Drug Counts")
plt.grid(True)
plt.show()

In [None]:
# Drugs 785788 and 705755 are treated as outliers and left out
outlier_drug_ids = [785788, 705755]
is_outlier = filtered_df_4_grouped["drug_concept_id"].isin(outlier_drug_ids)
filtered_df_5 = filtered_df_4_grouped[~is_outlier]
filtered_df_5.head()

In [None]:
# Save the outlier-free dataframe to CSV, omitting the index column
filtered_df_5.to_csv("filtered_df_5.csv", index=False)

In [None]:
# Report how many rows are in the dataframe. filtered_df_5 is a pandas
# DataFrame, whose .count() returns per-column non-null counts rather than a
# single row count (unlike Spark), so use len() instead.
print(f"Number of entries {len(filtered_df_5)}")

In [None]:
# Number of rows per drug after removing the outliers, as a two-column table
rows_per_drug = filtered_df_5["drug_concept_id"].value_counts()
drug_counts = rows_per_drug.reset_index()
drug_counts.columns = ["drug_concept_id", "count"]

# Most frequent drug first
drug_counts = drug_counts.sort_values("count", ascending=False)

# Name and ATC code(s) of each drug
info_columns = ["drug_concept_id", "concept_name", "atc_code"]
drug_info = filtered_df_5[info_columns].drop_duplicates()

# Attach the drug details and put the count last
output_columns = ["drug_concept_id", "concept_name", "atc_code", "count"]
drug_counts = drug_counts.merge(drug_info, on="drug_concept_id", how="left")[
    output_columns
]

print("Number of unique drugs:", len(drug_counts))
print(drug_counts)

In [None]:
# Summary statistics of the per-drug row counts
print(drug_counts["count"].describe())

In [None]:
# Histogram of the per-drug row counts after removing the outliers
counts_per_drug = drug_counts["count"]

plt.figure(figsize=(12, 6))
plt.hist(counts_per_drug, bins=30, edgecolor="black")
plt.xlabel("Number of Intakes", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title("Frequency Distribution of Drug Intake Counts", fontsize=14)
plt.grid(True)
plt.show()

In [None]:
# Number of distinct eid values (people) in the outlier-free dataframe
unique_people = filtered_df_5["eid"].nunique()
print(f"Number of unique people: {unique_people}")

In [None]:
# For every drug, count how many distinct people (eid) have taken it
people_per_drug = filtered_df_5.groupby("drug_concept_id")["eid"].nunique()
drug_people_counts = people_per_drug.reset_index()
drug_people_counts.columns = ["drug_concept_id", "unique_people_count"]

# Name and ATC code(s) of each drug
info_columns = ["drug_concept_id", "concept_name", "atc_code"]
drug_info = filtered_df_5[info_columns].drop_duplicates()

# Attach the drug details, put the most widely taken drug first, count last
report_columns = ["drug_concept_id", "concept_name", "atc_code", "unique_people_count"]
drug_people_counts = drug_people_counts.merge(
    drug_info, on="drug_concept_id", how="left"
).sort_values("unique_people_count", ascending=False)[report_columns]

print("Distribution of how many people take the same drug:")
print(drug_people_counts)

# Histogram of the number of people per drug
people_counts = drug_people_counts["unique_people_count"]
plt.figure(figsize=(12, 6))
plt.hist(people_counts, bins=30, edgecolor="black")
plt.xlabel("Number of People", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title("Distribution of Number of People Taking Each Drug", fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()

# Summary statistics of the same column
print("\nStatistics:")
print(drug_people_counts["unique_people_count"].describe())

In [None]:
# For every person (eid), count how many distinct drugs they take.
# NOTE: this rebinds `drug_people_counts`, which the previous cell used for the
# per-drug people counts.
drugs_per_person = filtered_df_5.groupby("eid")["drug_concept_id"].nunique()
drug_people_counts = drugs_per_person.reset_index()
drug_people_counts.columns = ["eid", "unique_drugs_count"]

# Histogram of the per-person drug counts, with a log-scaled y axis
unique_drugs_count = drug_people_counts["unique_drugs_count"]
plt.figure(figsize=(12, 6))
plt.hist(unique_drugs_count, bins=30, edgecolor="black")
plt.xlabel("Number of Unique Drugs", fontsize=12)
plt.ylabel("Number of People", fontsize=12)
plt.title("Distribution of Number of Unique Drugs Taken per Person", fontsize=14)
plt.yscale("log")
plt.grid(True, alpha=0.3)
plt.show()

# Summary statistics of the same column
print("\nStatistics:")
print(drug_people_counts["unique_drugs_count"].describe())

# How many people fall on each distinct drug count, ordered by drug count
print("\nNumber of people taking each number of drugs:")
print(drug_people_counts["unique_drugs_count"].value_counts().sort_index())