In [5]:
import pandas as pd
import numpy as np

In [38]:
# Load the tab-separated drug era table.
df = pd.read_csv("../../dataset/dataset.tsv", sep="\t")
# Preview the first rows with the "eid" column left out of the display.
display(df.head().loc[:, df.columns != "eid"])

Unnamed: 0,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days
0,1236950609195,19005129,12/05/2014,10/06/2014,1,0
1,798863919105,743670,30/03/2011,02/09/2011,5,12
2,721554547993,19008994,12/10/2010,07/11/2010,1,0
3,910533073010,755695,23/10/2006,04/10/2007,6,23
4,317827646206,19010400,30/07/1999,28/08/1999,1,0


In [33]:
def process_drug_switches(df: pd.DataFrame) -> pd.DataFrame:
    """
    Pair every drug era with the next era that starts after it ends.

    Method:
        For each eid, the eras are ordered by start date. For each drug era A,
        era B is the era whose start date is strictly later than, and closest
        to, the end date of A. When several candidates share that start date,
        the one that appears first in ``df`` is used. Eras without such a
        successor, and patients with fewer than two eras, produce no row.

    Args:
        df (pd.DataFrame): Input dataframe containing drug era information.
            Required columns: eid, drug_era_id, drug_concept_id,
            drug_era_start_date, drug_era_end_date, drug_exposure_count,
            gap_days. The date columns may already be datetimes or may be
            ``DD/MM/YYYY`` strings. The dataframe is not modified.

    Returns:
        pd.DataFrame: One row per (A, B) pair. Columns are ``eid`` followed by
        the six era columns prefixed with ``A_`` and then the same six columns
        prefixed with ``B_``. Rows are ordered by eid and then by the start
        date of A. The columns are present even when no pair is found.

    Raises:
        KeyError: If any required column is missing from ``df``.
    """
    era_columns = [
        "drug_era_id",
        "drug_concept_id",
        "drug_era_start_date",
        "drug_era_end_date",
        "drug_exposure_count",
        "gap_days",
    ]
    required_columns = ["eid", *era_columns]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Work on a private, positionally indexed copy so that the caller's
    # dataframe keeps its original (possibly string) date columns.
    eras = df[required_columns].reset_index(drop=True)
    for date_col in ("drug_era_start_date", "drug_era_end_date"):
        if not pd.api.types.is_datetime64_any_dtype(eras[date_col]):
            eras[date_col] = pd.to_datetime(eras[date_col], format="%d/%m/%Y")

    end_dates = eras["drug_era_end_date"].values
    a_chunks = []  # row positions (in `eras`) of each era A that has a successor
    b_chunks = []  # row positions (in `eras`) of the matching era B

    # Process each patient's drug eras
    for _, patient_starts in eras.groupby("eid")["drug_era_start_date"]:
        n_rows = len(patient_starts)
        if n_rows < 2:  # Skip if patient has less than 2 drug eras
            continue

        # Stable sort: ties keep their input order and missing dates go last.
        patient_starts = patient_starts.sort_values(kind="mergesort")
        positions = patient_starts.index.to_numpy()
        starts = patient_starts.values
        ends = end_dates[positions]

        # Only eras with a known start date can serve as era B.
        n_dated = n_rows - int(np.isnat(starts).sum())

        # Index of the first start date strictly greater than each end date.
        successor = np.searchsorted(starts[:n_dated], ends, side="right")
        # B must also come after A in start-date order; this only matters for
        # malformed eras whose end date precedes their own start date.
        successor = np.maximum(successor, np.arange(1, n_rows + 1))
        has_successor = (successor < n_dated) & ~np.isnat(ends)

        a_chunks.append(positions[has_successor])
        b_chunks.append(positions[successor[has_successor]])

    if a_chunks:
        a_positions = np.concatenate(a_chunks)
        b_positions = np.concatenate(b_chunks)
    else:
        a_positions = b_positions = np.empty(0, dtype=np.intp)

    # Gather all A rows and all B rows in two bulk takes, then place them
    # side by side; both frames share the same 0..n-1 index.
    era_a = (
        eras.iloc[a_positions]
        .rename(columns={col: f"A_{col}" for col in era_columns})
        .reset_index(drop=True)
    )
    era_b = (
        eras.iloc[b_positions][era_columns]
        .add_prefix("B_")
        .reset_index(drop=True)
    )
    return pd.concat([era_a, era_b], axis=1)

In [35]:
# Build the table of (A, B) drug era pairs for every eid in the loaded data.
result_df = process_drug_switches(df)

In [41]:
# Show the first ten A/B pairs with the "eid" column left out of the display.
display(result_df.head(10).loc[:, result_df.columns != "eid"])

Unnamed: 0,A_drug_era_id,A_drug_concept_id,A_drug_era_start_date,A_drug_era_end_date,A_drug_exposure_count,A_gap_days,B_drug_era_id,B_drug_concept_id,B_drug_era_start_date,B_drug_era_end_date,B_drug_exposure_count,B_gap_days
0,1365799655232,1124300,2008-04-29,2008-06-09,1,0,240518240204,923645,2008-09-23,2008-10-20,1,0
1,240518240204,923645,2008-09-23,2008-10-20,1,0,532576013874,1769535,2009-04-15,2009-04-19,1,0
2,171798743567,1178663,2008-09-23,2008-10-13,1,0,532576013874,1769535,2009-04-15,2009-04-19,1,0
3,369367235575,836715,2008-09-23,2008-10-06,1,0,532576013874,1769535,2009-04-15,2009-04-19,1,0
4,1494648709321,1103314,2008-09-23,2008-10-09,1,0,532576013874,1769535,2009-04-15,2009-04-19,1,0
5,532576013874,1769535,2009-04-15,2009-04-19,1,0,17179963794,1746940,2009-07-06,2009-08-04,1,0
6,687194806236,1746940,2009-04-17,2009-05-16,1,0,17179963794,1746940,2009-07-06,2009-08-04,1,0
7,17179963794,1746940,2009-07-06,2009-08-04,1,0,798863950808,1178663,2009-11-13,2009-12-03,1,0
8,1632087663179,941258,2009-07-09,2009-08-07,1,0,798863950808,1178663,2009-11-13,2009-12-03,1,0
9,798863950808,1178663,2009-11-13,2009-12-03,1,0,1640677543345,1746940,2010-02-04,2010-03-05,1,0


In [32]:
# Rows that never appear in result_df:
#   - patients who have only one drug era, and
#   - drug eras that have no subsequent drug era to pair with.
#
# Printed, one per line: pair count, era count, distinct eids in the input,
# distinct eids that have at least one pair.
print(
    len(result_df),
    len(df),
    len(df["eid"].unique()),
    len(result_df["eid"].unique()),
    sep="\n",
)

19030085
19959413
281690
262036


In [14]:
# Count the A/B pairs in which both eras carry the same drug concept id.
same_drug_switches = result_df.loc[
    result_df["A_drug_concept_id"].eq(result_df["B_drug_concept_id"])
]
print(f"Number of switches between same drug: {len(same_drug_switches)}")
print(f"Percentage of total switches: {len(same_drug_switches) / len(result_df) * 100:.2f}%")

Number of switches between same drug: 3473775
Percentage of total switches: 18.25%
