In [2]:
import pandas as pd

In [9]:
# Load the tab-separated drug-exposure table and preview its first rows.
df = pd.read_table("dataset_cleaned.tsv")
df.head(n=5)

Unnamed: 0,eid,drug_concept_id,duration,atc_code,atc_level3
0,1000014,710062,36,N06AA09,N06A
1,1000014,721724,31,N06AA10,N06A
2,1000014,723013,18,N05BA01,N05B
3,1000014,836715,14,N05CD07,N05C
4,1000014,915981,30,"A01AB08,A07AA01,B05CA09,D06AX04,J01GB05,R02AB0...","A01A,A07A,B05C,D06A,J01G,R02A,S01A,S02A,S03A"


### cleaning

In [10]:
# Patient-level outlier check: summary statistics of the number of distinct
# drugs (drug_concept_id) recorded for each patient (eid).
(
    df.groupby("eid")["drug_concept_id"]
    .nunique()
    .describe()
)

count    254724.000000
mean         19.246208
std          16.263877
min           1.000000
25%           6.000000
50%          16.000000
75%          28.000000
max         175.000000
Name: drug_concept_id, dtype: float64

In [19]:
# Keep only patients whose number of distinct drugs lies in [6, 51];
# patients with fewer than 6 or more than 51 kinds of drugs are dropped.
patient_drug_counts = df.groupby("eid")["drug_concept_id"].nunique()
# Series.between is inclusive on both ends, i.e. 6 <= count <= 51.
valid_patients = patient_drug_counts.loc[patient_drug_counts.between(6, 51)].index
df = df.loc[df["eid"].isin(valid_patients)]

In [20]:
# Number of rows remaining in df.
df.shape[0]

4003888

In [21]:
# Summary statistics of distinct drugs per patient for the current df.
(
    df.groupby("eid")["drug_concept_id"]
    .nunique()
    .describe()
)

count    183241.000000
mean         21.850394
std          11.304781
min           6.000000
25%          13.000000
50%          20.000000
75%          29.000000
max          51.000000
Name: drug_concept_id, dtype: float64

In [14]:
# Summary statistics of the duration column.
# NOTE(review): the saved output of this cell carries execution count 14, i.e.
# it was produced before the patient filter (execution count 19); its row count
# (about 4.9M) is larger than the 4,003,888 rows reported after filtering.
# Re-run this cell to refresh the numbers.
df.loc[:, "duration"].describe()

count    4.902471e+06
mean     3.406117e+02
std      8.318619e+02
min      1.000000e+00
25%      3.000000e+01
50%      4.200000e+01
75%      1.730000e+02
max      1.010700e+04
Name: duration, dtype: float64

In [22]:
# Drug-level view: for every drug_concept_id count the distinct patients (eid)
# that have it, then summarise that distribution.
(
    df.groupby("drug_concept_id")["eid"]
    .nunique()
    .describe()
)

count      1109.000000
mean       3610.358882
std       10940.378155
min           1.000000
25%           9.000000
50%         148.000000
75%        1465.000000
max      125375.000000
Name: eid, dtype: float64

In [51]:
# Keep only drugs taken by at least 1% and at most 30% of the 254724 patients;
# drugs outside that prevalence band are removed.
# NOTE(review): the previous comments on this cell said "5%" for the lower
# bound while the code used 0.01 (1%). The code's value is kept here so the
# result is unchanged — confirm which threshold was actually intended.
TOTAL_PATIENTS = 254724  # cohort size used as the denominator for both bounds
MIN_PATIENT_FRACTION = 0.01  # lower prevalence bound (1%)
MAX_PATIENT_FRACTION = 0.30  # upper prevalence bound (30%)

# Number of distinct patients (eid) per drug.
drug_patient_counts = df.groupby("drug_concept_id")["eid"].nunique()
# int() truncates the fractional patient counts toward zero.
min_patients = int(MIN_PATIENT_FRACTION * TOTAL_PATIENTS)
max_patients = int(MAX_PATIENT_FRACTION * TOTAL_PATIENTS)
# Series.between is inclusive on both ends: min_patients <= count <= max_patients.
valid_drugs = drug_patient_counts.loc[
    drug_patient_counts.between(min_patients, max_patients)
].index

In [52]:
# Number of drugs that passed the prevalence filter.
valid_drugs.shape[0]

216

In [53]:
# Keep only the rows whose drug is in valid_drugs, then write them out as TSV
# without the index column.
# NOTE(review): the output path is the same file the first cell reads, so a
# later run would start from the already-filtered data — confirm this is
# intended.
cleaned_df = df.loc[df["drug_concept_id"].isin(valid_drugs)]
cleaned_df.to_csv("dataset_cleaned.tsv", index=False, sep="\t")

In [64]:
# Distinct patients and distinct drugs left after cleaning.
# dropna=False counts a missing value as its own entry, matching len(unique()).
print(cleaned_df["eid"].nunique(dropna=False))
print(cleaned_df["drug_concept_id"].nunique(dropna=False))

183239
216


### handling

In [87]:
# Load the tab-separated file df_cleaned_1atc.tsv and preview its first rows.
df_cleaned = pd.read_table("df_cleaned_1atc.tsv")
df_cleaned.head(n=5)

Unnamed: 0,eid,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code,duration,atc_level3
0,6021257,1236950609195,19005129,2014-05-12,2014-06-10,1,0,clobetasone,D07AB01,30,D07A
1,3430966,721554547993,19008994,2010-10-12,2010-11-07,1,0,mebeverine,A03AA04,27,A03A
2,2127268,910533073010,755695,2006-10-23,2007-10-04,6,23,fluoxetine,N06AB03,347,N06A
3,2441156,901943201395,1549080,2010-01-18,2010-04-17,1,0,"estrogens, conjugated (USP)",G03CA57,90,G03C
4,5489554,1649267519173,19011773,2011-03-28,2011-03-28,1,0,ascorbic acid,A11GA01,1,A11G
