In [1]:
import pandas as pd

### atc code handling

In [26]:
drug_era_atc = pd.read_csv("../../mappings/drug_era/drug_era_atc.tsv", sep="\t")

In [18]:
# sort by drug_concept_id
drug_era_atc = drug_era_atc.sort_values(by="drug_concept_id")
# drug_era_atc.iloc[1060:1070]
# it seems that similar drug_concept_ids do not necessarily correspond to drug similarity

In [27]:
# Build the simplified mapping in one expression:
#   1. rename the raw "_c13" column to "atc_code",
#   2. keep only drug_concept_id, concept_name and atc_code,
#   3. append a manually curated row for levothyroxine
#      (drug_concept_id 1501700 -> ATC H03AA01).
drug_era_atc = pd.concat(
    [
        drug_era_atc.rename(columns={"_c13": "atc_code"})[
            ["drug_concept_id", "concept_name", "atc_code"]
        ],
        pd.DataFrame(
            [
                {
                    "drug_concept_id": 1501700,
                    "concept_name": "levothyroxine",
                    "atc_code": "H03AA01",
                }
            ]
        ),
    ],
    ignore_index=True,
)

In [2]:
drug_era_atc = pd.read_csv("../../dataset/atc_codes_simplified.tsv", sep="\t")

In [3]:
drug_era_atc.head(10)

Unnamed: 0,drug_concept_id,concept_name,atc_code
0,1319998,acebutolol,C07AB04
1,19024063,acenocoumarol,B01AA07
2,929435,acetazolamide,S01EC01
3,929549,acetic acid,G01AD02
4,929549,acetic acid,S02AA10
5,1701928,activated charcoal,A07BA01
6,1541079,corticotropin,H01AA01
7,1154343,albuterol,R03AC02
8,1154343,albuterol,R03CC02
9,955372,ethanol,D08AX08


In [4]:
len(drug_era_atc)

1618

In [5]:
# the number of unique drug_concept_ids
len(drug_era_atc["drug_concept_id"].unique())

1142

In [6]:
# Collapse the mapping to one row per drug: every row of a drug receives the
# comma-joined, sorted, de-duplicated list of that drug's ATC codes, and the
# rows that have become identical are then dropped.
drug_era_atc = drug_era_atc.assign(
    atc_code=drug_era_atc.groupby("drug_concept_id")["atc_code"].transform(
        lambda codes: ",".join(sorted(set(codes)))
    )
).drop_duplicates()

In [7]:
len(drug_era_atc)

1142

In [8]:
drug_era_atc.to_csv("../../dataset/atc_codes_combined.tsv", sep="\t", index=False)

### map atc codes to the dataset

In [2]:
drug_era_atc = pd.read_csv("../../dataset/atc_codes_combined.tsv", sep="\t")

In [9]:
df = pd.read_csv("../../dataset/dataset.tsv", sep="\t")

In [5]:
df.head()

Unnamed: 0,eid,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days
0,6021257,1236950609195,19005129,12/05/2014,10/06/2014,1,0
1,3832658,798863919105,743670,30/03/2011,02/09/2011,5,12
2,3430966,721554547993,19008994,12/10/2010,07/11/2010,1,0
3,2127268,910533073010,755695,23/10/2006,04/10/2007,6,23
4,5185705,317827646206,19010400,30/07/1999,28/08/1999,1,0


In [10]:
len(df)

19959413

In [11]:
# Left-join the ATC mapping onto the drug eras via drug_concept_id, then
# discard every era whose drug received no atc_code from the mapping.
df = df.merge(drug_era_atc, how="left", on="drug_concept_id").dropna(
    subset=["atc_code"]
)

In [9]:
len(df)

17609376

In [13]:
df.to_csv("../../dataset/dataset_with_atc_codes.tsv", sep="\t", index=False)

### handling the drug era duration

In [27]:
df.head()

Unnamed: 0,eid,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code,duration,atc_level3
0,6021257,1236950609195,19005129,2014-05-12,2014-06-10,1,0,clobetasone,"D07AB01,S01BA09",30,"D07A,S01B"
1,3832658,798863919105,743670,2011-03-30,2011-09-02,5,12,venlafaxine,N06AX16,157,N06A
2,3430966,721554547993,19008994,2010-10-12,2010-11-07,1,0,mebeverine,A03AA04,27,A03A
3,2127268,910533073010,755695,2006-10-23,2007-10-04,6,23,fluoxetine,N06AB03,347,N06A
5,2441156,901943201395,1549080,2010-01-18,2010-04-17,1,0,"estrogens, conjugated (USP)",G03CA57,90,G03C


In [28]:
# Parse both era boundaries as day-first dates (dd/mm/yyyy), then derive the
# era length in days, counting both the start and the end day (hence the +1).
df = df.assign(
    drug_era_start_date=pd.to_datetime(df["drug_era_start_date"], format="%d/%m/%Y"),
    drug_era_end_date=pd.to_datetime(df["drug_era_end_date"], format="%d/%m/%Y"),
)
df["duration"] = (
    df["drug_era_end_date"].sub(df["drug_era_start_date"]).dt.days.add(1)
)
df.head()

Unnamed: 0,eid,drug_era_id,drug_concept_id,drug_era_start_date,drug_era_end_date,drug_exposure_count,gap_days,concept_name,atc_code,duration,atc_level3
0,6021257,1236950609195,19005129,2014-05-12,2014-06-10,1,0,clobetasone,"D07AB01,S01BA09",30,"D07A,S01B"
1,3832658,798863919105,743670,2011-03-30,2011-09-02,5,12,venlafaxine,N06AX16,157,N06A
2,3430966,721554547993,19008994,2010-10-12,2010-11-07,1,0,mebeverine,A03AA04,27,A03A
3,2127268,910533073010,755695,2006-10-23,2007-10-04,6,23,fluoxetine,N06AB03,347,N06A
5,2441156,901943201395,1549080,2010-01-18,2010-04-17,1,0,"estrogens, conjugated (USP)",G03CA57,90,G03C


In [91]:
len(df)

17609376

In [94]:
df = pd.read_csv("../../dataset/dataset_with_atc_duration_level3.tsv", sep="\t")

In [97]:
# Total exposure per person and drug.
# Restrict to the five columns needed downstream, then collapse all eras that
# share the same (eid, drug_concept_id): durations are added up, while
# atc_code / atc_level3 take the first value seen in each group.
df = (
    df[["eid", "drug_concept_id", "duration", "atc_code", "atc_level3"]]
    .groupby(["eid", "drug_concept_id"])
    .agg(
        duration=("duration", "sum"),
        atc_code=("atc_code", "first"),
        atc_level3=("atc_level3", "first"),
    )
    .reset_index()
)

In [104]:
# Persist the per-person, per-drug summed durations as a TSV without the index.
# The bare `df.head()` / `len(df)` expressions that used to precede this call
# were removed: a cell only renders its last expression, so their values were
# computed and silently discarded. Inspect the frame in a separate cell instead.
df.to_csv(
    "../../dataset/dataset_with_atc_duration_level3_summed.tsv", sep="\t", index=False
)

### clustering
Cluster people based on which drugs they take and for how long they take them. Drugs are compared at the 3rd ATC level only (the first 4 characters of the ATC code), i.e. drugs that fall in the same 3rd-level ATC group are treated as the same drug. \
For drugs that map to multiple ATC codes, we use an equal-distribution assumption: the drug's duration is split evenly across all of its distinct 3rd-level ATC groups (e.g. a 30-day duration mapped to 9 groups contributes 3.33 days to each).

Components of the dataset: \
eid: unique identifier for each person \
drug_concept_id: unique identifier for each drug \
drug_era_id: unique identifier for each drug era. A drug era is a continuous period of drug use; a person can take the same drug in multiple drug eras. \
drug_era_start_date: start date of the drug era \
drug_era_end_date: end date of the drug era \
duration: length of the drug era in days, calculated as drug_era_end_date - drug_era_start_date + 1 day


In [24]:
# Derive the 3rd-level ATC groups of every drug: take the first 4 characters
# of each comma-separated ATC code, de-duplicate them, and store them as a
# sorted, comma-joined string.
df["atc_level3"] = df["atc_code"].map(
    lambda codes: ",".join(sorted({code[:4] for code in codes.split(",")}))
)

In [105]:
df.head()

Unnamed: 0,eid,drug_concept_id,duration,atc_code,atc_level3
0,1000014,710062,36,N06AA09,N06A
1,1000014,721724,31,N06AA10,N06A
2,1000014,723013,18,N05BA01,N05B
3,1000014,836715,14,N05CD07,N05C
4,1000014,915981,30,"A01AB08,A07AA01,B05CA09,D06AX04,J01GB05,R02AB0...","A01A,A07A,B05C,D06A,J01G,R02A,S01A,S02A,S03A"


In [106]:
# Equal-distribution rule for drugs that belong to several ATC groups
def distribute_duration(row):
    """Split a row's duration evenly over its comma-separated ATC groups.

    Parameters
    ----------
    row : mapping
        Must provide ``row["atc_level3"]`` (comma-separated string of groups)
        and ``row["duration"]`` (a number).

    Returns
    -------
    list of (str, float)
        One ``(group, share)`` pair per listed group, in the listed order;
        every share equals ``duration / number_of_groups`` and group names
        have surrounding whitespace removed.
    """
    level3_groups = [part.strip() for part in row["atc_level3"].split(",")]
    share = row["duration"] / len(level3_groups)
    return [(group, share) for group in level3_groups]

In [107]:
# Create the expanded dataframe with distributed durations: one row per
# (drug record, 3rd-level ATC group), where each group receives an equal share
# of the record's duration.
#
# This is the vectorized equivalent of looping over `df.iterrows()` and
# applying the equal-split rule row by row. A Python-level loop over millions
# of rows is very slow; split/explode produces the same rows, in the same
# order, with the same columns (eid, atc_level3, duration).
expanded_df = (
    df[["eid", "atc_level3", "duration"]]
    # turn "A01A,A07A" into ["A01A", "A07A"]
    .assign(atc_level3=lambda frame: frame["atc_level3"].str.split(","))
    # equal share per group; computed before exploding so the divisor is the
    # number of groups of the original record
    .assign(duration=lambda frame: frame["duration"] / frame["atc_level3"].str.len())
    # one output row per group, with a fresh 0..n-1 index
    .explode("atc_level3", ignore_index=True)
    .assign(atc_level3=lambda frame: frame["atc_level3"].str.strip())
)

In [108]:
expanded_df.head(10)

Unnamed: 0,eid,atc_level3,duration
0,1000014,N06A,36.0
1,1000014,N06A,31.0
2,1000014,N05B,18.0
3,1000014,N05C,14.0
4,1000014,A01A,3.333333
5,1000014,A07A,3.333333
6,1000014,B05C,3.333333
7,1000014,D06A,3.333333
8,1000014,J01G,3.333333
9,1000014,R02A,3.333333


In [115]:
len(expanded_df)

11962244

In [116]:
# Create pivot table: rows are patients (eid), columns are 3rd-level ATC groups,
# values are total durations. Several records of one patient in the same group
# are summed; patient/group combinations with no record are filled with 0.
patient_features = expanded_df.pivot_table(
    index="eid", columns="atc_level3", values="duration", aggfunc="sum", fill_value=0
)

In [117]:
patient_features.head(5)

atc_level3,A01A,A02A,A02B,A03A,A03B,A03F,A04A,A06A,A07A,A07B,...,S02A,S02B,S02D,S03A,S03B,V03A,V04C,V06D,V08C,V09X
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000014,7.060606,0.0,407.0,71.0,10.0,0.0,30.0,0.0,3.333333,0.0,...,18.333333,7.727273,0.0,3.333333,7.727273,0.0,0.0,0.0,0.0,0.0
1000023,12.166667,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,6.0,6.666667,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
1000041,3.333333,0.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000062,73.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1000077,0.0,0.0,304.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
# Summary statistics over every non-zero entry of the patient x ATC matrix:
# the matrix is flattened to one dimension once, zeros are filtered out, and
# the remaining values are described.
summary = (
    pd.Series(patient_features.to_numpy().ravel())
    .loc[lambda durations: durations != 0]
    .describe()
)
summary

count    7.233720e+06
mean     2.308410e+02
std      7.590149e+02
min      9.090909e-02
25%      7.083333e+00
50%      2.066667e+01
75%      7.366667e+01
max      3.370950e+04
dtype: float64

In [119]:
# Normalize the features: each ATC column is standardized with StandardScaler,
# fitted on and applied to the same patient_features matrix.
# The result is a plain NumPy array, so the eid index and the ATC column labels
# have to be taken from patient_features afterwards.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
patient_features_scaled = scaler.fit_transform(patient_features)

In [120]:
patient_features_scaled.shape

(254724, 191)

In [121]:
patient_features_scaled

array([[-0.31084985, -0.20899395, -0.04604871, ..., -0.0386851 ,
        -0.00312108, -0.00198137],
       [-0.28308036, -0.20899395, -0.41066188, ..., -0.0386851 ,
        -0.00312108, -0.00198137],
       [-0.33112076, -0.20899395, -0.36049398, ..., -0.0386851 ,
        -0.00312108, -0.00198137],
       ...,
       [-0.34924922, -0.20899395, -0.41066188, ..., -0.0386851 ,
        -0.00312108, -0.00198137],
       [-0.34924922, -0.20899395, -0.41066188, ..., -0.0386851 ,
        -0.00312108, -0.00198137],
       [-0.34924922, -0.20899395, -0.41066188, ..., -0.0386851 ,
        -0.00312108, -0.00198137]])

In [122]:
# statistics of patient_features_scaled.flatten()
summary = pd.Series(patient_features_scaled.flatten()).describe()
summary

count    4.865228e+07
mean    -1.696460e-18
std      1.000000e+00
min     -4.733108e-01
25%     -1.638423e-01
50%     -7.188020e-02
75%     -1.642836e-02
max      5.047009e+02
dtype: float64

In [123]:
# Perform K-means clustering on the standardized patient x ATC matrix.
from sklearn.cluster import KMeans

# Number of clusters is fixed by hand here.
n_clusters = 20
# random_state is pinned so repeated runs use the same initialization seed;
# all other KMeans settings are left at the library defaults.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# One cluster label per row of patient_features_scaled (same row order).
clusters = kmeans.fit_predict(patient_features_scaled)

In [124]:
# Add cluster labels to a new dataframe
clustering_results = pd.DataFrame({"eid": patient_features.index, "cluster": clusters})

In [126]:
clustering_results.head(10)

Unnamed: 0,eid,cluster
0,1000014,0
1,1000023,0
2,1000041,0
3,1000062,0
4,1000077,0
5,1000086,0
6,1000095,0
7,1000113,0
8,1000181,0
9,1000197,0


In [127]:
# Basic analysis of clusters
print("\nCluster sizes:")
print(clustering_results["cluster"].value_counts())


Cluster sizes:
cluster
0     192420
1      28382
9       8565
12      8146
16      5546
15      5288
19      2578
8       1264
5        647
10       440
3        429
11       337
2        207
4        186
14       146
7        103
18        35
6          3
17         1
13         1
Name: count, dtype: int64


In [66]:
# Put the fitted centroids into a labelled table: one row per cluster, one
# column per ATC group (values are in the standardized feature space).
cluster_centers = pd.DataFrame(
    data=kmeans.cluster_centers_, columns=patient_features.columns
)

# Report, for every cluster, the 5 ATC groups with the largest centroid values.
for i in range(n_clusters):
    top_atc = cluster_centers.loc[i].sort_values(ascending=False).iloc[:5]
    print(f"\nTop 5 characteristic drugs for Cluster {i}:", top_atc, sep="\n")

In [159]:
cleaned_df = pd.read_csv("../../dataset/df_cleaned_1atc_sumduration.tsv", sep="\t")

In [160]:
cleaned_df.head(10)

Unnamed: 0,eid,drug_concept_id,duration,atc_code,atc_level3
0,1000014,710062,36,N06AA09,N06A
1,1000014,723013,18,N05BA01,N05B
2,1000014,836715,14,N05CD07,N05C
3,1000014,915981,30,A01AB08,A01A
4,1000014,920293,18,J01XE01,J01X
5,1000014,920458,55,A07EA04,A07E
6,1000014,923645,407,A02BC01,A02B
7,1000014,929549,30,G01AD02,G01A
8,1000014,1103314,64,N02AX02,N02A
9,1000014,1115008,112,G02CC02,G02C


### cluster the cleaned dataset with atc level 2

In [149]:
# Derive the 2nd-level ATC groups from the 3rd-level ones: keep the first 3
# characters of each comma-separated group, de-duplicate, and store the result
# as a sorted, comma-joined string.
# NOTE(review): the preview rendered after this cell shows multi-code
# atc_level3 values, while the most recent load of cleaned_df shows a single
# code per row - confirm which frame this cell is meant to run on.
cleaned_df["atc_level2"] = cleaned_df["atc_level3"].map(
    lambda level3: ",".join(sorted({code[:3] for code in level3.split(",")}))
)

In [150]:
cleaned_df.head(10)

Unnamed: 0,eid,drug_concept_id,duration,atc_code,atc_level3,atc_level2
0,1000014,710062,36,N06AA09,N06A,N06
1,1000014,723013,18,N05BA01,N05B,N05
2,1000014,836715,14,N05CD07,N05C,N05
3,1000014,915981,30,"A01AB08,A07AA01,B05CA09,D06AX04,J01GB05,R02AB0...","A01A,A07A,B05C,D06A,J01G,R02A,S01A,S02A,S03A","A01,A07,B05,D06,J01,R02,S01,S02,S03"
4,1000014,920293,18,J01XE01,J01X,J01
5,1000014,920458,55,"A07EA04,C05AA05,D07AC01,D07XC01,H02AB01,R01AD0...","A07E,C05A,D07A,D07X,H02A,R01A,R03B,S01B,S01C,S...","A07,C05,D07,H02,R01,R03,S01,S02,S03"
6,1000014,923645,407,A02BC01,A02B,A02
7,1000014,929549,30,"G01AD02,S02AA10","G01A,S02A","G01,S02"
8,1000014,1103314,64,N02AX02,N02A,N02
9,1000014,1115008,112,"G02CC02,M01AE02,M02AA12","G02C,M01A,M02A","G02,M01,M02"


In [151]:
# Create the expanded dataframe with distributed durations: one row per
# (drug record, 2nd-level ATC group), where each group receives an equal share
# of the record's duration.
#
# This is the vectorized equivalent of looping over `cleaned_df.iterrows()`.
# A Python-level loop over millions of rows is very slow; split/explode
# produces the same rows, in the same order, with the same columns
# (eid, atc_level2, duration).
expanded_df = (
    cleaned_df[["eid", "atc_level2", "duration"]]
    # turn "A01,A07" into ["A01", "A07"]
    .assign(atc_level2=lambda frame: frame["atc_level2"].str.split(","))
    # equal share per group; computed before exploding so the divisor is the
    # number of groups of the original record
    .assign(duration=lambda frame: frame["duration"] / frame["atc_level2"].str.len())
    # one output row per group, with a fresh 0..n-1 index
    .explode("atc_level2", ignore_index=True)
    .assign(atc_level2=lambda frame: frame["atc_level2"].str.strip())
)

In [152]:
# Create the patient x ATC pivot table (no normalization happens in this cell):
# rows are patients (eid), columns are 2nd-level ATC groups, values are the
# summed durations; combinations with no record are filled with 0.
patient_features = expanded_df.pivot_table(
    index="eid",
    columns="atc_level2",  # one feature column per 2nd-level ATC group
    values="duration",
    aggfunc="sum",
    fill_value=0,
)

In [153]:
patient_features.head(10)

atc_level2,A01,A02,A03,A04,A06,A07,A08,A09,A10,A11,...,R02,R03,R05,R06,S01,S02,S03,V03,V04,V06
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000014,7.666667,407.0,81.0,30.0,0.0,16.944444,0.0,0.0,0.0,0.0,...,3.333333,23.611111,30.0,0.0,93.277778,27.777778,12.777778,0.0,0.0,0.0
1000023,5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30.0,0.0,0.0,16.0,6.0,6.0,0.0,0.0,0.0
1000041,0.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,28.0,15.0,0.0,0.0,0.0,0.0,0.0
1000062,70.0,0.0,0.0,0.0,0.0,48.5,0.0,0.0,0.0,0.0,...,0.0,722.833333,0.0,38.0,104.0,1.0,1.0,0.0,0.0,0.0
1000077,0.0,304.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
1000086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.333333,0.0,0.0,0.0,0.0,0.0
1000095,6.0,14.0,0.0,30.0,0.0,0.0,0.0,30.0,0.0,0.0,...,6.0,0.0,60.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0
1000181,120.0,3302.0,19.0,0.0,0.0,61.5,0.0,0.0,0.0,0.0,...,0.0,211.5,0.0,4924.0,163.0,10.0,10.0,0.0,0.0,0.0
1000197,1.666667,0.0,0.0,0.0,0.0,37.666667,0.0,0.0,0.0,0.0,...,0.0,250.0,0.0,0.0,3.416667,3.416667,3.416667,0.0,0.0,0.0
1000229,7.083333,56.0,0.0,0.0,0.0,51.083333,0.0,0.0,0.0,0.0,...,7.083333,397.75,0.0,40.0,17.083333,7.083333,7.083333,0.0,0.0,0.0


In [155]:
# Normalize the features: standardize every ATC column (scaler is fitted on and
# applied to the same matrix). The output is a NumPy array without labels.
scaler = StandardScaler()
patient_features_scaled = scaler.fit_transform(patient_features)

# Perform K-means clustering with a hand-picked number of clusters and a fixed
# random_state; all other KMeans settings are left at the library defaults.
n_clusters = 15
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(patient_features_scaled)

# Pair every patient id with its cluster label. This relies on `clusters`
# being in the same row order as patient_features.
clustering_results = pd.DataFrame({"eid": patient_features.index, "cluster": clusters})

# Label the centroids with the ATC column names so they can be inspected per
# cluster; values are in the standardized feature space.
cluster_centers = pd.DataFrame(
    kmeans.cluster_centers_, columns=patient_features.columns
)

In [156]:
print(clustering_results["cluster"].value_counts())

cluster
0     103877
12     33946
5      18924
1       5542
9       5402
10      5028
6       3091
3       2377
2       2318
7       1263
8        809
13       302
4        238
11       111
14        11
Name: count, dtype: int64


In [157]:
# Report, for every cluster, the 5 ATC groups with the largest centroid values
# (centroids are expressed in the standardized feature space).
for i in range(n_clusters):
    top_atc = cluster_centers.loc[i].sort_values(ascending=False).iloc[:5]
    print(f"\nTop 5 characteristic drugs for Cluster {i}:", top_atc, sep="\n")


Top 5 characteristic drugs for Cluster 0:
atc_level2
N03   -0.017609
A04   -0.017894
A09   -0.019229
V06   -0.020769
P03   -0.023124
Name: 0, dtype: float64

Top 5 characteristic drugs for Cluster 1:
atc_level2
A12    3.746451
A11    3.377370
B03    1.669029
V04    1.575198
B05    1.229826
Name: 1, dtype: float64

Top 5 characteristic drugs for Cluster 2:
atc_level2
L02    7.541939
G03    1.400356
H03    0.195965
A12    0.143807
N06    0.134750
Name: 2, dtype: float64

Top 5 characteristic drugs for Cluster 3:
atc_level2
S02    4.783780
H02    4.389960
S03    4.369214
G01    4.107241
D06    3.647340
Name: 3, dtype: float64

Top 5 characteristic drugs for Cluster 4:
atc_level2
B02    19.498997
B03     1.039184
R06     0.449772
N06     0.377306
G03     0.373847
Name: 4, dtype: float64

Top 5 characteristic drugs for Cluster 5:
atc_level2
C09    1.579657
C03    1.455845
C10    1.238326
C08    1.227044
C07    0.983313
Name: 5, dtype: float64

Top 5 characteristic drugs for Cluster 6:
atc_

In [158]:
# Clustering Quality Metrics
# The three scorers are imported in this cell because no other cell of the
# notebook brings them into scope; without the import the cell raises
# NameError when the notebook is run on a fresh kernel.
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)

# All three metrics are computed on the standardized matrix and the K-means
# labels produced from it.
silhouette = silhouette_score(patient_features_scaled, clusters)
calinski = calinski_harabasz_score(patient_features_scaled, clusters)
davies = davies_bouldin_score(patient_features_scaled, clusters)

print("\nClustering Quality Metrics:")
print(f"Silhouette Score: {silhouette:.3f} (higher is better, range: -1 to 1)")
print(f"Calinski-Harabasz Score: {calinski:.3f} (higher is better)")
print(f"Davies-Bouldin Score: {davies:.3f} (lower is better)")


Clustering Quality Metrics:
Silhouette Score: 0.068 (higher is better, range: -1 to 1)
Calinski-Harabasz Score: 3961.691 (higher is better)
Davies-Bouldin Score: 2.173 (lower is better)


### cluster the cleaned, 1 atc code per drug_era dataset

In [162]:
cleaned_df = pd.read_csv("../../dataset/df_cleaned_1atc_sumduration.tsv", sep="\t")
cleaned_df.head()

Unnamed: 0,eid,drug_concept_id,duration,atc_code,atc_level3
0,1000014,710062,36,N06AA09,N06A
1,1000014,723013,18,N05BA01,N05B
2,1000014,836715,14,N05CD07,N05C
3,1000014,915981,30,A01AB08,A01A
4,1000014,920293,18,J01XE01,J01X


In [163]:
len(cleaned_df)

3069425

In [164]:
# Patient x ATC matrix built directly from cleaned_df: rows are patients (eid),
# columns are 3rd-level ATC groups, values are the summed durations;
# combinations with no record are filled with 0.
patient_features = cleaned_df.pivot_table(
    index="eid", columns="atc_level3", values="duration", aggfunc="sum", fill_value=0
)

In [171]:
patient_features.head()

atc_level3,A01A,A02A,A02B,A03A,A03B,A03F,A04A,A06A,A07A,A07D,...,R01A,R01B,R02A,R03A,R03D,R05C,R05D,R06A,S01E,S01K
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000014,65,0,407,71,10,0,30,0,0,0,...,0,0,0,0,0,0,30,0,0,0
1000023,11,0,0,0,0,0,0,0,0,0,...,0,0,0,30,0,0,0,0,0,0
1000041,0,0,56,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,28,0,0
1000062,140,0,0,0,0,0,0,0,0,0,...,0,0,0,341,0,0,0,66,65,0
1000077,0,0,304,0,0,10,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
from sklearn.preprocessing import StandardScaler

# Standardize every ATC column; the scaler is fitted on and applied to the
# same patient_features matrix.
scaler = StandardScaler()
patient_features_scaled = scaler.fit_transform(patient_features)

In [166]:
from sklearn.cluster import KMeans

# K-means with a hand-picked number of clusters and a fixed random_state;
# all other KMeans settings are left at the library defaults.
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# One cluster label per row of patient_features_scaled (same row order).
clusters = kmeans.fit_predict(patient_features_scaled)

In [169]:
clustering_results = pd.DataFrame({"eid": patient_features.index, "cluster": clusters})

In [175]:
clustering_results.to_csv("../../dataset/clustering.tsv", sep="\t", index=False)

In [170]:
# Basic analysis of clusters
print("\nCluster sizes:")
print(clustering_results["cluster"].value_counts())


Cluster sizes:
cluster
9     101020
5      39677
15     20241
10      4866
0       4538
2       3435
8       2559
3       1762
13      1123
1       1095
4        470
7        431
17       398
11       373
18       372
12       293
16       210
6        174
19       107
14        95
Name: count, dtype: int64


In [172]:
# Put the fitted centroids into a labelled table: one row per cluster, one
# column per ATC group (values are in the standardized feature space).
cluster_centers = pd.DataFrame(
    data=kmeans.cluster_centers_, columns=patient_features.columns
)

# Report, for every cluster, the 5 ATC groups with the largest centroid values.
for i in range(n_clusters):
    top_atc = cluster_centers.loc[i].sort_values(ascending=False).iloc[:5]
    print(f"\nTop 5 characteristic drugs for Cluster {i}:", top_atc, sep="\n")


Top 5 characteristic drugs for Cluster 0:
atc_level3
R03A    4.222110
D07A    3.604766
A07E    3.230787
R03D    1.897080
R06A    1.508165
Name: 0, dtype: float64

Top 5 characteristic drugs for Cluster 1:
atc_level3
R02A    7.689938
D09A    6.804525
G01A    4.272243
A07A    2.562104
D06A    2.010271
Name: 1, dtype: float64

Top 5 characteristic drugs for Cluster 2:
atc_level3
A10B    5.923969
C10A    1.930149
C09A    1.411283
A08A    1.024828
A01A    0.969999
Name: 2, dtype: float64

Top 5 characteristic drugs for Cluster 3:
atc_level3
C02C    8.809059
C09A    1.600713
C08C    1.565068
C03A    1.413899
C09C    1.245874
Name: 3, dtype: float64

Top 5 characteristic drugs for Cluster 4:
atc_level3
N05C    16.691912
N06A     2.056238
N02A     1.628114
N05B     1.370832
A02B     0.891037
Name: 4, dtype: float64

Top 5 characteristic drugs for Cluster 5:
atc_level3
J07A    1.377143
G03A    0.427962
G03C    0.421643
P01A    0.321749
G03D    0.193586
Name: 5, dtype: float64

Top 5 characteri

In [173]:
# Clustering Quality Metrics
# The three scorers are imported in this cell because no other cell of the
# notebook brings them into scope; without the import the cell raises
# NameError when the notebook is run on a fresh kernel.
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)

# All three metrics are computed on the standardized matrix and the K-means
# labels produced from it.
silhouette = silhouette_score(patient_features_scaled, clusters)
calinski = calinski_harabasz_score(patient_features_scaled, clusters)
davies = davies_bouldin_score(patient_features_scaled, clusters)

print("\nClustering Quality Metrics:")
print(f"Silhouette Score: {silhouette:.3f} (higher is better, range: -1 to 1)")
print(f"Calinski-Harabasz Score: {calinski:.3f} (higher is better)")
print(f"Davies-Bouldin Score: {davies:.3f} (lower is better)")


Clustering Quality Metrics:
Silhouette Score: 0.066 (higher is better, range: -1 to 1)
Calinski-Harabasz Score: 2196.631 (higher is better)
Davies-Bouldin Score: 2.244 (lower is better)
