In [2]:
import pandas as pd
import os
import json

## 1 - Load the training data

In [3]:
# Directory that holds the cleaned data files, relative to this notebook
file_path = os.path.join('..', 'data', 'cleaned')

# Read the training set; low_memory=False lets pandas infer each column's dtype
# from the whole file instead of chunk by chunk
train_csv_path = os.path.join(file_path, 'train.csv')
train_df = pd.read_csv(train_csv_path, low_memory=False)
train_df.head()

Unnamed: 0,file,VMONTH,VYEAR,VDAYR,YEAR,AGE,SEX,ETHNIC,RACE,USETOBAC,...,PHYSASST,NPNMW,RNLPN,OTHPROV,MHP,NODISP,REFOTHMD,RETAPPT,OTHDISP,ERADMHOS
0,opd2006.csv,December,2006.0,Friday,2006.0,55.0,Male,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No
1,opd2006.csv,November,2006.0,Thursday,2006.0,66.0,Male,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No
2,opd2006.csv,November,2006.0,Wednesday,2006.0,71.0,Female,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,Yes,No,No,No
3,opd2006.csv,November,2006.0,Tuesday,2006.0,1.0,Female,Not Hispanic or Latino,White Only,Not current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No
4,opd2006.csv,November,2006.0,Monday,2006.0,21.0,Female,Not Hispanic or Latino,White Only,Current,...,No,No,No,No,,One or more dispositions marked,No,No,No,No


## 2 - Load the variables dictionary and define features for clustering

In [4]:
# Load the variables dictionary: it maps each variable category name to the
# list of column names that belong to that category
with open(os.path.join(file_path, 'variables.json'), 'r') as f:
    variables = json.load(f)

# Print every category followed by its columns.
# The loop variable is called `columns` (not `list`) so the builtin `list`
# is not shadowed for the rest of the kernel session.
print('Variable Categories:\n')
for category, columns in variables.items():
    print(f'{category}')
    print(f'{columns}')

Variable Categories:

dateOfVisit
['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']
demographics
['AGE', 'SEX', 'ETHNIC', 'RACE', 'USETOBAC']
payment
['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'PAYTYPER']
visitReason
['INJDET', 'INJURY', 'MAJOR', 'RFV1', 'RFV2', 'RFV3']
patientClinicHistory
['SENBEFOR', 'PASTVIS']
vitalSigns
['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
imputedFields
['BDATEFL', 'SEXFL', 'SENBEFL', 'PASTFL']
physicianDiagnoses
['DIAG1', 'DIAG2', 'DIAG3']
differentialDiagnoses
['PRDIAG1', 'PRDIAG2', 'PRDIAG3']
presentSymptomsStatus
['ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'NOCHRON', 'TOTCHRON', 'DMP']
services
['BREAST', 'PELVIC', 'RECTAL', 'SKIN', 'DEPRESS', 'BONEDENS', 'MAMMO', 'MRI', 'ULTRASND', 'XRAY', 'OTHIMAGE', 'CBC', 'ELECTROL', 'GLUCOSE', 'HGBA', 'CHOLEST', 'PSA', 'OTHERBLD', 'BIOPSY', 'CHLAMYD', 'PAPCONV', 'PAPLIQ', 'P


### 2.1 Defining features for clustering

##### !!! The statistical test results for the features should be consulted first

In [5]:
# Count how often each combination of visit-reason values occurs.
# DataFrame.value_counts() leaves out rows with a missing value in any of these columns by default.
visit_reason_columns = variables['visitReason']
train_df.loc[:, visit_reason_columns].value_counts()

INJDET                          INJURY  MAJOR                                 RFV1                               RFV2                                     RFV3                                    
None of the above               No      Chronic problem, routine              Progress visit, NOS                Psychotherapy                            Medication, other and unspecified kinds     43
                                                                              Diabetes mellitus                  Hypertension                             Other endocrine, nutritional, and met...    29
Unintentional injury/poisoning  Yes     Chronic problem, routine              Alcoholism                         Progress visit, NOS                      Psychotherapy                               25
None of the above               No      Chronic problem, routine              Progress visit, NOS                Psychotherapy                            Group counseling                            23
 

In [6]:
# Defining the independent variables as features for clustering:
# age and sex, the visit reason fields, the past-visit count, the vital signs,
# and every present-symptom field except the two listed below
excluded_symptom_fields = ['NOCHRON', 'TOTCHRON']
symptom_features = [
    column
    for column in variables['presentSymptomsStatus']
    if column not in excluded_symptom_fields
]

features = (
    ['AGE', 'SEX']
    + variables['visitReason']
    + ['PASTVIS']
    + variables['vitalSigns']
    + symptom_features
)

print(f'Features: {features}')
print(f'Number of Features: {len(features)}')

Features: ['AGE', 'SEX', 'INJDET', 'INJURY', 'MAJOR', 'RFV1', 'RFV2', 'RFV3', 'PASTVIS', 'HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS', 'ARTHRTIS', 'ASTHMA', 'CANCER', 'CASTAGE', 'CEBVD', 'CHF', 'CRF', 'COPD', 'DEPRN', 'DIABETES', 'HYPLIPID', 'HTN', 'IHD', 'OBESITY', 'OSTPRSIS', 'DMP']
Number of Features: 31


In [7]:
# Work on an explicit, independent copy of the selected feature columns:
# the preprocessing cells below overwrite columns of clustering_df, and
# train_df must keep the raw values.
clustering_df = train_df.loc[:, features].copy()

## 3 - Preprocess and engineer the features

### 3.1 - Bin/Normalize quantitative features
TODO: Try binning the quantitative features and compare the clustering results with the normalized version.

#### 3.1.1 - Binning

#### 3.1.2 - Normalization

In [8]:
from sklearn.preprocessing import StandardScaler

# Get a list of quantitative features.
# Columns are selected by "is numeric" rather than "is not object": text columns
# are not guaranteed to use the object dtype (e.g. pandas' string dtype or
# category), and such columns must never be handed to the scaler.
quantitative_features = [
    feature for feature in features
    if pd.api.types.is_numeric_dtype(clustering_df[feature])
]

# Normalize quantitative features.
# The fitted scaler is kept in `scaler` because the cluster centers are
# inverse-transformed with it later in the notebook.
# Missing values stay missing here (see the NaN cells in the output below).
scaler = StandardScaler()
clustering_df[quantitative_features] = scaler.fit_transform(clustering_df[quantitative_features])

# Check the result
clustering_df[quantitative_features].head()

Unnamed: 0,AGE,PASTVIS,HTIN,WTLB,BMI,TEMPF,BPSYS,BPDIAS
0,0.710117,-0.016688,0.530558,1.762024,1.904835,-0.230443,1.800628,1.390837
1,1.177478,-0.389377,0.609827,0.586196,0.243487,-0.53371,-0.678966,-0.880536
2,1.389914,0.281463,-0.103596,0.329405,0.924274,-1.34242,1.304709,-0.231572
3,-1.584197,-0.389377,-1.926787,-1.670855,,0.881535,,
4,-0.734451,0.281463,0.372019,-0.035507,-0.310296,-0.230443,-0.678966,-1.042777


### 3.2 - Dimensionality reduction for quantitative features

### 3.3 - Encode categorical features

In [9]:
# Every selected feature that is not quantitative is treated as categorical
categorical_features = [name for name in features if name not in quantitative_features]

# One-hot encode categorical features; drop_first=True drops the first level
# of every feature.
# NOTE(review): a missing value in a categorical column presumably ends up as
# all-False dummies, i.e. indistinguishable from the dropped level — confirm
# this is intended.
clustering_df = pd.get_dummies(
    clustering_df,
    columns=categorical_features,
    drop_first=True,
)

# The encoded features are the columns created by get_dummies, i.e. every
# column that is neither a quantitative nor an original categorical feature
original_columns = set(quantitative_features) | set(categorical_features)
encoded_features = [name for name in clustering_df.columns if name not in original_columns]

# Check the result
clustering_df[encoded_features].head()

Unnamed: 0,SEX_Male,INJDET_Adverse effect of med care/surg care/medicinal drug,INJDET_Injury/poisoning - unknown intent,INJDET_Intentional injury/poisoning,INJDET_None of the above,INJDET_Unintentional injury/poisoning,INJURY_Yes,"MAJOR_Chronic problem, flare-up","MAJOR_Chronic problem, routine",MAJOR_New problem (less than 3 mos. onset),...,DEPRN_Yes,DIABETES_Yes,HYPLIPID_Yes,HTN_Yes,IHD_Yes,OBESITY_Yes,OSTPRSIS_Yes,DMP_Not applicable,DMP_Not enrolled,DMP_Ordered/advised to enroll at this visit
0,True,False,False,False,False,True,True,False,False,False,...,False,True,True,False,False,True,False,False,False,False
1,True,False,False,False,True,False,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,False,True,True,True,False,True,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False


### 3.4 - Dimensionality reduction for categorical features

### 3.5 - Redefine the clustering DataFrame for training

In [10]:
# Keep only the model inputs: the quantitative columns first, followed by the
# one-hot encoded columns
model_columns = quantitative_features + encoded_features
clustering_df = clustering_df.loc[:, model_columns]

## 4 - Baseline clustering model

### 4.1 - Train models

#### 4.1.1 - KMeans

In [11]:
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [12]:
# Seed passed as random_state to the KMeans model below so that repeated runs
# start from the same random state
random_seed = 42

In [13]:
# Impute the missing quantitative values using KNN.
# Only the (already scaled) quantitative columns are passed to the imputer;
# the result is wrapped back into a DataFrame in the next cell.
n_neighbors = 5
quantitative_values = clustering_df[quantitative_features]

imputer = KNNImputer(n_neighbors=n_neighbors)
clustering_df_imputed = imputer.fit_transform(quantitative_values)

In [14]:
# Concatenate the imputed quantitative features with the encoded categorical features.
# The imputer output carries no index, so it is given clustering_df's index
# explicitly. pd.concat(axis=1) aligns rows by index label; with a fresh
# 0..n-1 index the two parts would be misaligned and padded with NaN whenever
# clustering_df does not use the default RangeIndex (e.g. after filtering rows).
clustering_df_imputed = pd.concat([
    pd.DataFrame(
        clustering_df_imputed,
        columns=quantitative_features,
        index=clustering_df.index,
    ),
    clustering_df[encoded_features]
], axis=1)

In [15]:
# Clustering using KMeans: fit on the imputed feature matrix and store the
# label assigned to each row in a new 'cluster' column of train_df
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=random_seed)
cluster_labels = kmeans.fit_predict(clustering_df_imputed)
train_df['cluster'] = cluster_labels

# Check the result: number of rows per cluster, largest cluster first
train_df['cluster'].value_counts()

cluster
5    22533
7    16102
6    12755
1    12475
0     9290
8     9068
9     7240
4     6520
2     4980
3     2523
Name: count, dtype: int64

### 4.2 - Evaluation of the model

In [27]:
# Plotting libraries for the evaluation section
import altair as alt
# NOTE(review): `vf` is not referenced in the cells visible here — confirm it is still needed
import vegafusion as vf
# Switch Altair to the "vegafusion" data transformer.
# NOTE(review): presumably needed because the training set is larger than
# Altair's default row limit — confirm
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

#### 4.2.1 - Metrics

In [19]:
# Calculate the Silhouette Score of the KMeans assignment over the full
# imputed feature matrix
silhouette = silhouette_score(
    X=clustering_df_imputed,
    labels=train_df['cluster'],
)
print(f'Silhouette Score: {silhouette}')

Silhouette Score: 0.060936523975194855


In [20]:
# Sum of squared distances of samples to their closest cluster center,
# as reported by the fitted KMeans model
sum_of_squared_distances = kmeans.inertia_
print(f'Sum of Squared Distances: {sum_of_squared_distances}')

Sum of Squared Distances: 617995.8133063597


#### 4.2.2 - Examine cluster centroids

In [25]:
# Put the fitted cluster centers into a DataFrame (one row per cluster),
# labelled with the columns the model was trained on
centroid_columns = clustering_df_imputed.columns
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=centroid_columns)

# Inverse transform the quantitative columns with the scaler fitted during
# normalization, so these values are no longer on the standardized scale
scaled_centroids = cluster_centers[quantitative_features]
cluster_centers[quantitative_features] = scaler.inverse_transform(scaled_centroids)

# Check the result
cluster_centers

Unnamed: 0,AGE,PASTVIS,HTIN,WTLB,BMI,TEMPF,BPSYS,BPDIAS,SEX_Male,INJDET_Adverse effect of med care/surg care/medicinal drug,...,DEPRN_Yes,DIABETES_Yes,HYPLIPID_Yes,HTN_Yes,IHD_Yes,OBESITY_Yes,OSTPRSIS_Yes,DMP_Not applicable,DMP_Not enrolled,DMP_Ordered/advised to enroll at this visit
0,61.932285,4.606384,65.75795,170.538163,27.714171,97.674985,114.523307,64.730757,0.389601,0.006566907,...,0.156529,0.239746,0.198514,0.383895,0.075035,0.063408,0.043923,0.139843,0.186349,0.015072
1,27.161657,3.723956,64.750244,167.186359,27.660375,98.067717,111.619187,66.025775,0.054821,0.0007213272,...,0.048489,0.040474,0.004969,0.019396,0.000882,0.049291,0.001603,0.774786,0.050974,0.007534
2,3.173092,3.110803,35.30245,32.738916,17.820432,99.294253,103.078153,61.177028,0.533333,0.002610442,...,0.004217,0.003614,0.000402,0.002209,0.002209,0.009036,0.000402,0.848795,0.035944,0.00261
3,46.540626,37.039715,65.728339,177.417915,28.460954,97.875997,126.118827,74.620055,0.464923,-2.2117720000000003e-17,...,0.308759,0.122077,0.052715,0.191835,0.028537,0.070155,0.014665,0.324217,0.085216,0.00436
4,52.476074,3.065322,66.508497,194.332883,30.772685,97.936666,156.592178,92.716779,0.447546,0.00506135,...,0.109202,0.193865,0.158896,0.56181,0.039417,0.105828,0.017485,0.200153,0.184816,0.019479
5,40.084191,2.849365,66.451758,186.220016,29.253656,98.041448,125.358051,78.00814,0.436845,0.003905557,...,0.140644,0.051571,0.02796,0.075493,0.004039,0.036082,0.003506,0.586055,0.059427,0.011939
6,17.968169,3.091595,62.023599,119.74109,21.493248,98.027265,110.873697,66.059004,0.430498,0.003136025,...,0.092591,0.025951,0.005958,0.015837,0.001333,0.023834,0.000941,0.678871,0.066719,0.007605
7,67.058129,3.838616,65.598895,174.991628,28.534663,97.7182,138.186362,75.363657,0.388026,0.003788349,...,0.135387,0.27214,0.229909,0.522171,0.065955,0.056515,0.050677,0.119675,0.16321,0.010247
8,3.050066,2.934715,35.598213,32.252933,17.013549,97.612667,99.243119,59.667005,0.541575,0.002315836,...,0.001875,0.003639,0.001213,0.003419,0.001985,0.010476,0.000331,0.835576,0.042347,0.002316
9,45.961464,3.513356,66.155497,271.742514,43.286694,97.9445,129.789503,76.972845,0.337845,0.00441989,...,0.151381,0.340193,0.195994,0.469475,0.039917,0.484669,0.01174,0.125552,0.182182,0.026105


#### 4.2.3 - Visualization

## 5 - Extract text features from each cluster

In [None]:
# Import NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


### 5.1 - Aggregate text data by cluster

In [53]:
# Preview the first six variable categories together with their columns
preview_count = 6
for position, (category, columns) in enumerate(variables.items()):
    if position >= preview_count:
        break  # nothing after this point would be printed
    print(category, columns)

dateOfVisit ['VMONTH', 'VYEAR', 'VDAYR', 'YEAR']
demographics ['AGE', 'SEX', 'ETHNIC', 'RACE', 'USETOBAC']
payment ['PAYPRIV', 'PAYMCARE', 'PAYMCAID', 'PAYWKCMP', 'PAYSELF', 'PAYNOCHG', 'PAYOTH', 'PAYDK', 'PAYTYPER']
visitReason ['INJDET', 'INJURY', 'MAJOR', 'RFV1', 'RFV2', 'RFV3']
patientClinicHistory ['SENBEFOR', 'PASTVIS']
vitalSigns ['HTIN', 'WTLB', 'BMI', 'TEMPF', 'BPSYS', 'BPDIAS']
