This notebook performs KMeans clustering of time series data using Dynamic Time Warping (DTW). Patient vitals, labs and dx data from the ICU are combined into one dataset. Each .csv file covers one hour, and each patient has 4 recorded time points (15-minute intervals) within that hour.

In [None]:
# import packages

import os
import pandas as pd
from glob import glob 
import numpy as np
import time
import math
import impyute as impy

from tslearn.utils import to_time_series
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn import metrics

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt


In [None]:
# read and append vitals data

# glob below is relative, so switch into the folder holding the vitals files
os.chdir("/path")
strain = glob("*.csv")

# Read every file first and concatenate once: DataFrame.append re-copied the
# accumulated frame on each iteration and was removed in pandas 2.0.
frames = [pd.read_csv(filename) for filename in strain]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# .csv files were found (this matches what the append loop produced).
vitals = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Optional checkpoint: uncomment to cache the combined vitals on disk, or to
# reload them without re-reading every source file.
#vitals.to_csv('/path/vitals.csv') # write to .csv
#vitals = pd.read_csv('/path/vitals.csv') # upload
# Show (rows, columns) of the combined vitals frame.
vitals.shape

In [None]:
## read and append labs data

# glob below is relative, so switch into the folder holding the labs files
os.chdir("/path")
strain = glob("*.csv")

# Read every file first and concatenate once: DataFrame.append re-copied the
# accumulated frame on each iteration and was removed in pandas 2.0.
frames = [pd.read_csv(filename) for filename in strain]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# .csv files were found (this matches what the append loop produced).
labs = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Optional checkpoint: uncomment to cache the combined labs on disk, or to
# reload them without re-reading every source file.
#labs.to_csv('/path/labs.csv')
#labs = pd.read_csv('/path/labs.csv')
# Show (rows, columns) of the combined labs frame.
labs.shape

In [None]:
# read and append dx data

# glob below is relative, so switch into the folder holding the dx files
os.chdir("/path")
strain = glob("*.csv")

# Read every file first and concatenate once: DataFrame.append re-copied the
# accumulated frame on each iteration and was removed in pandas 2.0.
frames = [pd.read_csv(filename) for filename in strain]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# .csv files were found (this matches what the append loop produced).
dx = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Optional checkpoint: uncomment to cache the combined dx data on disk, or to
# reload it without re-reading every source file.
#dx.to_csv('/path/dx.csv')
#dx = pd.read_csv('/path/dx.csv')
# Show (rows, columns) of the combined dx frame.
dx.shape

In [None]:
# merge data for KMeans clustering and prediction

# Left joins: every vitals row is kept; labs and dx columns are attached
# wherever the key columns match.
vl = vitals.merge(
    labs,
    how='left',
    on=['feature1', 'feature2', 'feature3'],
)
X = vl.merge(
    dx,
    how='left',
    on=['feature1', 'feature2'],
)

In [None]:
# Optional checkpoint: uncomment to cache the merged frame on disk, or to
# reload it without repeating the merges.
#X.to_csv('/path/X.csv')
#X = pd.read_csv('/path/X.csv')
# Show (rows, columns) of the merged frame.
X.shape

In [None]:
# pre-format dataframe

# unsupervised
X.drop_duplicates(inplace = True)
X.drop(columns = ['a', 'b', 'c'], inplace = True)

# astype returns a new frame instead of modifying X, so the result must be
# assigned back; previously the casts were computed and thrown away.
X = X.astype({'d': 'int', 'e': 'int', 'f': 'int'})

# Drop columns that have fewer than `n` non-missing values.
# NOTE(review): `n` is not defined in the visible code; set it before running.
X.dropna(axis = 1, thresh = n, inplace = True)

# Order rows by the key columns; the later 3D reshape slices X in fixed-size
# consecutive chunks, so row order matters.
X = X.sort_values(by = ['feature1', 'feature2'])

# make X and y dataset split
#y = X.pop('label')

In [None]:
# prepare for imputation

Xv = X.values # make an array
# presumably fills each missing value with its column median; confirm
# against the impyute documentation
Xv = impy.median(Xv)

# convert back to dataframe
X = pd.DataFrame(Xv, columns = X.columns.values.tolist())

# Forward-fill anything still missing. The result has to be assigned back,
# since the fill returns a new frame; ffill() also replaces the deprecated
# fillna(method='ffill') spelling.
X = X.ffill()

# keep the filled frame as the cell output, as before
X

In [None]:
# make 3D [pt,ts,d] numpy array for KMeans

# NOTE(review): `patient`, `time_series` and `features` are not defined in
# the visible code; set them (number of series, rows per series, columns
# per row) before running this cell.
km_arr = np.zeros((patient, time_series, features))

# Fail early with a clear message rather than part-way through the loop.
if len(X) < patient * time_series:
    raise ValueError(
        "X has %d rows but %d are needed for %d series of length %d"
        % (len(X), patient * time_series, patient, time_series)
    )

# Each series is the next `time_series` consecutive rows of X. Iterate over
# `patient` (the size the array was allocated with) instead of a hard-coded
# 1000, which either overran the array or left zero-filled series behind.
for i in range(patient):
    j = i * time_series
    k = j + time_series
    km_arr[i] = X.iloc[j:k].values

In [None]:
# DTW k-means; the cluster count (30) is presumably about sqrt(n series)

# wall-clock timing of the fit
start = time.time()

# fixed random_state keeps the clustering reproducible between runs
km = TimeSeriesKMeans(
    n_clusters=30,
    metric='dtw',
    verbose=True,
    random_state=42,
)

# fit the model and get one cluster label per series
y_pred = km.fit_predict(km_arr)

end = time.time()
print(end - start)

In [None]:
# Unique counts per cluster

# `unique` holds the distinct cluster labels found in y_pred and `counts`
# the number of series assigned to each one, in the same order.
unique, counts = np.unique(y_pred, return_counts=True)
print(unique, counts)

In [None]:
# labels

# Display the cluster label returned by fit_predict for each series in km_arr.
y_pred

In [None]:
# visualize the clusters

sz = km_arr.shape[1]

# Take the number of panels from the fitted model so the plot follows
# n_clusters instead of a hard-coded 30 / 10x3 grid.
n_clusters = km.cluster_centers_.shape[0]
n_cols = 3
n_rows = math.ceil(n_clusters / n_cols)

plt.figure()
for yi in range(n_clusters):
    plt.subplot(n_rows, n_cols, yi + 1)
    # member series in faint black, the cluster centre in red on top
    # NOTE(review): ravel() flattens every feature into one curve, so with
    # more than one feature the curve is longer than `sz` and xlim clips it.
    for xx in km_arr[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    # NOTE(review): a fixed [-4, 4] range suits standardised values; no
    # scaling step is visible in this file, so confirm it fits the data.
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    # title goes on the middle panel of the first row
    if yi == 1:
        # the model is fitted with metric='dtw', not Euclidean distance
        plt.title("DTW $k$-means")

In [None]:
# silhouette_score

# Score the fitted labels against the series in km_arr using tslearn's
# time-series silhouette_score. No metric is passed, so the library default
# applies; presumably DTW, confirm against the tslearn documentation.
silhouette_score(km_arr, km.labels_)