In [1]:
# Import all packages here
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns 
import random
from datetime import datetime
from tslearn.clustering import TimeSeriesKMeans
from tslearn.clustering import silhouette_score
import time
import os
import glob

In [2]:
cwd_path = os.getcwd()
csv_files = glob.glob(os.path.join(cwd_path,'../fbc', "*.csv"))

In [5]:
def change_to_list(row):
    amplitudes_string = row['amplitudes'][1:-1]
    amplitude_list = amplitudes_string.split(',')

    map_object = map(int, amplitude_list)
    amplitude_list_int = list(map_object)

    return amplitude_list_int

def get_clustering_results(distance_metric = 'euclidean', max_iter = 10, variation_param = 'stddev'):

    ctr = 1
    for f in csv_files:

        # read the csv file
        df = pd.read_csv(f)

        df['collection_time'] = pd.to_datetime(df['collection_time'], format= '%Y-%m-%d %H:%M:%S')
        df['amplitudes'] = df.apply(lambda x: change_to_list(x), axis=1)

        final_data = pd.DataFrame(df['amplitudes'])
        amps = pd.DataFrame(df['amplitudes'].to_list())

        MAX_ITER = max_iter
        DISTANCE_METRIC = distance_metric
        VARIATION_PARAM = variation_param

        # We select best K through best Silhouette score after clustering with different values of K (2 <= K <= 10) 

        silhouette_score_list = []
        dtw_kmeans_final_model = TimeSeriesKMeans()
        max_silhouette_score = 0
        y_pred_final = []

        amps_final = amps.copy()

        for k in np.arange(2,11,1):    
            dtw_kmeans_model_temp = TimeSeriesKMeans(n_clusters=k, metric=DISTANCE_METRIC, 
                                                     max_iter=MAX_ITER, random_state = 101)

            # Special Handling for dtw
            if (DISTANCE_METRIC == 'dtw'):
                amps_new = amps.copy()
                variation_list = []

                length = int(amps_new.shape[1])
                for i in np.arange(0, length, 1):
                    df_temp = amps_new.iloc[: , i:i+1]
                    value_list = df_temp.values.tolist()
                    if (VARIATION_PARAM == 'range'):
                        variation_ = max(value_list)[0] - min(value_list)[0]
                    elif (VARIATION_PARAM == 'stddev'):
                        variation_ = np.std(value_list)
                    variation_list.append(variation_)

                amps_final_new = amps_new.copy().T
                amps_final_new['range'] = range_list
                amps_final_new = amps_final_new.sort_values(by=['range'], ascending=False).drop(['range'],axis=1).reset_index(drop=True).T
                amps_final = amps_final_new.iloc[: , :2000]

            amps_temp = amps_final.copy()
            y_pred_temp = dtw_kmeans_model_temp.fit_predict(amps_temp)
            amps_temp['combined'] = amps_temp.values.tolist()
            amps_temp['y_pred'] = y_pred_temp
            score_temp = silhouette_score(X = amps_temp.iloc[: , :8704], labels = y_pred_temp, metric = DISTANCE_METRIC)
            silhouette_score_list.append(score_temp)
            max_silhouette_score = max(silhouette_score_list)
            if (max_silhouette_score == score_temp):
                dtw_kmeans_final_model = dtw_kmeans_model_temp
                y_pred_final = y_pred_temp

        NUM_CLUSTERS = silhouette_score_list.index(max_silhouette_score)+2

        final_data['y_pred'] = y_pred_final
        final_data['cluster_center'] = final_data['y_pred'].apply(lambda x:
                                        np.concatenate(dtw_kmeans_final_model.cluster_centers_[x]).ravel().tolist())
        
        final_data['device_id'] = df['device_id']
        reordered_cols = ['device_id', 'amplitudes', 'y_pred', 'cluster_center']
        final_data = final_data[reordered_cols]

        result_path = cwd_path + '/Clustering Results/'
        result_path += f.split('/')[-1].split('.')[0] + '_result.csv'

        print("Writing file #{} : {}_result.csv ...".format(ctr, f.split('/')[-1].split('.')[0]))
        
        ctr += 1

        final_data.to_csv(result_path, index=False)

In [6]:
###########################
### SET ALL PARAMS HERE ###
###########################

# MAX_ITER_ is the maximum number of iterations for clustering before declaring convergence
# DISTANCE_METRIC_ = euclidean/dtw
# NUM_CLUSTERS_ is set automatically based on best silhouette score
# VARIATION_PARAM_ = range/stddev

MAX_ITER_ = 10
DISTANCE_METRIC_ = 'euclidean'
VARIATION_PARAM_ = 'stddev'

get_clustering_results(DISTANCE_METRIC_, MAX_ITER_, VARIATION_PARAM_)

Writing file #1 : 3VAE2_result.csv ...
Writing file #2 : H070C_result.csv ...
Writing file #3 : 3VAE1_result.csv ...
Writing file #4 : H070A_result.csv ...
Writing file #5 : W056E_result.csv ...
Writing file #6 : H071A_result.csv ...
Writing file #7 : 3VAH2_result.csv ...
Writing file #8 : H030C_result.csv ...
Writing file #9 : MGABT_result.csv ...
Writing file #10 : H071B_result.csv ...
Writing file #11 : 3VAH1_result.csv ...
Writing file #12 : 3VAC1_result.csv ...
Writing file #13 : MM103_result.csv ...
Writing file #14 : 3WBM1_result.csv ...
Writing file #15 : 3VAC2_result.csv ...
Writing file #16 : MM104_result.csv ...
Writing file #17 : MM106_result.csv ...
Writing file #18 : ATLA1_result.csv ...
Writing file #19 : MM122_result.csv ...
Writing file #20 : H0182_result.csv ...
Writing file #21 : W091A_result.csv ...
Writing file #22 : MC050_result.csv ...
Writing file #23 : MM109_result.csv ...
Writing file #24 : MM119_result.csv ...
Writing file #25 : W012B_result.csv ...
Writing f