In [3]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats 

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# _file locations (relative to the notebook's working directory) and column labels
data_dir = os.getcwd() + '/../../data'
input_file_path = data_dir + '/processed_data.csv'
# _NOTE(review): the variable is named subject101 but the file is subject104 - confirm which subject is intended
subject101_file_path = data_dir + '/output_csv/subject104_processed.csv'
col_names = [
    'timestamp (s)', 'activityID',
    'X1', 'Y1', 'Z1',
    'X2', 'Y2', 'Z2',
]

In [5]:
# _loading preprocessed data to main dataframe
# _header=None makes pandas treat every line of the file as data and names=col_names
# _supplies the column labels
# _NOTE(review): the subject file is read with skiprows=1 instead - confirm that
# _processed_data.csv really has no header line
main_df = pd.read_csv(input_file_path, header=None, names=col_names)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,0.418461,0.925518,0.472759,-0.035942,0.138344,-0.011430
1,37.67,1,0.432534,0.752532,0.514698,-0.556016,0.093168,-0.006729
2,37.68,1,0.432340,0.601171,0.535716,-0.763257,0.039907,0.023608
3,37.69,1,0.418957,0.601528,0.555967,-0.622059,0.069178,0.060307
4,37.70,1,0.434205,0.634318,0.607554,-0.239073,-0.070386,0.034582
...,...,...,...,...,...,...,...,...
288214,974.50,3,-0.356667,0.023598,1.269533,-0.233579,-0.058947,-0.165780
288215,974.51,3,-0.357490,0.101736,1.209994,-0.369190,0.018471,-0.029929
288216,974.52,3,-0.350256,0.068348,1.257699,-0.585796,-0.012709,-0.046238
288217,974.53,3,-0.353363,-0.022715,1.115753,-0.712105,0.084094,-0.053992


In [6]:
# _loading the data of a single subject; skiprows=1 discards the first line of the
# _file and names=col_names supplies the column labels
# _NOTE(review): the path variable is named subject101 but points at
# _subject104_processed.csv - confirm which subject is intended
subject101_df = pd.read_csv(subject101_file_path, names=col_names, skiprows=1)
subject101_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,75.25,1,-8.66987,3.18279,3.17092,0.052647,-0.019979,-0.012254
1,75.26,1,-8.66479,3.33555,3.28594,0.032087,0.004228,-0.000878
2,75.27,1,-8.63663,3.29497,3.01709,0.061452,0.006872,-0.014614
3,75.28,1,-8.52192,3.10507,3.09642,0.046607,-0.006672,-0.011067
4,75.29,1,-8.52085,3.25683,3.09592,0.037953,-0.026937,0.017411
...,...,...,...,...,...,...,...,...
73205,860.31,3,-9.06513,2.23454,2.86030,-0.007782,-0.046181,-0.043970
73206,860.32,3,-9.13990,2.15959,2.89792,-0.052233,0.008751,-0.025230
73207,860.33,3,-8.99623,2.23228,2.66890,-0.034450,0.017696,-0.045599
73208,860.34,3,-8.99650,2.19434,2.66902,-0.016559,0.031833,0.000214


# Generating subsequences for each sequence of the data

In [32]:
def elbow_techique(sub_sequence_data):
    """Plot the K-Means inertia for k = 1..9 so the elbow can be read off by eye.

    Parameters
    ----------
    sub_sequence_data : array-like
        Samples to cluster; handed unchanged to ``KMeans.fit``.

    Returns
    -------
    None
        The function only draws a figure.
    """
    # _only for manual testing
    # _one shared range drives both the fitting loop and the x-axis of the plot,
    # _so the two can never disagree
    k_values = range(1, 10)

    distortions = []
    for k in k_values:
        kmeanModel = KMeans(n_clusters=k)
        kmeanModel.fit(sub_sequence_data)
        distortions.append(kmeanModel.inertia_)

    # _plot the distortions to observe the elbow point from the graph
    plt.figure(figsize=(16,8))
    plt.plot(k_values, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [33]:
def get_cluster_ids(sub_sequence_data, cluster_cnt):
    """Fit a K-Means model with ``cluster_cnt`` clusters and return the fitted model.

    Despite the name, the return value is the model object itself, not an
    array of cluster ids; callers obtain ids from the model afterwards.
    """
    # _a fixed random_state is passed so that repeated runs cluster the same way
    clusterer = KMeans(n_clusters=cluster_cnt, random_state=1234)
    clusterer.fit(sub_sequence_data)
    return clusterer

In [34]:
def get_assigned_words(seq_clusters, cluster_words):
    """Replace every cluster id in ``seq_clusters`` by its word.

    Cluster id ``k`` is replaced by ``cluster_words[k]``. An id that has no
    entry in ``cluster_words`` is never replaced and stays in the result as
    the id itself.
    """
    # _start with the word of cluster 0, then overwrite the positions of each
    # _further cluster in turn using numpy where
    labelled = np.where(seq_clusters == 0, cluster_words[0], seq_clusters)
    for cluster_id, word in enumerate(cluster_words[1:], start=1):
        labelled = np.where(seq_clusters == cluster_id, word, labelled)

    return labelled

In [35]:
def generate_cluster_names(sequence_names, cluster_cnt=100):
    """Build the word vocabulary of every sequence.

    Returns a dict that maps each name in ``sequence_names`` to the list
    ``['<name>_0', '<name>_1', ..., '<name>_<cluster_cnt - 1>']``.
    """
    return {
        name: [name + '_' + str(number) for number in range(cluster_cnt)]
        for name in sequence_names
    }

In [36]:
# _windowing parameters: each window holds window_length samples and
# _window_overlap is half of that
window_length = 10
window_overlap = window_length // 2

# _every column except the timestamp: activityID first, then the signal columns
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)

# _one (initially empty) list of windows per sequence, for each of the two inputs
sub_sequences = [[] for _ in range(num_of_subsequences)]
sub_sequences_101 = [[] for _ in range(num_of_subsequences)]

In [37]:
# _slide a window of window_length rows over main_df and keep only the windows
# _whose rows all carry the same activityID

# _start from empty lists so that re-running this cell does not append the same
# _windows a second time
sub_sequences = [[] for _ in range(num_of_subsequences)]

window_index = 0
max_window_index = len(main_df.index)

while window_index <= (max_window_index - window_length):

    # _iloc makes the positional (row number) slicing explicit
    window_df = main_df.iloc[window_index:window_index + window_length]
    activity_sequence = window_df[sequence_names[0]].tolist()

    # _a window that spans an activity change has no single label, so it is skipped
    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])

        for idx in range(1, num_of_subsequences):
            sub_sequences[idx].append(window_df[sequence_names[idx]].tolist())

    # _NOTE(review): window_overlap is used as the step between window starts; step and
    # _overlap only coincide when the overlap is exactly half of window_length
    window_index += window_overlap

# _converting into numpy arrays
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)

(6, 57624, 10)


In [38]:
# _slide a window of window_length rows over subject101_df and keep only the
# _windows whose rows all carry the same activityID

# _start from empty lists so that re-running this cell does not append the same
# _windows a second time
sub_sequences_101 = [[] for _ in range(num_of_subsequences)]

window_index = 0
max_window_index = len(subject101_df.index)

while window_index <= (max_window_index - window_length):

    # _iloc makes the positional (row number) slicing explicit
    window_df = subject101_df.iloc[window_index:window_index + window_length]
    activity_sequence = window_df[sequence_names[0]].tolist()

    # _a window that spans an activity change has no single label, so it is skipped
    if len(set(activity_sequence)) == 1:
        sub_sequences_101[0].append(activity_sequence[0])

        for idx in range(1, num_of_subsequences):
            sub_sequences_101[idx].append(window_df[sequence_names[idx]].tolist())

    # _NOTE(review): window_overlap is used as the step between window starts; step and
    # _overlap only coincide when the overlap is exactly half of window_length
    window_index += window_overlap

# _converting into numpy arrays
np_sequences_101 = np.asarray(sub_sequences_101[1:])
print(np_sequences_101.shape)

(6, 14638, 10)


# Assigning a cluster word to each subsequence window


In [68]:
# _document dataframe to store word assignments of each window
# _one row per kept window of the subject data: activityID is filled in here, the
# _remaining columns stay empty until the clustering loop writes the words into them
doc_df = pd.DataFrame(columns=col_names[1:])
doc_df['activityID'] = sub_sequences_101[0]
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,,,,,,
1,1,,,,,,
2,1,,,,,,
3,1,,,,,,
4,1,,,,,,
...,...,...,...,...,...,...,...
14633,3,,,,,,
14634,3,,,,,,
14635,3,,,,,,
14636,3,,,,,,


In [69]:
# _every signal column gets the same number of clusters, i.e. the same vocabulary size
cluster_cnts = 100
words_dict = generate_cluster_names(sequence_names[1:], cluster_cnts)

# _after observing the clustering result with elbow method, we are manually choosing the n_clusters value.
# _testing: elbow_techique(np_sequences[0])
sequence_cluster_cnts = {seq_name: cluster_cnts for seq_name in words_dict}
print(sequence_cluster_cnts)

{'X1': 100, 'Y1': 100, 'Z1': 100, 'X2': 100, 'Y2': 100, 'Z2': 100}


In [70]:
# _for every signal column: fit K-Means on the windows of the main data, label the
# _subject's windows with the fitted model and store the matching words in doc_df
# _NOTE(review): idx indexes np_sequences / np_sequences_101, so this relies on
# _sequence_cluster_cnts iterating in the same column order as those arrays - confirm
# _NOTE(review): the displayed main_df values are around 0.4 while the subject file shows
# _values around -8.6; if the two inputs are on different scales most windows will land
# _in a single cluster - confirm both files went through the same preprocessing
for idx, (seq, cluster_cnt) in enumerate(sequence_cluster_cnts.items()):
    
    # _perform k means clustering on subsequences
    KMeans_models = get_cluster_ids(np_sequences[idx], cluster_cnt)
    seq_clusters = KMeans_models.predict(np_sequences_101[idx])
    
    cluster_words = words_dict[seq][:cluster_cnt]
    
    #print(f'{idx} -- {seq} -- {cluster_cnt} -- {cluster_words} -- {set(seq_clusters)}')
    
    # _get assigned words and fill the values in doc_df
    assigned_words = get_assigned_words(seq_clusters, cluster_words)
    doc_df[seq] = assigned_words

doc_df

In [71]:
# _show the word assignments stored in doc_df
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_43
1,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_43
2,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_0
3,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_0
4,1,X1_36,Y1_37,Z1_22,X2_10,Y2_68,Z2_43
...,...,...,...,...,...,...,...
14633,3,X1_36,Y1_96,Z1_22,X2_10,Y2_68,Z2_0
14634,3,X1_36,Y1_96,Z1_22,X2_10,Y2_68,Z2_0
14635,3,X1_36,Y1_96,Z1_22,X2_56,Y2_68,Z2_0
14636,3,X1_36,Y1_96,Z1_22,X2_56,Y2_68,Z2_0


In [72]:
# doc_df.drop_duplicates(keep=False, inplace=True)
# doc_df

# Finding the statistics of the clusters (mean, variance, skewness)

In [73]:
# _column labels Col_1 .. Col_<window_length + 1> for the per-sequence tables:
# _one column for the assigned word plus one per sample of the window
window_cols = [f'Col_{idx}' for idx in range(1, window_length + 2)]

In [74]:
def clust_subseq(X,i):
    """Put the assigned words of one sequence next to the window samples.

    X : column of ``doc_df`` that holds the assigned words
    i : position of the same sequence in ``np_sequences_101``

    Returns a new DataFrame labelled with ``window_cols``: the word column
    first, followed by the samples of each window.
    """
    windows_df = pd.DataFrame(np_sequences_101[i])
    # _axis=1 places the word column and the samples side by side (matched on the row index)
    combined = pd.concat([pd.DataFrame(doc_df[X]), windows_df], axis=1)
    # _assign a fresh list so that the shared window_cols list is not tied to this frame
    combined.columns = list(window_cols)
    return combined

In [75]:
# _one table per signal column; the value of each entry is the position of that
# _column in np_sequences_101
di = {'X1':0,'Y1':1,'Z1':2,'X2':3,'Y2':4,'Z2':5}

cluster_subseq = [clust_subseq(name, position) for name, position in di.items()]
cluster_subseq

[       Col_1    Col_2    Col_3    Col_4    Col_5    Col_6    Col_7    Col_8  \
 0      X1_36 -8.66987 -8.66479 -8.63663 -8.52192 -8.52085 -8.75748 -8.59750   
 1      X1_36 -8.75748 -8.59750 -8.58948 -8.71087 -8.63395 -8.71033 -8.70793   
 2      X1_36 -8.71033 -8.70793 -8.63422 -8.64224 -8.70953 -8.71167 -8.74518   
 3      X1_36 -8.71167 -8.74518 -8.81996 -8.86390 -8.59563 -8.59643 -8.48092   
 4      X1_36 -8.59643 -8.48092 -8.48039 -8.56025 -8.63368 -8.59750 -8.55971   
 ...      ...      ...      ...      ...      ...      ...      ...      ...   
 14633  X1_36 -9.22136 -9.22163 -8.99356 -9.03269 -9.10479 -9.11763 -9.10880   
 14634  X1_36 -9.11763 -9.10880 -9.22110 -9.10586 -9.07048 -9.22377 -8.99302   
 14635  X1_36 -9.22377 -8.99302 -9.02653 -9.14659 -9.03643 -9.02975 -9.03429   
 14636  X1_36 -9.02975 -9.03429 -9.13670 -9.14686 -9.14285 -9.06994 -9.14071   
 14637  X1_36 -9.06994 -9.14071 -9.10559 -9.07609 -9.02279 -9.06513 -9.13990   
 
          Col_9   Col_10   Col_11  
 0

In [76]:
def clusterstatistics(Y,X):
    """Summarise the windows that were assigned the word ``X``.

    Y : table whose ``Col_1`` column holds the assigned word and whose
        remaining ``window_cols`` columns hold the window samples
    X : word (cluster name) to select

    Returns a DataFrame with one row per sample column and the columns
    ``Mean``, ``Variance``, ``Skewness`` and ``IQR``.
    """
    members = Y.loc[Y['Col_1'] == X]

    # _the first entry of window_cols is the word column, the rest are the samples
    sample_cols = window_cols[1:]

    rows = []
    for col_name in sample_cols:
        samples = members[col_name]
        rows.append((
            samples.mean(),
            samples.var(),
            samples.skew(),
            stats.iqr(samples, interpolation = 'midpoint'),
        ))

    return pd.DataFrame(rows, columns=["Mean", "Variance", "Skewness", "IQR"])

In [77]:
# _vocabulary of every sequence, keyed by the position of its table in cluster_subseq
dic = {0:words_dict['X1'],
       1:words_dict['Y1'],
       2:words_dict['Z1'],
       3:words_dict['X2'],
       4:words_dict['Y2'],
       5:words_dict['Z2']
      }

# _one statistics table per word: all words of the first sequence, then the second, ...
statistic = [
    clusterstatistics(cluster_subseq[seq_pos], word)
    for seq_pos in range(len(cluster_subseq))
    for word in dic[seq_pos]
]
print(statistic)

[   Mean  Variance  Skewness  IQR
0   NaN       NaN       NaN  NaN
1   NaN       NaN       NaN  NaN
2   NaN       NaN       NaN  NaN
3   NaN       NaN       NaN  NaN
4   NaN       NaN       NaN  NaN
5   NaN       NaN       NaN  NaN
6   NaN       NaN       NaN  NaN
7   NaN       NaN       NaN  NaN
8   NaN       NaN       NaN  NaN
9   NaN       NaN       NaN  NaN,        Mean  Variance  Skewness       IQR
0 -0.145596  0.021248  0.590628  0.199119
1 -0.159025  0.014511 -0.634953  0.176913
2 -0.154974  0.006374  1.035071  0.124975
3 -0.174837  0.005408 -0.675412  0.045569
4 -0.208308  0.014049  0.960969  0.156496
5 -0.249097  0.004748 -0.008336  0.079244
6 -0.225776  0.008147 -0.452653  0.090559
7 -0.220171  0.007253 -1.599102  0.060600
8 -0.205068  0.034485  0.556785  0.212997
9 -0.177914  0.017453 -0.583786  0.060616,    Mean  Variance  Skewness  IQR
0   NaN       NaN       NaN  NaN
1   NaN       NaN       NaN  NaN
2   NaN       NaN       NaN  NaN
3   NaN       NaN       NaN  NaN
4   NaN

In [78]:
# _turn every statistics table into a single flat row (one embedding per word)
# _after the transpose each row holds one statistic, so flattening row by row gives
# _all means first, then all variances, all skewness values and all IQRs
allvalues = [stat_df.T for stat_df in statistic]

# _reshape instead of stack(): stack() can leave out NaN entries (dropna=True is its
# _long-standing default), which makes the row of a partly-NaN table shorter and moves
# _the remaining values into the wrong positions
allvalues1 = [
    pd.DataFrame(transposed.to_numpy().reshape(1, -1))
    for transposed in allvalues
]

In [79]:
# _write the embeddings as plain comma-separated rows, without index and header
embeddings_filepath = os.getcwd() + '/../../data/sub_sequence_output/word_embeddings_from_clusters.txt'
embeddings_df = pd.concat(allvalues1)
embeddings_df.to_csv(embeddings_filepath, index=False, header=False)

In [80]:
# _combine individual words as documents
# _the values of the columns col_names[2:] of every row are cast to str and joined with
# _single spaces; the result is stored in doc_df as the new column final_sub_sequence
doc_df['final_sub_sequence'] = doc_df[col_names[2:]].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2,final_sub_sequence
0,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_43,X1_36 Y1_37 Z1_22 X2_10 Y2_0 Z2_43
1,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_43,X1_36 Y1_37 Z1_22 X2_10 Y2_0 Z2_43
2,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_0,X1_36 Y1_37 Z1_22 X2_10 Y2_0 Z2_0
3,1,X1_36,Y1_37,Z1_22,X2_10,Y2_0,Z2_0,X1_36 Y1_37 Z1_22 X2_10 Y2_0 Z2_0
4,1,X1_36,Y1_37,Z1_22,X2_10,Y2_68,Z2_43,X1_36 Y1_37 Z1_22 X2_10 Y2_68 Z2_43
...,...,...,...,...,...,...,...,...
14633,3,X1_36,Y1_96,Z1_22,X2_10,Y2_68,Z2_0,X1_36 Y1_96 Z1_22 X2_10 Y2_68 Z2_0
14634,3,X1_36,Y1_96,Z1_22,X2_10,Y2_68,Z2_0,X1_36 Y1_96 Z1_22 X2_10 Y2_68 Z2_0
14635,3,X1_36,Y1_96,Z1_22,X2_56,Y2_68,Z2_0,X1_36 Y1_96 Z1_22 X2_56 Y2_68 Z2_0
14636,3,X1_36,Y1_96,Z1_22,X2_56,Y2_68,Z2_0,X1_36 Y1_96 Z1_22 X2_56 Y2_68 Z2_0


In [81]:
# _save the combined values to text files
# _one file per activity id, one document per line, without index and header
for activity in doc_df['activityID'].unique():
    output_filepath = os.getcwd() + '/../../data/sub_sequence_output/activity_subseq_' + str(activity) + '.txt'
    activity_mask = doc_df['activityID'] == activity
    doc_df.loc[activity_mask, ['final_sub_sequence']].to_csv(output_filepath, sep='\t', index=False, header=False)