In [1]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _shared constants: column labels applied to the csv and the location of that csv
col_names = ['timestamp (s)', 'activityID', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
input_file_path = os.getcwd() + '/../data/processed_data.csv'

In [3]:
# _read the preprocessed csv (no header row is read from the file) and label its columns with col_names
main_df = pd.read_csv(input_file_path, names=col_names, header=None)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,2.21530,8.27915,5.58753,-0.004750,0.037579,-0.011145
1,37.67,1,2.29196,7.67288,5.74467,-0.171710,0.025479,-0.009538
2,37.68,1,2.29090,7.14240,5.82342,-0.238241,0.011214,0.000831
3,37.69,1,2.21800,7.14365,5.89930,-0.192912,0.019053,0.013374
4,37.70,1,2.30106,7.25857,6.09259,-0.069961,-0.018328,0.004582
...,...,...,...,...,...,...,...,...
288214,974.50,3,-1.99794,3.94300,9.15686,-0.112651,-0.003501,-0.066523
288215,974.51,3,-2.00276,4.20689,8.96346,-0.179092,0.020300,-0.010637
288216,974.52,3,-1.96042,4.09413,9.11842,-0.285215,0.010714,-0.017346
288217,974.53,3,-1.97860,3.78659,8.65734,-0.347098,0.040476,-0.020536


In [4]:
# _check whether every timestamp value occurs only once in main_df
print(main_df['timestamp (s)'].is_unique)

# performing z-score normalization for X1, Y1, Z1, X2, Y2, Z2

def normalization(X):
    """Return the z-scored values of one column of the global ``main_df``.

    Parameters
    ----------
    X : str
        Label of the column of ``main_df`` to normalize.

    Returns
    -------
    pandas.Series
        ``(column - column mean) / column standard deviation``; ``main_df``
        itself is not modified.
    """
    column = main_df[X]
    centered = column - column.mean()
    return centered / column.std()

False


In [5]:
# _spot check of the helper on the X1 column; the result is only displayed, not stored
normalization('X1')

0         0.554688
1         0.567517
2         0.567339
3         0.555140
4         0.569040
            ...   
288214   -0.150352
288215   -0.151158
288216   -0.144073
288217   -0.147116
288218   -0.173261
Name: X1, Length: 288219, dtype: float64

# Generating subsequences for each sequence of the data

In [6]:
def elbow_techique(sub_sequence_data, max_k=9):
    """Plot the k-means inertia for k = 1..max_k to eyeball the elbow point.

    Only meant for manual testing: the chosen cluster count is read off the
    plot by hand.

    Parameters
    ----------
    sub_sequence_data : array-like
        Samples to cluster, one row per sample (passed straight to
        ``KMeans.fit``).
    max_k : int, optional
        Largest number of clusters to try. The default of 9 reproduces the
        original ``range(1, 10)`` sweep.

    Returns
    -------
    None
        The function only draws the figure.
    """
    # _the same k values are used for fitting and for the x axis of the plot
    K = list(range(1, max_k + 1))

    distortions = []
    for k in K:
        # _same seed as get_cluster_ids so the curve matches the final clustering
        kmeanModel = KMeans(n_clusters=k, random_state=1234)
        kmeanModel.fit(sub_sequence_data)
        distortions.append(kmeanModel.inertia_)

    # _plot the distortions to observe the elbow point from the graph
    plt.figure(figsize=(16,8))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [7]:
def get_cluster_ids(sub_sequence_data, cluster_cnt):
    """Cluster ``sub_sequence_data`` with k-means and return the cluster id of each sample.

    Parameters
    ----------
    sub_sequence_data : array-like
        Samples to cluster, one row per sample.
    cluster_cnt : int
        Number of clusters to fit.

    Returns
    -------
    The value returned by ``KMeans.fit_predict`` for the given samples.
    """
    # _a fixed random state keeps the cluster ids reproducible between runs
    clusterer = KMeans(random_state=1234, n_clusters=cluster_cnt)
    cluster_ids = clusterer.fit_predict(sub_sequence_data)
    return cluster_ids

In [8]:
def get_assigned_words(seq_clusters, cluster_words):
    """Replace each cluster id in ``seq_clusters`` by the word at that position in ``cluster_words``.

    Parameters
    ----------
    seq_clusters : numpy.ndarray
        Cluster ids, one per subsequence.
    cluster_words : list of str
        Words indexed by cluster id; must hold at least one word.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``seq_clusters`` holding the assigned words.
        Ids without a word in ``cluster_words`` are left as they are.
    """
    # _cluster 0 seeds the result, every further word is patched in with numpy where
    labelled = np.where(seq_clusters == 0, cluster_words[0], seq_clusters)
    for cluster_id, word in enumerate(cluster_words[1:], start=1):
        labelled = np.where(seq_clusters == cluster_id, word, labelled)

    return labelled

In [9]:
def generating_numbers(sequence_names):
    """Build the candidate cluster words for every sequence name.

    Each word is the sequence name, an underscore and one letter of
    ``'ABCDEFGHIJ'``, so every name gets ten words.

    Parameters
    ----------
    sequence_names : iterable of str
        Names used both as dictionary keys and as word prefixes.

    Returns
    -------
    dict
        Maps each name to its list of ten words, ordered ``_A`` to ``_J``.
    """
    alphabet = 'ABCDEFGHIJ'
    return {
        seq: [seq + '_' + letter for letter in alphabet]
        for seq in sequence_names
    }

In [10]:
# _windowing parameters: rows per window and how far the window start moves per step
window_length = 10
window_overlap = 5

# _number of rows available for windowing
max_window_index = main_df.shape[0]

# _activityID plus the six signal columns, each collecting its own list of windows
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)
sub_sequences = [[] for _ in range(num_of_subsequences)]

In [11]:
window_index = 0
last_window_start = max_window_index - window_length

# _slide a window of window_length rows over main_df, moving window_overlap rows per step
while window_index <= last_window_start:

    window_rows = slice(window_index, window_index + window_length)
    activity_sequence = main_df[sequence_names[0]][window_rows].tolist()

    # _windows that mix several activity ids are skipped
    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])

        for idx in range(1, num_of_subsequences):
            column_window = main_df[sequence_names[idx]][window_rows].tolist()
            sub_sequences[idx].append(column_window)

    window_index += window_overlap

# _stack the windows of the signal columns (everything but activityID) into one numpy array
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)



(6, 57624, 10)


In [12]:
# _display the stacked window array (signal column x window x position in window)
np_sequences


array([[[ 2.21530e+00,  2.29196e+00,  2.29090e+00, ...,  2.32815e+00,
          2.25096e+00,  2.14107e+00],
        [ 2.07165e+00,  2.41148e+00,  2.32815e+00, ...,  2.28661e+00,
          2.21585e+00,  2.24722e+00],
        [ 2.36727e+00,  2.43617e+00,  2.28661e+00, ...,  2.23039e+00,
          2.24697e+00,  2.14778e+00],
        ...,
        [-2.13404e+00, -2.20240e+00, -2.13324e+00, ..., -2.16862e+00,
         -2.06862e+00, -2.07183e+00],
        [-2.21443e+00, -2.13270e+00, -2.16862e+00, ..., -1.83386e+00,
         -1.79260e+00, -2.00730e+00],
        [-2.03003e+00, -1.75508e+00, -1.83386e+00, ..., -2.04108e+00,
         -2.14776e+00, -1.99794e+00]],

       [[ 8.27915e+00,  7.67288e+00,  7.14240e+00, ...,  7.63431e+00,
          7.78598e+00,  7.52262e+00],
        [ 7.25965e+00,  7.59780e+00,  7.63431e+00, ...,  7.67154e+00,
          7.59758e+00,  7.44419e+00],
        [ 7.63436e+00,  7.63210e+00,  7.67154e+00, ...,  6.94763e+00,
          6.83817e+00,  6.76686e+00],
        ...,


In [13]:
# _document dataframe to store word assignments of each window
# _the frame starts with the columns of col_names[1:] and no rows; assigning the
# _per-window activity ids creates the rows, the word columns stay empty until
# _the clustering cell fills them
doc_df = pd.DataFrame(columns=col_names[1:])
doc_df['activityID'] = sub_sequences[0]
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,,,,,,
1,1,,,,,,
2,1,,,,,,
3,1,,,,,,
4,1,,,,,,
...,...,...,...,...,...,...,...
57619,3,,,,,,
57620,3,,,,,,
57621,3,,,,,,
57622,3,,,,,,


In [14]:
# _candidate words for every signal column (activityID, the first sequence name, is skipped)
words_dict = generating_numbers(sequence_names[1:])
print(words_dict, end='\n\n')

# _after observing the clustering result with elbow method, we are manually choosing the n_clusters value.
# _testing: elbow_techique(np_sequences[0])
sequence_cluster_cnts = {
    'X1': 3,
    'Y1': 4,
    'Z1': 3,
    'X2': 3,
    'Y2': 3,
    'Z2': 3,
}
print(sequence_cluster_cnts)

{'X1': ['X1_A', 'X1_B', 'X1_C', 'X1_D', 'X1_E', 'X1_F', 'X1_G', 'X1_H', 'X1_I', 'X1_J'], 'Y1': ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D', 'Y1_E', 'Y1_F', 'Y1_G', 'Y1_H', 'Y1_I', 'Y1_J'], 'Z1': ['Z1_A', 'Z1_B', 'Z1_C', 'Z1_D', 'Z1_E', 'Z1_F', 'Z1_G', 'Z1_H', 'Z1_I', 'Z1_J'], 'X2': ['X2_A', 'X2_B', 'X2_C', 'X2_D', 'X2_E', 'X2_F', 'X2_G', 'X2_H', 'X2_I', 'X2_J'], 'Y2': ['Y2_A', 'Y2_B', 'Y2_C', 'Y2_D', 'Y2_E', 'Y2_F', 'Y2_G', 'Y2_H', 'Y2_I', 'Y2_J'], 'Z2': ['Z2_A', 'Z2_B', 'Z2_C', 'Z2_D', 'Z2_E', 'Z2_F', 'Z2_G', 'Z2_H', 'Z2_I', 'Z2_J']}

{'X1': 3, 'Y1': 4, 'Z1': 3, 'X2': 3, 'Y2': 3, 'Z2': 3}


In [15]:
# _np_sequences stacks the signal columns in the order of sequence_names[1:], so the
# _loop walks that list and looks the cluster count up by name instead of relying on
# _the insertion order of sequence_cluster_cnts to line up with the array
for idx, seq in enumerate(sequence_names[1:]):
    cluster_cnt = sequence_cluster_cnts[seq]

    # _every cluster id needs its own word, otherwise ids without a word stay numeric
    available_words = len(words_dict[seq])
    if cluster_cnt > available_words:
        raise ValueError(f'{seq}: {cluster_cnt} clusters requested but only {available_words} words available')

    # _perform k means clustering on subsequences
    seq_clusters = get_cluster_ids(np_sequences[idx], cluster_cnt)
    cluster_words = words_dict[seq][:cluster_cnt]

    print(f'{idx} -- {seq} -- {cluster_cnt} -- {cluster_words} -- {set(seq_clusters)}')

    # _get assigned words and fill the values in doc_df
    assigned_words = get_assigned_words(seq_clusters, cluster_words)
    doc_df[seq] = assigned_words

0 -- X1 -- 3 -- ['X1_A', 'X1_B', 'X1_C'] -- {0, 1, 2}
1 -- Y1 -- 4 -- ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'] -- {0, 1, 2, 3}
2 -- Z1 -- 3 -- ['Z1_A', 'Z1_B', 'Z1_C'] -- {0, 1, 2}
3 -- X2 -- 3 -- ['X2_A', 'X2_B', 'X2_C'] -- {0, 1, 2}
4 -- Y2 -- 3 -- ['Y2_A', 'Y2_B', 'Y2_C'] -- {0, 1, 2}
5 -- Z2 -- 3 -- ['Z2_A', 'Z2_B', 'Z2_C'] -- {0, 1, 2}


doc_df

In [16]:
# _inspect the word assigned to each window for every signal column
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
1,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
2,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
3,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
4,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
...,...,...,...,...,...,...,...
57619,3,X1_A,Y1_A,Z1_A,X2_B,Y2_A,Z2_A
57620,3,X1_A,Y1_A,Z1_A,X2_B,Y2_A,Z2_A
57621,3,X1_A,Y1_B,Z1_A,X2_B,Y2_A,Z2_A
57622,3,X1_A,Y1_B,Z1_A,X2_B,Y2_A,Z2_A


# Finding the statistics of the clusters (mean, variance, skewness)


In [17]:
def clust_subseq(X,i):
    """Pair the cluster word of every window with the raw values of that window.

    Reads the globals ``doc_df`` (word assignments) and ``np_sequences``
    (window values).

    Parameters
    ----------
    X : str
        Column of ``doc_df`` holding the assigned words.
    i : int
        Index into the first axis of ``np_sequences`` selecting the windows
        of the same signal.

    Returns
    -------
    pandas.DataFrame
        ``Col_1`` holds the word; ``Col_2`` onwards hold the window values,
        one column per position in the window.
    """
    d1=pd.DataFrame(doc_df[X])
    X1_df = pd.DataFrame(np_sequences[i])
    df=pd.concat([d1,X1_df],axis = 1)
    # _labels follow the real width, so a window length other than 10 no longer breaks the rename
    df.columns = [f'Col_{position}' for position in range(1, df.shape[1] + 1)]
    return df

In [18]:

# _position of every signal column inside np_sequences
di = {name: position for position, name in enumerate(['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2'])}

# _one table of (word, window values) per signal column
cluster_subseq = []
for key, value in di.items():
    last = clust_subseq(key, value)
    cluster_subseq.append(last)
print(cluster_subseq)


[      Col_1    Col_2    Col_3    Col_4    Col_5    Col_6    Col_7    Col_8  \
0      X1_A  2.21530  2.29196  2.29090  2.21800  2.30106  2.07165  2.41148   
1      X1_A  2.07165  2.41148  2.32815  2.25096  2.14107  2.36727  2.43617   
2      X1_A  2.36727  2.43617  2.28661  2.21585  2.24722  2.24615  2.13466   
3      X1_A  2.24615  2.13466  2.23039  2.24697  2.14778  2.11880  2.26540   
4      X1_A  2.11880  2.26540  2.16379  2.15630  2.38759  2.26407  2.30000   
...     ...      ...      ...      ...      ...      ...      ...      ...   
57619  X1_A -1.88180 -2.02226 -2.34660 -2.25873 -2.36997 -2.44341 -2.45196   
57620  X1_A -2.44341 -2.45196 -2.22790 -2.22763 -2.23511 -2.13404 -2.20240   
57621  X1_A -2.13404 -2.20240 -2.13324 -2.13378 -2.21363 -2.21443 -2.13270   
57622  X1_A -2.21443 -2.13270 -2.16862 -2.06862 -2.07183 -2.03003 -1.75508   
57623  X1_A -2.03003 -1.75508 -1.83386 -1.79260 -2.00730 -1.97272 -2.12682   

         Col_9   Col_10   Col_11  
0      2.32815  2.25096  2.

In [19]:
def clusterstatistics(Y,X):
    """Summarize the windows of one cluster, position by position.

    Parameters
    ----------
    Y : pandas.DataFrame
        Table with the cluster word in ``Col_1`` and the window values in
        ``Col_2`` .. ``Col_11``.
    X : str
        Cluster word whose rows are summarized.

    Returns
    -------
    pandas.DataFrame
        Ten rows (one per value column, in order) with the columns
        ``Mean``, ``Variance`` and ``Skewness``.
    """
    # _rows that belong to the requested cluster
    members = Y[Y['Col_1'] == X]

    value_columns = ['Col_2','Col_3','Col_4','Col_5','Col_6','Col_7','Col_8','Col_9','Col_10','Col_11']

    return pd.DataFrame({
        "Mean": [members[column].mean() for column in value_columns],
        "Variance": [members[column].var() for column in value_columns],
        "Skewness": [members[column].skew() for column in value_columns],
    })



In [20]:
# _cluster words of every signal column, keyed by the position of that column in cluster_subseq
dic = {
    0: ['X1_A', 'X1_B', 'X1_C'],
    1: ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'],
    2: ['Z1_A', 'Z1_B', 'Z1_C'],
    3: ['X2_A', 'X2_B', 'X2_C'],
    4: ['Y2_A', 'Y2_B', 'Y2_C'],
    5: ['Z2_A', 'Z2_B', 'Z2_C'],
}

# _one statistics table per cluster word, appended column by column, word by word
statistic = []
for i, labelled_windows in enumerate(cluster_subseq):
    cluster_names = dic[i]
    for j, word in enumerate(cluster_names):
        last1 = clusterstatistics(labelled_windows, word)
        statistic.append(last1)
print(statistic)

[       Mean  Variance  Skewness
0 -0.851347  2.624622  0.004423
1 -0.850585  2.640056  0.295680
2 -0.851709  2.591998  0.076649
3 -0.852774  2.562560  0.011090
4 -0.851001  2.548295  0.028422
5 -0.850043  2.549296  0.027452
6 -0.851028  2.562067  0.040179
7 -0.851836  2.574836  0.029221
8 -0.852389  2.598835  0.014165
9 -0.851441  2.626608  0.023876,        Mean  Variance   Skewness
0 -8.475613  0.865696   0.423850
1 -8.478195  0.854038   0.034498
2 -8.474542  1.097363  17.436424
3 -8.482760  1.017682 -12.941729
4 -8.483910  0.918677  -5.718762
5 -8.482379  0.814175   0.005774
6 -8.479945  0.901088   3.055283
7 -8.473883  1.128625  17.415954
8 -8.479888  1.041167 -12.365160
9 -8.477197  0.970646  -4.936737,        Mean  Variance  Skewness
0  6.316558  1.902967 -0.500166
1  6.319447  1.884511 -0.473934
2  6.320733  1.868509 -0.472502
3  6.321930  1.863696 -0.451738
4  6.322995  1.860987 -0.419877
5  6.322123  1.863703 -0.435085
6  6.321976  1.868986 -0.446954
7  6.320320  1.873176 -0.4

In [21]:
# _transpose every statistics table so Mean / Variance / Skewness become the rows
allvalues = []
for i, stat_table in enumerate(statistic):
    l = stat_table.transpose()
    allvalues.append(l)
print(allvalues)

type(allvalues[0])

[                 0         1         2         3         4         5  \
Mean     -0.851347 -0.850585 -0.851709 -0.852774 -0.851001 -0.850043   
Variance  2.624622  2.640056  2.591998  2.562560  2.548295  2.549296   
Skewness  0.004423  0.295680  0.076649  0.011090  0.028422  0.027452   

                 6         7         8         9  
Mean     -0.851028 -0.851836 -0.852389 -0.851441  
Variance  2.562067  2.574836  2.598835  2.626608  
Skewness  0.040179  0.029221  0.014165  0.023876  ,                  0         1          2          3         4         5  \
Mean     -8.475613 -8.478195  -8.474542  -8.482760 -8.483910 -8.482379   
Variance  0.865696  0.854038   1.097363   1.017682  0.918677  0.814175   
Skewness  0.423850  0.034498  17.436424 -12.941729 -5.718762  0.005774   

                 6          7          8         9  
Mean     -8.479945  -8.473883  -8.479888 -8.477197  
Variance  0.901088   1.128625   1.041167  0.970646  
Skewness  3.055283  17.415954 -12.365160 -4.93673

pandas.core.frame.DataFrame

In [23]:
# _flatten every transposed statistics table into a single-row dataframe
allvalues1 = []
for i, wide_stats in enumerate(allvalues):
    flattened = wide_stats.stack().to_frame().values
    aa = pd.DataFrame(flattened).T
    allvalues1.append(aa)
allvalues1

[         0         1         2         3         4         5         6   \
 0 -0.851347 -0.850585 -0.851709 -0.852774 -0.851001 -0.850043 -0.851028   
 
          7         8         9   ...        20       21        22       23  \
 0 -0.851836 -0.852389 -0.851441  ...  0.004423  0.29568  0.076649  0.01109   
 
          24        25        26        27        28        29  
 0  0.028422  0.027452  0.040179  0.029221  0.014165  0.023876  
 
 [1 rows x 30 columns],
          0         1         2        3        4         5         6   \
 0 -8.475613 -8.478195 -8.474542 -8.48276 -8.48391 -8.482379 -8.479945   
 
          7         8         9   ...       20        21         22         23  \
 0 -8.473883 -8.479888 -8.477197  ...  0.42385  0.034498  17.436424 -12.941729   
 
          24        25        26         27        28        29  
 0 -5.718762  0.005774  3.055283  17.415954 -12.36516 -4.936737  
 
 [1 rows x 30 columns],
          0         1         2        3         4      

In [24]:
# _write one line per cluster word, without index or header, to the embeddings file
embeddings_filepath = os.getcwd() + '/../data/sub_sequence_output/word_embeddings_from_clusters.txt'
embeddings_df = pd.concat(allvalues1)
embeddings_df.to_csv(embeddings_filepath, header=False, index=False)

In [None]:
# _drop rows that occur more than once in doc_df
# NOTE(review): keep=False removes every copy of a duplicated row, the first
# occurrence included - confirm that this is intended rather than keep='first'
doc_df = doc_df.drop_duplicates(keep=False)
doc_df

In [None]:
# _combine individual words as documents
def _join_row_words(row):
    """Join the words of one doc_df row into a single space separated string."""
    return ' '.join(row.values.astype(str))

word_columns = col_names[2:]
doc_df['final_sub_sequence'] = doc_df[word_columns].apply(_join_row_words, axis=1)
doc_df

In [None]:
# _save the combined values to text files, one file per activity id
for activity in doc_df['activityID'].unique():
    output_filepath = os.getcwd() + '/../data/sub_sequence_output/activity_subseq_' + str(activity) + '.txt'
    is_activity = doc_df['activityID'] == activity
    activity_documents = doc_df.loc[is_activity, ['final_sub_sequence']]
    activity_documents.to_csv(output_filepath, sep='\t', header=False, index=False)