In [1]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats 

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _constants shared by the rest of the notebook: input location and column layout
input_file_path = os.getcwd() + '/../data/processed_data.csv'
col_names = [
    'timestamp (s)', 'activityID',
    'X1', 'Y1', 'Z1',
    'X2', 'Y2', 'Z2',
]

In [3]:
# _read the preprocessed CSV (no header row expected) into the main dataframe
main_df = pd.read_csv(
    input_file_path,
    names=col_names,
    header=None,
)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,0.418461,0.925518,0.472759,-0.035942,0.138344,-0.011430
1,37.67,1,0.432534,0.752532,0.514698,-0.556016,0.093168,-0.006729
2,37.68,1,0.432340,0.601171,0.535716,-0.763257,0.039907,0.023608
3,37.69,1,0.418957,0.601528,0.555967,-0.622059,0.069178,0.060307
4,37.70,1,0.434205,0.634318,0.607554,-0.239073,-0.070386,0.034582
...,...,...,...,...,...,...,...,...
288214,974.50,3,-0.356667,0.023598,1.269533,-0.233579,-0.058947,-0.165780
288215,974.51,3,-0.357490,0.101736,1.209994,-0.369190,0.018471,-0.029929
288216,974.52,3,-0.350256,0.068348,1.257699,-0.585796,-0.012709,-0.046238
288217,974.53,3,-0.353363,-0.022715,1.115753,-0.712105,0.084094,-0.053992


In [4]:
# _check whether every timestamp occurs only once in the loaded data
timestamps = main_df['timestamp (s)']
print(timestamps.is_unique)

#performing normalization for X1,Y1,Z1,X2,Y2,Z2

def normalization(X, df=None):
    """Return the z-score normalized values of one column.

    Each value has the column mean subtracted and is divided by the column
    standard deviation (pandas' default sample standard deviation).

    Parameters
    ----------
    X : str
        Name of the column to normalize.
    df : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``main_df``
        so existing ``normalization('X1')`` calls keep working.

    Returns
    -------
    pandas.Series
        Normalized values, indexed like the input column. The source frame is
        not modified.
    """
    # _fall back to the notebook's main dataframe when no frame is passed in
    if df is None:
        df = main_df

    column = df[X]
    normalized_values = (column - column.mean()) / column.std()
    return normalized_values

False


In [5]:
# _displays the normalized X1 values only; the result is not assigned back to main_df
normalization('X1')

0         0.418460
1         0.432534
2         0.432339
3         0.418956
4         0.434204
            ...   
288214   -0.356666
288215   -0.357490
288216   -0.350256
288217   -0.353362
288218   -0.380055
Name: X1, Length: 288219, dtype: float64

# Generating subsequences for each sequence of the data

In [6]:
def elbow_techique(sub_sequence_data):
    """Plot the K-means inertia for k = 1..9 to help pick a cluster count.

    Parameters
    ----------
    sub_sequence_data : array-like
        Data passed to ``KMeans.fit``, one row per window.

    Returns
    -------
    None
        The function only draws a matplotlib figure.
    """
    # _only for manual testing
    # _candidate cluster counts; also used as the x axis of the plot
    K = range(1, 10)
    distortions = []
    for k in K:
        # _same random_state as get_cluster_ids so the curve is reproducible
        kmeanModel = KMeans(n_clusters=k, random_state=1234)
        kmeanModel.fit(sub_sequence_data)
        distortions.append(kmeanModel.inertia_)

    # _plot the distortions to observe the elbow point from the graph
    plt.figure(figsize=(16,8))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [7]:
def get_cluster_ids(sub_sequence_data, cluster_cnt):
    """Cluster the given windows with K-means and return one cluster id per row.

    ``cluster_cnt`` is the number of clusters to fit.
    """
    # _a fixed random_state keeps the KMeans result deterministic between runs
    clusterer = KMeans(random_state=1234, n_clusters=cluster_cnt)
    cluster_ids = clusterer.fit_predict(sub_sequence_data)
    return cluster_ids

In [8]:
def get_assigned_words(seq_clusters, cluster_words):
    """Translate cluster ids into their words.

    Cluster id ``n`` is replaced by ``cluster_words[n]``; ids that have no
    matching word are left as they are (converted to text by numpy).
    """
    # _start with cluster 0, then overwrite the positions of every other cluster
    labelled = np.where(seq_clusters == 0, cluster_words[0], seq_clusters)
    for cluster_id, word in enumerate(cluster_words[1:], start=1):
        labelled = np.where(seq_clusters == cluster_id, word, labelled)

    return labelled

In [9]:
def generating_numbers(sequence_names):
    """Build the candidate words of every sequence.

    Returns a dict mapping each sequence name to ten words of the form
    ``<name>_<letter>`` with letters ``A`` to ``J``.
    """
    # _each word is the sequence name as prefix plus a unique cluster letter
    alphabet = 'ABCDEFGHIJ'
    return {
        seq: [seq + '_' + letter for letter in alphabet]
        for seq in sequence_names
    }

In [10]:
# _windowing parameters and per-column containers
window_length = 10      # samples in one window
window_overlap = 5      # added to the window start after every window
max_window_index = len(main_df)

# _activityID followed by the six sensor axes (the timestamp column is left out)
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)
sub_sequences = [[] for _ in range(num_of_subsequences)]

In [11]:
# _start from empty per-column lists so that re-running this cell rebuilds the
# _windows instead of appending a second copy to lists filled by a previous run
sub_sequences = [[] for _ in range(num_of_subsequences)]
window_index = 0

while window_index <= (max_window_index - window_length):
    window_end = window_index + window_length
    activity_sequence = main_df[sequence_names[0]][window_index:window_end].tolist()

    # _keep the window only when all of its samples carry the same activityID
    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])

        # _store the same window for every remaining column
        for idx in range(1, num_of_subsequences):
            sub_sequences[idx].append(main_df[sequence_names[idx]][window_index:window_end].tolist())

    # _window_overlap is the step between consecutive window starts
    window_index += window_overlap

# _converting into numpy arrays: one (windows x window_length) matrix per sensor column
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)



(6, 57624, 10)


In [12]:
# _inspect the stacked window array built above
np_sequences


array([[[ 0.41846107,  0.4325343 ,  0.43233971, ...,  0.43917806,
          0.42500753,  0.40483393],
        [ 0.39208982,  0.45447577,  0.43917806, ...,  0.43155215,
          0.41856204,  0.42432094],
        [ 0.44635971,  0.45900836,  0.43155215, ...,  0.42123129,
          0.42427504,  0.40606575],
        ...,
        [-0.3799194 , -0.39159868, -0.37978272, ..., -0.38582738,
         -0.36874242, -0.36929084],
        [-0.393654  , -0.37969046, -0.38582738, ..., -0.32863375,
         -0.3215845 , -0.35826592],
        [-0.36214933, -0.31517422, -0.32863375, ..., -0.36403722,
         -0.38226346, -0.35666676]],

       [[ 0.92551774,  0.75253208,  0.60117141, ...,  0.74152699,
          0.78480265,  0.70965873],
        [ 0.63462609,  0.73110967,  0.74152699, ...,  0.75214974,
          0.7310469 ,  0.68728047],
        [ 0.74154125,  0.74089641,  0.75214974, ...,  0.54559812,
          0.51436614,  0.49401942],
        ...,
        [-0.03390147,  0.04530534,  0.0558702 , ..., -

In [13]:
# _document dataframe: one row per kept window, activity id filled in now,
# _the word columns stay empty until the clustering step assigns them
doc_df = pd.DataFrame(columns=col_names[1:]).assign(activityID=sub_sequences[0])
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,,,,,,
1,1,,,,,,
2,1,,,,,,
3,1,,,,,,
4,1,,,,,,
...,...,...,...,...,...,...,...
57619,3,,,,,,
57620,3,,,,,,
57621,3,,,,,,
57622,3,,,,,,


In [14]:
# _candidate words for every sensor axis (the activityID entry is skipped)
sensor_axes = sequence_names[1:]
words_dict = generating_numbers(sensor_axes)
print(words_dict, end='\n\n')

# _after observing the clustering result with elbow method, we are manually choosing the n_clusters value.
# _testing: elbow_techique(np_sequences[0])
sequence_cluster_cnts = {
    'X1': 3, 'Y1': 4, 'Z1': 3,
    'X2': 3, 'Y2': 3, 'Z2': 3,
}
print(sequence_cluster_cnts)

{'X1': ['X1_A', 'X1_B', 'X1_C', 'X1_D', 'X1_E', 'X1_F', 'X1_G', 'X1_H', 'X1_I', 'X1_J'], 'Y1': ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D', 'Y1_E', 'Y1_F', 'Y1_G', 'Y1_H', 'Y1_I', 'Y1_J'], 'Z1': ['Z1_A', 'Z1_B', 'Z1_C', 'Z1_D', 'Z1_E', 'Z1_F', 'Z1_G', 'Z1_H', 'Z1_I', 'Z1_J'], 'X2': ['X2_A', 'X2_B', 'X2_C', 'X2_D', 'X2_E', 'X2_F', 'X2_G', 'X2_H', 'X2_I', 'X2_J'], 'Y2': ['Y2_A', 'Y2_B', 'Y2_C', 'Y2_D', 'Y2_E', 'Y2_F', 'Y2_G', 'Y2_H', 'Y2_I', 'Y2_J'], 'Z2': ['Z2_A', 'Z2_B', 'Z2_C', 'Z2_D', 'Z2_E', 'Z2_F', 'Z2_G', 'Z2_H', 'Z2_I', 'Z2_J']}

{'X1': 3, 'Y1': 4, 'Z1': 3, 'X2': 3, 'Y2': 3, 'Z2': 3}


In [15]:
# _the position of each key in sequence_cluster_cnts selects the matching slice of np_sequences
for idx, seq in enumerate(sequence_cluster_cnts):
    cluster_cnt = sequence_cluster_cnts[seq]

    # _perform k means clustering on the windows of this axis
    seq_clusters = get_cluster_ids(np_sequences[idx], cluster_cnt)
    # _only as many words as there are clusters are needed
    cluster_words = words_dict[seq][:cluster_cnt]

    print(f'{idx} -- {seq} -- {cluster_cnt} -- {cluster_words} -- {set(seq_clusters)}')

    # _translate cluster ids to words and store them in the axis column of doc_df
    doc_df[seq] = get_assigned_words(seq_clusters, cluster_words)

0 -- X1 -- 3 -- ['X1_A', 'X1_B', 'X1_C'] -- {0, 1, 2}
1 -- Y1 -- 4 -- ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'] -- {0, 1, 2, 3}
2 -- Z1 -- 3 -- ['Z1_A', 'Z1_B', 'Z1_C'] -- {0, 1, 2}
3 -- X2 -- 3 -- ['X2_A', 'X2_B', 'X2_C'] -- {0, 1, 2}
4 -- Y2 -- 3 -- ['Y2_A', 'Y2_B', 'Y2_C'] -- {0, 1, 2}
5 -- Z2 -- 3 -- ['Z2_A', 'Z2_B', 'Z2_C'] -- {0, 1, 2}


doc_df

In [16]:
# _inspect the word assigned to every window for each sensor axis
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
1,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
2,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
3,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
4,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
...,...,...,...,...,...,...,...
57619,3,X1_B,Y1_B,Z1_A,X2_A,Y2_A,Z2_A
57620,3,X1_B,Y1_B,Z1_A,X2_A,Y2_A,Z2_A
57621,3,X1_B,Y1_B,Z1_A,X2_A,Y2_A,Z2_A
57622,3,X1_B,Y1_B,Z1_A,X2_A,Y2_A,Z2_A


# Finding the statistics of the clusters (mean, variance, skewness, IQR)


In [17]:
def clust_subseq(X,i):
    """Pair the cluster word of every window with the window's samples.

    Parameters
    ----------
    X : str
        Column of ``doc_df`` that holds the assigned words (e.g. ``'X1'``).
    i : int
        Index into ``np_sequences`` selecting the windows of the same axis.

    Returns
    -------
    pandas.DataFrame
        ``Col_1`` holds the word, ``Col_2`` onwards hold the samples of the
        window. The number of columns follows the window width.
    """
    # NOTE(review): concat aligns on the index, so this presumably has to run
    # while doc_df still has one row per window - confirm against cell order.
    words = pd.DataFrame(doc_df[X])
    windows = pd.DataFrame(np_sequences[i])
    df = pd.concat([words, windows], axis=1)

    # _name the columns from the actual width instead of a fixed list of 11 names
    df.columns = [f'Col_{position}' for position in range(1, df.shape[1] + 1)]
    return df

In [18]:


# _sensor axis -> position of its windows inside np_sequences
di = {'X1':0,'Y1':1,'Z1':2,'X2':3,'Y2':4,'Z2':5}

# _one words-plus-samples frame per axis, in the order of di
cluster_subseq = [clust_subseq(axis, position) for axis, position in di.items()]
print(cluster_subseq)


[      Col_1     Col_2     Col_3     Col_4     Col_5     Col_6     Col_7  \
0      X1_B  0.418461  0.432534  0.432340  0.418957  0.434205  0.392090   
1      X1_B  0.392090  0.454476  0.439178  0.425008  0.404834  0.446360   
2      X1_B  0.446360  0.459008  0.431552  0.418562  0.424321  0.424125   
3      X1_B  0.424125  0.403657  0.421231  0.424275  0.406066  0.400746   
4      X1_B  0.400746  0.427658  0.409005  0.407630  0.450090  0.427414   
...     ...       ...       ...       ...       ...       ...       ...   
57619  X1_B -0.336824 -0.360822 -0.416235 -0.401223 -0.420228 -0.432775   
57620  X1_B -0.432775 -0.434236 -0.395955 -0.395909 -0.397187 -0.379919   
57621  X1_B -0.379919 -0.391599 -0.379783 -0.379875 -0.393517 -0.393654   
57622  X1_B -0.393654 -0.379690 -0.385827 -0.368742 -0.369291 -0.362149   
57623  X1_B -0.362149 -0.315174 -0.328634 -0.321584 -0.358266 -0.352358   

          Col_8     Col_9    Col_10    Col_11  
0      0.454476  0.439178  0.425008  0.404834  
1 

In [19]:
def clusterstatistics(Y,X):
    """Compute per-position statistics of the windows belonging to one cluster.

    Parameters
    ----------
    Y : pandas.DataFrame
        Frame with the cluster word in ``Col_1`` and the window samples in
        all remaining columns.
    X : str
        Cluster word whose rows are selected (e.g. ``'X1_A'``).

    Returns
    -------
    pandas.DataFrame
        One row per sample column (in column order) with the columns
        ``Mean``, ``Variance``, ``Skewness`` and ``IQR``.
    """
    cluster_df = Y.loc[Y['Col_1'] == X]

    # _every column except the word column is a sample position; taking them
    # _from the frame keeps this working for any window length
    value_columns = [name for name in Y.columns if name != 'Col_1']

    records = []
    for name in value_columns:
        samples = cluster_df[name]
        records.append({
            "Mean": samples.mean(),
            "Variance": samples.var(),
            "Skewness": samples.skew(),
            "IQR": stats.iqr(samples, interpolation='midpoint'),
        })

    # _explicit columns keep the layout stable even when there are no records
    stat_df = pd.DataFrame(records, columns=["Mean", "Variance", "Skewness", "IQR"])
    return stat_df



In [20]:
statistic=[]
# _position of the axis inside cluster_subseq -> words of its clusters
dic = {0:['X1_A', 'X1_B', 'X1_C'],
       1:['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'],
       2:['Z1_A', 'Z1_B', 'Z1_C'],
       3:['X2_A', 'X2_B', 'X2_C'],
       4:['Y2_A', 'Y2_B', 'Y2_C'],
       5:['Z2_A', 'Z2_B', 'Z2_C']
      }
# _one statistics frame per cluster word, axis by axis
for position, axis_df in enumerate(cluster_subseq):
    statistic.extend(clusterstatistics(axis_df, word) for word in dic[position])
print(statistic)

[       Mean  Variance  Skewness       IQR
0  1.274679  0.039326 -1.303317  0.198075
1  1.275477  0.039357 -0.934785  0.198303
2  1.275563  0.038513 -1.226224  0.198435
3  1.275532  0.038408 -1.255802  0.198209
4  1.275592  0.038496 -1.258671  0.198316
5  1.275570  0.038260 -1.238674  0.198074
6  1.275712  0.038157 -1.223698  0.198302
7  1.275572  0.038244 -1.251014  0.198391
8  1.275309  0.038620 -1.258528  0.198210
9  1.274983  0.039048 -1.253927  0.198302,        Mean  Variance  Skewness       IQR
0  0.036291  0.055739 -0.107331  0.232987
1  0.036255  0.054877 -0.067490  0.232980
2  0.036660  0.058625  2.971029  0.231496
3  0.035840  0.058886 -3.826344  0.232412
4  0.036400  0.056353 -1.744062  0.231969
5  0.036833  0.053376 -0.057828  0.232821
6  0.036750  0.054547  0.299809  0.232699
7  0.036350  0.054257 -0.005810  0.231499
8  0.036083  0.054596 -0.094187  0.232412
9  0.036253  0.055372 -0.077001  0.231962,        Mean  Variance  Skewness       IQR
0 -1.206496  0.068277 -0.099735

In [21]:
# _transpose every statistics frame so each statistic becomes one row
allvalues = [stat_df.T for stat_df in statistic]
print(allvalues)

type(allvalues[0])

[                 0         1         2         3         4         5  \
Mean      1.274679  1.275477  1.275563  1.275532  1.275592  1.275570   
Variance  0.039326  0.039357  0.038513  0.038408  0.038496  0.038260   
Skewness -1.303317 -0.934785 -1.226224 -1.255802 -1.258671 -1.238674   
IQR       0.198075  0.198303  0.198435  0.198209  0.198316  0.198074   

                 6         7         8         9  
Mean      1.275712  1.275572  1.275309  1.274983  
Variance  0.038157  0.038244  0.038620  0.039048  
Skewness -1.223698 -1.251014 -1.258528 -1.253927  
IQR       0.198302  0.198391  0.198210  0.198302  ,                  0         1         2         3         4         5  \
Mean      0.036291  0.036255  0.036660  0.035840  0.036400  0.036833   
Variance  0.055739  0.054877  0.058625  0.058886  0.056353  0.053376   
Skewness -0.107331 -0.067490  2.971029 -3.826344 -1.744062 -0.057828   
IQR       0.232987  0.232980  0.231496  0.232412  0.231969  0.232821   

                 6   

pandas.core.frame.DataFrame

In [22]:
allvalues1 = []
for i in range(len(allvalues)):
    # _flatten row by row into a single-row frame; unlike stack(), reshape never
    # _drops NaN entries, so every cluster keeps a row of the same length with
    # _each statistic at a fixed position
    aa = pd.DataFrame(allvalues[i].to_numpy().reshape(1, -1))
    allvalues1.append(aa)
allvalues1

[         0         1         2         3         4        5         6   \
 0  1.274679  1.275477  1.275563  1.275532  1.275592  1.27557  1.275712   
 
          7         8         9   ...        30        31        32        33  \
 0  1.275572  1.275309  1.274983  ...  0.198075  0.198303  0.198435  0.198209   
 
          34        35        36        37       38        39  
 0  0.198316  0.198074  0.198302  0.198391  0.19821  0.198302  
 
 [1 rows x 40 columns],
          0         1        2        3       4         5        6        7   \
 0  0.036291  0.036255  0.03666  0.03584  0.0364  0.036833  0.03675  0.03635   
 
          8         9   ...        30       31        32        33        34  \
 0  0.036083  0.036253  ...  0.232987  0.23298  0.231496  0.232412  0.231969   
 
          35        36        37        38        39  
 0  0.232821  0.232699  0.231499  0.232412  0.231962  
 
 [1 rows x 40 columns],
          0         1         2         3         4         5         

In [23]:
# _write one line per cluster word: its flattened statistics, no header or index
embeddings_filepath = os.getcwd() + '/../data/sub_sequence_output/word_embeddings_from_clusters.txt'
embeddings_df = pd.concat(allvalues1)
embeddings_df.to_csv(embeddings_filepath, header=False, index=False)

In [24]:
# _keep=False removes every row that has an identical copy (all copies, not
# _only the repeats), comparing all columns including activityID; the rows that
# _remain keep their original index labels and doc_df is modified in place.
# NOTE(review): confirm that dropping all copies is intended rather than keep='first'.
doc_df.drop_duplicates(keep=False, inplace=True)
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
179,1,X1_B,Y1_B,Z1_B,X2_C,Y2_B,Z2_A
180,1,X1_B,Y1_B,Z1_B,X2_C,Y2_B,Z2_B
188,1,X1_C,Y1_D,Z1_B,X2_C,Y2_B,Z2_B
192,1,X1_C,Y1_C,Z1_B,X2_C,Y2_A,Z2_B
206,1,X1_C,Y1_D,Z1_A,X2_B,Y2_A,Z2_C
...,...,...,...,...,...,...,...
55312,3,X1_A,Y1_C,Z1_A,X2_C,Y2_A,Z2_A
55406,3,X1_A,Y1_D,Z1_B,X2_B,Y2_A,Z2_A
55407,3,X1_A,Y1_B,Z1_B,X2_B,Y2_A,Z2_A
55433,3,X1_B,Y1_B,Z1_B,X2_A,Y2_A,Z2_B


In [25]:
# _combine the six words of every row into one space separated document
def _join_words(row):
    """Join the values of one row into a single space separated string."""
    return ' '.join(row.values.astype(str))

word_columns = col_names[2:]
doc_df['final_sub_sequence'] = doc_df[word_columns].apply(_join_words, axis=1)
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2,final_sub_sequence
179,1,X1_B,Y1_B,Z1_B,X2_C,Y2_B,Z2_A,X1_B Y1_B Z1_B X2_C Y2_B Z2_A
180,1,X1_B,Y1_B,Z1_B,X2_C,Y2_B,Z2_B,X1_B Y1_B Z1_B X2_C Y2_B Z2_B
188,1,X1_C,Y1_D,Z1_B,X2_C,Y2_B,Z2_B,X1_C Y1_D Z1_B X2_C Y2_B Z2_B
192,1,X1_C,Y1_C,Z1_B,X2_C,Y2_A,Z2_B,X1_C Y1_C Z1_B X2_C Y2_A Z2_B
206,1,X1_C,Y1_D,Z1_A,X2_B,Y2_A,Z2_C,X1_C Y1_D Z1_A X2_B Y2_A Z2_C
...,...,...,...,...,...,...,...,...
55312,3,X1_A,Y1_C,Z1_A,X2_C,Y2_A,Z2_A,X1_A Y1_C Z1_A X2_C Y2_A Z2_A
55406,3,X1_A,Y1_D,Z1_B,X2_B,Y2_A,Z2_A,X1_A Y1_D Z1_B X2_B Y2_A Z2_A
55407,3,X1_A,Y1_B,Z1_B,X2_B,Y2_A,Z2_A,X1_A Y1_B Z1_B X2_B Y2_A Z2_A
55433,3,X1_B,Y1_B,Z1_B,X2_A,Y2_A,Z2_B,X1_B Y1_B Z1_B X2_A Y2_A Z2_B


In [26]:
# _write the documents of every activity to its own text file, one document per line
for activity in doc_df['activityID'].unique():
    output_filepath = ''.join([
        os.getcwd(),
        '/../data/sub_sequence_output/activity_subseq_',
        str(activity),
        '.txt',
    ])
    activity_mask = doc_df['activityID'] == activity
    activity_docs = doc_df.loc[activity_mask, ['final_sub_sequence']]
    activity_docs.to_csv(output_filepath, sep='\t', header=False, index=False)