In [1]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _constant inputs: csv locations (relative to the current working directory) and column labels
_cwd = os.getcwd()
input_file_path = _cwd + '/../../data/processed_data.csv'
subject101_file_path = _cwd + '/../../data/output_csv/subject101_processed.csv'
col_names = [
    'timestamp (s)', 'activityID',
    'X1', 'Y1', 'Z1',
    'X2', 'Y2', 'Z2',
]

In [3]:
# _read the preprocessed csv (no header row expected) into the main dataframe, labelling columns with col_names
main_df = pd.read_csv(
    input_file_path,
    names=col_names,
    header=None,
)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,0.418461,0.925518,0.472759,-0.035942,0.138344,-0.011430
1,37.67,1,0.432534,0.752532,0.514698,-0.556016,0.093168,-0.006729
2,37.68,1,0.432340,0.601171,0.535716,-0.763257,0.039907,0.023608
3,37.69,1,0.418957,0.601528,0.555967,-0.622059,0.069178,0.060307
4,37.70,1,0.434205,0.634318,0.607554,-0.239073,-0.070386,0.034582
...,...,...,...,...,...,...,...,...
288214,974.50,3,-0.356667,0.023598,1.269533,-0.233579,-0.058947,-0.165780
288215,974.51,3,-0.357490,0.101736,1.209994,-0.369190,0.018471,-0.029929
288216,974.52,3,-0.350256,0.068348,1.257699,-0.585796,-0.012709,-0.046238
288217,974.53,3,-0.353363,-0.022715,1.115753,-0.712105,0.084094,-0.053992


In [4]:
# _read the subject 101 csv, skipping its first line and labelling columns with col_names
subject101_df = pd.read_csv(
    subject101_file_path,
    skiprows=1,
    names=col_names,
)
subject101_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,0.418461,0.925518,0.472759,-0.035942,0.138344,-0.011430
1,37.67,1,0.432534,0.752532,0.514698,-0.556016,0.093168,-0.006729
2,37.68,1,0.432340,0.601171,0.535716,-0.763257,0.039907,0.023608
3,37.69,1,0.418957,0.601528,0.555967,-0.622059,0.069178,0.060307
4,37.70,1,0.434205,0.634318,0.607554,-0.239073,-0.070386,0.034582
...,...,...,...,...,...,...,...,...
72331,761.45,3,0.444840,0.676110,0.463848,-0.931057,0.355296,-0.051148
72332,761.46,3,0.446754,0.612022,0.556538,-1.051936,0.397559,-0.107791
72333,761.47,3,0.410890,0.514726,0.525248,-0.428415,0.275692,-0.130117
72334,761.48,3,0.410742,0.752398,0.473136,0.136853,0.130708,0.022278


# Generating subsequences for each sequence of the data

In [5]:
def elbow_techique(sub_sequence_data, max_k=10):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Only meant for manual inspection: the plot is used to pick the cluster
    count by eye, nothing is returned.

    Parameters
    ----------
    sub_sequence_data : array-like
        Samples to cluster; passed unchanged to ``KMeans.fit``.
    max_k : int, optional
        Exclusive upper bound of the cluster counts tried, i.e. k runs over
        ``range(1, max_k)``. The default of 10 tries k = 1..9.
    """
    # _the k values are defined once so the fit loop and the plot's x axis always agree
    K = range(1, max_k)

    distortions = []
    for k in K:
        # _fixed random_state (same value as get_cluster_ids) keeps the curve reproducible
        kmeanModel = KMeans(n_clusters=k, random_state=1234)
        kmeanModel.fit(sub_sequence_data)
        distortions.append(kmeanModel.inertia_)

    # _plot the distortions to observe the elbow point from the graph
    plt.figure(figsize=(16,8))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [6]:
def get_cluster_ids(sub_sequence_data, cluster_cnt):
    """Fit a KMeans model with ``cluster_cnt`` clusters and return the fitted model.

    Despite the name, the fitted estimator itself is returned (not the label
    array); callers obtain cluster ids from it, e.g. via ``predict``.
    """
    # _a fixed random_state makes the KMeans clustering deterministic
    fitted_model = KMeans(random_state=1234, n_clusters=cluster_cnt)
    fitted_model.fit(sub_sequence_data)
    return fitted_model

In [7]:
def get_assigned_words(seq_clusters, cluster_words):
    """Translate an array of cluster ids into the matching cluster words.

    Parameters
    ----------
    seq_clusters : array-like of int
        Cluster id of every window; id ``i`` is mapped to ``cluster_words[i]``.
    cluster_words : sequence of str
        Words indexed by cluster id.

    Returns
    -------
    numpy.ndarray
        String array with the same shape as ``seq_clusters``.

    Raises
    ------
    ValueError
        If any id lies outside ``0 .. len(cluster_words) - 1``. Such ids have
        no word, so they are reported instead of leaking into the output.
    """
    labels = np.asarray(seq_clusters)
    vocabulary = np.asarray(cluster_words, dtype=str)

    # _validate first: a negative id would otherwise wrap around to the end of the vocabulary
    if labels.size and (labels.min() < 0 or labels.max() >= len(vocabulary)):
        raise ValueError(
            f'cluster ids must lie in [0, {len(vocabulary) - 1}] to be mapped to a word'
        )

    # _a single lookup-table indexing step assigns the word of every window at once
    return vocabulary[labels.astype(np.intp, copy=False)]

In [8]:
def generating_numbers(sequence_names):
    """Build the candidate cluster words for every sequence name.

    Each word is the sequence name, an underscore and one letter of 'A'..'J',
    so every name gets ten words. Returns a dict mapping each name to its
    list of words, in alphabetical letter order.
    """
    cluster_letters = 'ABCDEFGHIJ'
    return {
        name: [name + '_' + letter for letter in cluster_letters]
        for name in sequence_names
    }

In [9]:
# _windowing configuration: window size in rows and the step between consecutive window starts
window_length, window_overlap = 10, 5

# _every column except the timestamp is cut into windows (activityID first, then the six axes)
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)

# _one empty bucket per sequence, for the full dataset and for subject 101
sub_sequences = [[] for _ in range(num_of_subsequences)]
sub_sequences_101 = [[] for _ in range(num_of_subsequences)]

In [10]:
# _cut the full dataset into windows of window_length rows; window_overlap is the step between window starts

# _start from empty buckets so that re-running this cell does not append every window a second time
sub_sequences = [[] for _ in range(num_of_subsequences)]
max_window_index = len(main_df.index)

# _extract each column once as a numpy array instead of slicing the dataframe for every window
column_values = [main_df[name].to_numpy() for name in sequence_names]

for window_index in range(0, max_window_index - window_length + 1, window_overlap):
    window_end = window_index + window_length
    activity_sequence = column_values[0][window_index:window_end].tolist()

    # _keep only windows whose rows all belong to one activity
    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])

        for idx in range(1, num_of_subsequences):
            sub_sequences[idx].append(column_values[idx][window_index:window_end].tolist())

# _converting into numpy arrays (the activity ids in bucket 0 are left out)
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)

(6, 57624, 10)


In [11]:
# _cut the subject 101 data into windows of window_length rows; window_overlap is the step between window starts

# _start from empty buckets so that re-running this cell does not append every window a second time
sub_sequences_101 = [[] for _ in range(num_of_subsequences)]
max_window_index = len(subject101_df.index)

# _extract each column once as a numpy array instead of slicing the dataframe for every window
column_values_101 = [subject101_df[name].to_numpy() for name in sequence_names]

for window_index in range(0, max_window_index - window_length + 1, window_overlap):
    window_end = window_index + window_length
    activity_sequence = column_values_101[0][window_index:window_end].tolist()

    # _keep only windows whose rows all belong to one activity
    if len(set(activity_sequence)) == 1:
        sub_sequences_101[0].append(activity_sequence[0])

        for idx in range(1, num_of_subsequences):
            sub_sequences_101[idx].append(column_values_101[idx][window_index:window_end].tolist())

# _converting into numpy arrays (the activity ids in bucket 0 are left out)
np_sequences_101 = np.asarray(sub_sequences_101[1:])
print(np_sequences_101.shape)

(6, 14463, 10)


# Clustering the subsequences and assigning a cluster word to each window


In [12]:
# _document dataframe: one row per subject 101 window; the axis columns stay empty until words are assigned
doc_df = pd.DataFrame(columns=col_names[1:]).assign(activityID=sub_sequences_101[0])
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,,,,,,
1,1,,,,,,
2,1,,,,,,
3,1,,,,,,
4,1,,,,,,
...,...,...,...,...,...,...,...
14458,3,,,,,,
14459,3,,,,,,
14460,3,,,,,,
14461,3,,,,,,


In [13]:
# _candidate words for every sensor axis (activityID, the first sequence name, is skipped)
words_dict = generating_numbers(sequence_names[1:])
print(words_dict)
print()

# _n_clusters per axis, chosen manually after inspecting the elbow plot of each axis
# _testing: elbow_techique(np_sequences[0])
sequence_cluster_cnts = dict(X1=3, Y1=4, Z1=3, X2=3, Y2=3, Z2=3)
print(sequence_cluster_cnts)

{'X1': ['X1_A', 'X1_B', 'X1_C', 'X1_D', 'X1_E', 'X1_F', 'X1_G', 'X1_H', 'X1_I', 'X1_J'], 'Y1': ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D', 'Y1_E', 'Y1_F', 'Y1_G', 'Y1_H', 'Y1_I', 'Y1_J'], 'Z1': ['Z1_A', 'Z1_B', 'Z1_C', 'Z1_D', 'Z1_E', 'Z1_F', 'Z1_G', 'Z1_H', 'Z1_I', 'Z1_J'], 'X2': ['X2_A', 'X2_B', 'X2_C', 'X2_D', 'X2_E', 'X2_F', 'X2_G', 'X2_H', 'X2_I', 'X2_J'], 'Y2': ['Y2_A', 'Y2_B', 'Y2_C', 'Y2_D', 'Y2_E', 'Y2_F', 'Y2_G', 'Y2_H', 'Y2_I', 'Y2_J'], 'Z2': ['Z2_A', 'Z2_B', 'Z2_C', 'Z2_D', 'Z2_E', 'Z2_F', 'Z2_G', 'Z2_H', 'Z2_I', 'Z2_J']}

{'X1': 3, 'Y1': 4, 'Z1': 3, 'X2': 3, 'Y2': 3, 'Z2': 3}


In [14]:
# _iterate in the order of sequence_names[1:], which is the order in which the rows of
# _np_sequences / np_sequences_101 were built, so idx always addresses the matching axis
# _regardless of how the entries of sequence_cluster_cnts happen to be ordered
for idx, seq in enumerate(sequence_names[1:]):
    cluster_cnt = sequence_cluster_cnts[seq]

    # _fit k means on the windows of the full dataset, then label the subject 101 windows
    KMeans_models = get_cluster_ids(np_sequences[idx], cluster_cnt)
    seq_clusters = KMeans_models.predict(np_sequences_101[idx])

    # _only the first cluster_cnt words of this axis are needed
    cluster_words = words_dict[seq][:cluster_cnt]

    print(f'{idx} -- {seq} -- {cluster_cnt} -- {cluster_words} -- {set(seq_clusters)}')

    # _get assigned words and fill the values in doc_df
    assigned_words = get_assigned_words(seq_clusters, cluster_words)
    doc_df[seq] = assigned_words

0 -- X1 -- 3 -- ['X1_A', 'X1_B', 'X1_C'] -- {0, 1, 2}
1 -- Y1 -- 4 -- ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'] -- {0, 1, 2, 3}
2 -- Z1 -- 3 -- ['Z1_A', 'Z1_B', 'Z1_C'] -- {0, 1, 2}
3 -- X2 -- 3 -- ['X2_A', 'X2_B', 'X2_C'] -- {0, 1, 2}
4 -- Y2 -- 3 -- ['Y2_A', 'Y2_B', 'Y2_C'] -- {0, 1, 2}
5 -- Z2 -- 3 -- ['Z2_A', 'Z2_B', 'Z2_C'] -- {0, 1, 2}


doc_df

In [15]:
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
1,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
2,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
3,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
4,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
...,...,...,...,...,...,...,...
14458,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
14459,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
14460,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A
14461,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A


In [16]:
# doc_df.drop_duplicates(keep=False, inplace=True)
# doc_df

# Finding the statistics of the clusters (mean, variance, skewness)

In [17]:
def clust_subseq(X,i):
    """Pair the assigned cluster word of every window with the window's samples.

    Reads the module-level ``doc_df`` and ``np_sequences_101``.

    Parameters
    ----------
    X : str
        Name of the ``doc_df`` column holding the cluster words.
    i : int
        Index into the first axis of ``np_sequences_101`` selecting the
        windows that belong to the same axis.

    Returns
    -------
    pandas.DataFrame
        Column 'Col_1' holds the cluster word; 'Col_2', 'Col_3', ... hold the
        samples of the window, one column per position.
    """
    words = pd.DataFrame(doc_df[X])
    windows = pd.DataFrame(np_sequences_101[i])
    df = pd.concat([words, windows], axis=1)
    # _name the columns from the actual width so any window length works (10 samples -> Col_1..Col_11)
    df.columns = [f'Col_{position}' for position in range(1, df.shape[1] + 1)]
    return df

In [18]:
# _axis name -> index of that axis in np_sequences_101
di = {'X1':0,'Y1':1,'Z1':2,'X2':3,'Y2':4,'Z2':5}

# _one word-plus-samples frame per axis, in the order of di
cluster_subseq = [clust_subseq(axis_name, axis_pos) for axis_name, axis_pos in di.items()]

In [19]:
def clusterstatistics(Y,X):
    """Mean, variance and skewness of each window position for one cluster word.

    Parameters
    ----------
    Y : pandas.DataFrame
        Frame whose 'Col_1' column holds the cluster word of each window and
        whose other columns hold the window samples.
    X : str
        Cluster word to select (rows where ``Y['Col_1'] == X``).

    Returns
    -------
    pandas.DataFrame
        One row per sample column, in column order, with the columns 'Mean',
        'Variance' and 'Skewness' computed by the pandas ``mean``, ``var`` and
        ``skew`` methods over the selected rows.
    """
    cluster_df = Y.loc[Y['Col_1'] == X]

    # _every column except the word column is a sample position, so no window length is hard-coded
    value_columns = [name for name in Y.columns if name != 'Col_1']

    stat_df = pd.DataFrame({
        'Mean': [cluster_df[name].mean() for name in value_columns],
        'Variance': [cluster_df[name].var() for name in value_columns],
        'Skewness': [cluster_df[name].skew() for name in value_columns],
    })
    return stat_df

In [20]:
# _cluster words in use for each entry of cluster_subseq (same order as the frames in that list)
dic = {
    0: ['X1_A', 'X1_B', 'X1_C'],
    1: ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'],
    2: ['Z1_A', 'Z1_B', 'Z1_C'],
    3: ['X2_A', 'X2_B', 'X2_C'],
    4: ['Y2_A', 'Y2_B', 'Y2_C'],
    5: ['Z2_A', 'Z2_B', 'Z2_C'],
}

# _one statistics frame per (axis, cluster word) pair, axis by axis
statistic = [
    clusterstatistics(axis_frame, word)
    for axis_pos, axis_frame in enumerate(cluster_subseq)
    for word in dic[axis_pos]
]
print(statistic)

[       Mean  Variance  Skewness
0  1.121651  0.069009 -0.600459
1  1.121975  0.068537 -0.579250
2  1.121962  0.068753 -0.590152
3  1.122712  0.068856 -0.601280
4  1.122508  0.068475 -0.597325
5  1.122362  0.068309 -0.587739
6  1.122319  0.068196 -0.573623
7  1.122089  0.068639 -0.588734
8  1.122535  0.069007 -0.602950
9  1.121944  0.069008 -0.607456,        Mean  Variance  Skewness
0  0.175893  0.060322 -0.697646
1  0.175536  0.059709 -0.689234
2  0.175577  0.059357 -0.688592
3  0.175417  0.059257 -0.677341
4  0.176178  0.059038 -0.674177
5  0.176008  0.059002 -0.674588
6  0.175570  0.059094 -0.678779
7  0.175427  0.059317 -0.695238
8  0.175138  0.059930 -0.697477
9  0.176020  0.060175 -0.693629,        Mean  Variance  Skewness
0 -1.404226  0.063605  2.191943
1 -1.404067  0.062689  2.172595
2 -1.404622  0.062188  2.168900
3 -1.404601  0.061741  2.182440
4 -1.404892  0.061822  2.170551
5 -1.405124  0.062093  2.157792
6 -1.404472  0.062015  2.158541
7 -1.404539  0.062373  2.179420
8 -1.

In [21]:
# _transpose every statistics frame so the three statistics become rows
allvalues = [stat_frame.T for stat_frame in statistic]

# _flatten each transposed frame into a single-row dataframe of its stacked values
allvalues1 = [
    pd.DataFrame(transposed.stack().to_frame().values).T
    for transposed in allvalues
]

In [22]:
# _combine the words of all axes of a window into one space-separated document string
def _join_row_words(row):
    return ' '.join(row.values.astype(str))

word_columns = col_names[2:]
doc_df['final_sub_sequence'] = doc_df[word_columns].apply(_join_row_words, axis=1)
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2,final_sub_sequence
0,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
1,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
2,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
3,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
4,1,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
...,...,...,...,...,...,...,...,...
14458,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
14459,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
14460,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A
14461,3,X1_B,Y1_A,Z1_A,X2_A,Y2_A,Z2_A,X1_B Y1_A Z1_A X2_A Y2_A Z2_A


In [23]:
# _save the combined values to text files, one file per activity id
output_dir = os.getcwd() + '/../../data/sub_sequence_output'

# _to_csv does not create missing folders, so make sure the target exists before writing
os.makedirs(output_dir, exist_ok=True)

for activity in doc_df['activityID'].unique():
    output_filepath = output_dir + '/activity_subseq_' + str(activity) + '.txt'
    # _a single .loc call selects the rows of this activity and the document column together
    doc_df.loc[doc_df['activityID'] == activity, ['final_sub_sequence']].to_csv(output_filepath, sep='\t', index=False, header=False)