In [1]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _initializing constant variables
input_file_path = os.getcwd() + f'/../data/processed_data.csv'
col_names = ['timestamp (s)', 'activityID', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']

In [3]:
# _loading preprocessed data to main dataframe
main_df = pd.read_csv(input_file_path, header=None, names=col_names)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,2.21530,8.27915,5.58753,-0.004750,0.037579,-0.011145
1,37.67,1,2.29196,7.67288,5.74467,-0.171710,0.025479,-0.009538
2,37.68,1,2.29090,7.14240,5.82342,-0.238241,0.011214,0.000831
3,37.69,1,2.21800,7.14365,5.89930,-0.192912,0.019053,0.013374
4,37.70,1,2.30106,7.25857,6.09259,-0.069961,-0.018328,0.004582
...,...,...,...,...,...,...,...,...
288214,974.50,3,-1.99794,3.94300,9.15686,-0.112651,-0.003501,-0.066523
288215,974.51,3,-2.00276,4.20689,8.96346,-0.179092,0.020300,-0.010637
288216,974.52,3,-1.96042,4.09413,9.11842,-0.285215,0.010714,-0.017346
288217,974.53,3,-1.97860,3.78659,8.65734,-0.347098,0.040476,-0.020536


In [4]:
print(pd.Series(main_df['timestamp (s)']).is_unique)

#performing normalization for X1,Y1,Z1,X2,Y2,Z2

def normalization(X):   
        normalized_values=(main_df[X]-main_df[X].mean())/main_df[X].std()
        return normalized_values

False


In [5]:
normalization('X1')

0         0.554688
1         0.567517
2         0.567339
3         0.555140
4         0.569040
            ...   
288214   -0.150352
288215   -0.151158
288216   -0.144073
288217   -0.147116
288218   -0.173261
Name: X1, Length: 288219, dtype: float64

# Generating subsequences for each sequence of the data

In [6]:
def elbow_techique(sub_sequence_data):
    
    # _only for manual testing
    distortions = []
    for k in range(1,10):
        kmeanModel = KMeans(n_clusters=k)
        kmeanModel.fit(sub_sequence_data)
        distortions.append(kmeanModel.inertia_)

    # _plot the distortions to observe the elbow point from the graph
    plt.figure(figsize=(16,8))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [7]:
def get_cluster_ids(sub_sequence_data, cluster_cnt):
    
    # _passing random state as an argument to make Kmeans clustering deterministic
    kmeans_model = KMeans(n_clusters = cluster_cnt,random_state=1234)
    return kmeans_model.fit_predict(sub_sequence_data)

In [8]:
def get_assigned_words(seq_clusters, cluster_words):
    
    # _assign word to each cluster of the subsequence usnig numpy where function
    assigned_words = np.where(seq_clusters != 0, seq_clusters, cluster_words[0])
    for idx in range(1, len(cluster_words)):
         assigned_words = np.where(seq_clusters != idx, assigned_words, cluster_words[idx])
            
    return assigned_words

In [9]:
def generating_numbers(sequence_names):
    
    # _each word contains common prefix of subsequence and unique cluster alphabet
    alphabet = 'ABCDEFGHIJ'
    words_dict = {}
    
    for seq in sequence_names:
        prefix = seq
        words_dict[seq] = [prefix+'_'+alphabet[i] for i in range(len(alphabet))]
        
    return words_dict

In [10]:
# _initializing variables
window_length = 10
window_overlap = 5
max_window_index = len(main_df.index)
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)
sub_sequences = [[] for x in range(num_of_subsequences)]

In [11]:
window_index = 0

while window_index <= (max_window_index - window_length):

    activity_sequence = main_df[sequence_names[0]][window_index:window_index+window_length].tolist()

    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])
        
        for idx in range(1, num_of_subsequences):
            sub_sequences[idx].append(main_df[sequence_names[idx]][window_index:window_index+window_length].tolist())

    window_index += window_overlap

# _converting into numpy arrays
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)



(6, 57624, 10)


In [13]:
# _document dataframe to store word assignments of each window
doc_df = pd.DataFrame(columns=col_names[1:])
doc_df['activityID'] = sub_sequences[0]
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,,,,,,
1,1,,,,,,
2,1,,,,,,
3,1,,,,,,
4,1,,,,,,
...,...,...,...,...,...,...,...
57619,3,,,,,,
57620,3,,,,,,
57621,3,,,,,,
57622,3,,,,,,


In [14]:
words_dict = generating_numbers(sequence_names[1:])
print(words_dict)
print()

# _after observing the clustering result with elbow method, we are manually choosing the n_clusters value.
# _testing: elbow_techique(np_sequences[0])
sequence_cluster_cnts = {'X1': 3, 'Y1': 4, 'Z1': 3, 'X2': 3, 'Y2': 3, 'Z2': 3}
print(sequence_cluster_cnts)

{'X1': ['X1_A', 'X1_B', 'X1_C', 'X1_D', 'X1_E', 'X1_F', 'X1_G', 'X1_H', 'X1_I', 'X1_J'], 'Y1': ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D', 'Y1_E', 'Y1_F', 'Y1_G', 'Y1_H', 'Y1_I', 'Y1_J'], 'Z1': ['Z1_A', 'Z1_B', 'Z1_C', 'Z1_D', 'Z1_E', 'Z1_F', 'Z1_G', 'Z1_H', 'Z1_I', 'Z1_J'], 'X2': ['X2_A', 'X2_B', 'X2_C', 'X2_D', 'X2_E', 'X2_F', 'X2_G', 'X2_H', 'X2_I', 'X2_J'], 'Y2': ['Y2_A', 'Y2_B', 'Y2_C', 'Y2_D', 'Y2_E', 'Y2_F', 'Y2_G', 'Y2_H', 'Y2_I', 'Y2_J'], 'Z2': ['Z2_A', 'Z2_B', 'Z2_C', 'Z2_D', 'Z2_E', 'Z2_F', 'Z2_G', 'Z2_H', 'Z2_I', 'Z2_J']}

{'X1': 3, 'Y1': 4, 'Z1': 3, 'X2': 3, 'Y2': 3, 'Z2': 3}


In [15]:
for idx, (seq, cluster_cnt) in enumerate(sequence_cluster_cnts.items()):
    
    # _perform k means clustering on subsequences
    seq_clusters = get_cluster_ids(np_sequences[idx], cluster_cnt)
    cluster_words = words_dict[seq][:cluster_cnt]
    
    print(f'{idx} -- {seq} -- {cluster_cnt} -- {cluster_words} -- {set(seq_clusters)}')
    
    # _get assigned words and fill the values in doc_df
    assigned_words = get_assigned_words(seq_clusters, cluster_words)
    doc_df[seq] = assigned_words

0 -- X1 -- 3 -- ['X1_A', 'X1_B', 'X1_C'] -- {0, 1, 2}
1 -- Y1 -- 4 -- ['Y1_A', 'Y1_B', 'Y1_C', 'Y1_D'] -- {0, 1, 2, 3}
2 -- Z1 -- 3 -- ['Z1_A', 'Z1_B', 'Z1_C'] -- {0, 1, 2}
3 -- X2 -- 3 -- ['X2_A', 'X2_B', 'X2_C'] -- {0, 1, 2}
4 -- Y2 -- 3 -- ['Y2_A', 'Y2_B', 'Y2_C'] -- {0, 1, 2}
5 -- Z2 -- 3 -- ['Z2_A', 'Z2_B', 'Z2_C'] -- {0, 1, 2}


doc_df

In [16]:
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
1,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
2,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
3,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
4,1,X1_A,Y1_C,Z1_A,X2_B,Y2_A,Z2_A
...,...,...,...,...,...,...,...
57619,3,X1_A,Y1_A,Z1_A,X2_B,Y2_A,Z2_A
57620,3,X1_A,Y1_A,Z1_A,X2_B,Y2_A,Z2_A
57621,3,X1_A,Y1_B,Z1_A,X2_B,Y2_A,Z2_A
57622,3,X1_A,Y1_B,Z1_A,X2_B,Y2_A,Z2_A


# Finding the statistics of the clusters(mean,variance,skewness)


In [64]:
def clust_subseq(X,i):
    d1=pd.DataFrame(doc_df[X])
    X1_df = pd.DataFrame(np_sequences[i])
    df=pd.concat([d1,X1_df],axis = 1)
    df.columns =['Col_1', 'Col_2','Col_3','Col_4','Col_5','Col_6','Col_7','Col_8','Col_9','Col_10','Col_11'] 
    return df

In [65]:
dfY1 = clust_subseq('Y1',0)
dfY1

Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,Col_11
0,Y1_C,2.21530,2.29196,2.29090,2.21800,2.30106,2.07165,2.41148,2.32815,2.25096,2.14107
1,Y1_C,2.07165,2.41148,2.32815,2.25096,2.14107,2.36727,2.43617,2.28661,2.21585,2.24722
2,Y1_C,2.36727,2.43617,2.28661,2.21585,2.24722,2.24615,2.13466,2.23039,2.24697,2.14778
3,Y1_C,2.24615,2.13466,2.23039,2.24697,2.14778,2.11880,2.26540,2.16379,2.15630,2.38759
4,Y1_C,2.11880,2.26540,2.16379,2.15630,2.38759,2.26407,2.30000,2.21613,2.21372,2.06309
...,...,...,...,...,...,...,...,...,...,...,...
57619,Y1_A,-1.88180,-2.02226,-2.34660,-2.25873,-2.36997,-2.44341,-2.45196,-2.22790,-2.22763,-2.23511
57620,Y1_A,-2.44341,-2.45196,-2.22790,-2.22763,-2.23511,-2.13404,-2.20240,-2.13324,-2.13378,-2.21363
57621,Y1_B,-2.13404,-2.20240,-2.13324,-2.13378,-2.21363,-2.21443,-2.13270,-2.16862,-2.06862,-2.07183
57622,Y1_B,-2.21443,-2.13270,-2.16862,-2.06862,-2.07183,-2.03003,-1.75508,-1.83386,-1.79260,-2.00730


In [69]:
def clusterstatistics(Y,X):
    
    cluster_df = Y.loc[(Y['Col_1'] == X)] 
    
    Columns = ['Col_2','Col_3','Col_4','Col_5','Col_6','Col_7','Col_8','Col_9','Col_10','Col_11'] 
    Mean = []
    Variance = []
    Skewness = []

    for values in Columns:
        M = cluster_df [values].mean()
        V = cluster_df[values].var()
        S = cluster_df[values].skew()
        Mean.append(M)
        Variance.append(V)
        Skewness.append(S)
    
    stat_df=pd.DataFrame(Mean,columns=["Mean"])
    stat_df["Variance"]=Variance
    stat_df["Skewness"]=Skewness
    return  stat_df



In [70]:
df1=clusterstatistics(dfY1,'Y1_A')
df1

Unnamed: 0,Mean,Variance,Skewness
0,-3.758419,19.155321,0.489122
1,-3.759994,19.160205,0.489298
2,-3.760495,19.164355,0.490223
3,-3.759761,19.172642,0.494952
4,-3.759309,19.180075,0.494372
5,-3.760815,19.186305,0.492553
6,-3.762739,19.183954,0.490637
7,-3.763783,19.191949,0.490026
8,-3.762095,19.2002,0.494902
9,-3.761468,19.214566,0.494461


In [232]:
doc_df.drop_duplicates(keep=False, inplace=True)
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2
26,1,X1_A,Y1_C,Z1_A,X2_C,Y2_C,Z2_A
179,1,X1_A,Y1_A,Z1_C,X2_A,Y2_B,Z2_A
182,1,X1_B,Y1_A,Z1_B,X2_A,Y2_B,Z2_C
188,1,X1_B,Y1_B,Z1_B,X2_B,Y2_B,Z2_C
192,1,X1_B,Y1_D,Z1_B,X2_B,Y2_A,Z2_C
...,...,...,...,...,...,...,...
56514,3,X1_B,Y1_B,Z1_A,X2_B,Y2_B,Z2_A
56522,3,X1_A,Y1_A,Z1_B,X2_A,Y2_B,Z2_B
56523,3,X1_C,Y1_D,Z1_B,X2_A,Y2_B,Z2_B
56531,3,X1_A,Y1_A,Z1_B,X2_B,Y2_C,Z2_C


In [233]:
# _combine individual words as documents
doc_df['final_sub_sequence'] = doc_df[col_names[2:]].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
doc_df

Unnamed: 0,activityID,X1,Y1,Z1,X2,Y2,Z2,final_sub_sequence
26,1,X1_A,Y1_C,Z1_A,X2_C,Y2_C,Z2_A,X1_A Y1_C Z1_A X2_C Y2_C Z2_A
179,1,X1_A,Y1_A,Z1_C,X2_A,Y2_B,Z2_A,X1_A Y1_A Z1_C X2_A Y2_B Z2_A
182,1,X1_B,Y1_A,Z1_B,X2_A,Y2_B,Z2_C,X1_B Y1_A Z1_B X2_A Y2_B Z2_C
188,1,X1_B,Y1_B,Z1_B,X2_B,Y2_B,Z2_C,X1_B Y1_B Z1_B X2_B Y2_B Z2_C
192,1,X1_B,Y1_D,Z1_B,X2_B,Y2_A,Z2_C,X1_B Y1_D Z1_B X2_B Y2_A Z2_C
...,...,...,...,...,...,...,...,...
56514,3,X1_B,Y1_B,Z1_A,X2_B,Y2_B,Z2_A,X1_B Y1_B Z1_A X2_B Y2_B Z2_A
56522,3,X1_A,Y1_A,Z1_B,X2_A,Y2_B,Z2_B,X1_A Y1_A Z1_B X2_A Y2_B Z2_B
56523,3,X1_C,Y1_D,Z1_B,X2_A,Y2_B,Z2_B,X1_C Y1_D Z1_B X2_A Y2_B Z2_B
56531,3,X1_A,Y1_A,Z1_B,X2_B,Y2_C,Z2_C,X1_A Y1_A Z1_B X2_B Y2_C Z2_C


In [234]:
# _save the combined values to text files
for activity in doc_df['activityID'].unique():
    output_filepath = os.getcwd() + f'/../data/sub_sequence_output/activity_subseq_' + str(activity) + '.txt'
    doc_df.loc[doc_df['activityID'] == activity][['final_sub_sequence']].to_csv(output_filepath, sep='\t', index=False, header= False)