In [30]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats 
import statistics

import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
# _locations and column layout shared by the rest of the notebook
input_file_path = os.getcwd() + '/../data/output_csv/processed_data_IMUwrist.csv'
col_names = [
    'timestamp (s)', 'subject_id', 'activityID',
    'X1', 'Y1', 'Z1',
    'X2', 'Y2', 'Z2',
    'X3', 'Y3', 'Z3',
]

In [34]:
# _read the preprocessed CSV into the main dataframe; the file is read without a
# header row and the column labels are taken from col_names
main_df = pd.read_csv(input_file_path, header=None, names=col_names)
main_df

Unnamed: 0,timestamp (s),subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3
0,858.05,101,17,2.43859,7.02674,5.74905,2.69200,7.44844,6.04521,-0.075248,0.070647,-0.040244
1,858.06,101,17,2.58814,6.98730,5.71293,2.68794,7.05573,5.95547,-0.102412,-0.011661,-0.035736
2,858.07,101,17,2.59135,6.87449,5.82882,2.74682,6.82882,6.01631,-0.039697,-0.004252,-0.019637
3,858.08,101,17,2.55276,6.95033,5.78950,2.76265,6.87406,6.06149,0.015871,0.023975,-0.032205
4,858.09,101,17,2.75106,7.14058,6.02276,2.79355,6.93427,6.09153,0.068200,-0.031930,-0.030819
...,...,...,...,...,...,...,...,...,...,...,...,...
46478,1425.34,101,16,1.88792,8.83351,3.65639,2.20546,8.89992,4.06432,0.145696,-0.077425,0.028052
46479,1425.35,101,16,1.95815,8.45286,3.58175,2.14378,8.70407,4.09502,0.114934,-0.079796,-0.007522
46480,1425.36,101,16,1.95682,8.26317,3.58237,2.12542,8.43225,3.97483,0.162078,-0.114195,-0.011359
46481,1425.37,101,16,2.11092,8.30061,3.66152,2.18330,8.26559,3.85433,0.278427,-0.076616,0.003072


# Generating subsequences for each sequence of the data

In [35]:
# _windowing parameters, counted in rows of main_df
window_length = 100
window_overlap = 50
max_window_index = len(main_df.index)

# _every column except the first one (the timestamp) gets its own list of windows
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)
sub_sequences = [[] for _ in range(num_of_subsequences)]

In [44]:
# _sanity check: the columns from position 3 onwards are the ones clustered later
print(col_names[slice(3, None)])

['X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2', 'X3', 'Y3', 'Z3']


In [36]:
# _slide a fixed-length window over main_df and keep only "pure" windows, i.e.
# windows whose rows all share a single activityID and a single subject_id

# consecutive windows share `window_overlap` rows, so the start index advances by
# the remainder of the window (50 rows with the current 100/50 settings)
window_step = window_length - window_overlap
if window_step <= 0:
    raise ValueError('window_overlap must be smaller than window_length')

# start from empty lists so re-running this cell does not append duplicates
sub_sequences = [[] for _ in range(num_of_subsequences)]

subject_col, activity_col = sequence_names[0], sequence_names[1]

window_index = 0
while window_index <= (max_window_index - window_length):

    window_df = main_df.iloc[window_index:window_index + window_length]
    activity_sequence = window_df[activity_col].tolist()
    subject_sequence = window_df[subject_col].tolist()

    # skip windows that straddle an activity change or a subject change,
    # otherwise they would be labelled with whatever their first row says
    if len(set(activity_sequence)) == 1 and len(set(subject_sequence)) == 1:
        sub_sequences[1].append(activity_sequence[0])
        sub_sequences[0].append(subject_sequence[0])

        for idx in range(2, num_of_subsequences):
            sub_sequences[idx].append(window_df[sequence_names[idx]].tolist())

    window_index += window_step

# _converting into numpy arrays (axis, window, sample)
np_sequences = np.asarray(sub_sequences[2:])
print(np_sequences.shape)

(9, 926, 100)


In [37]:
# _one row per kept window: the label columns are filled here, the remaining
# columns stay empty until get_assigned_words writes the cluster words into them
doc_df = pd.DataFrame(columns=col_names[1:])
doc_df['activityID'] = sub_sequences[1]
doc_df['subject_id'] = sub_sequences[0]
doc_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3
0,101,17,,,,,,,,,
1,101,17,,,,,,,,,
2,101,17,,,,,,,,,
3,101,17,,,,,,,,,
4,101,17,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
921,101,16,,,,,,,,,
922,101,16,,,,,,,,,
923,101,16,,,,,,,,,
924,101,16,,,,,,,,,


# Finding the statistics of the subsequences (mean, standard deviation, skewness, IQR, quartiles)


In [38]:
def subsequence_statistics(n, sequences=None):
    """Compute per-window summary statistics for one axis.

    Parameters
    ----------
    n : int
        Index into the first dimension of ``sequences`` (the axis to summarise).
    sequences : array-like, optional
        Data indexed as ``sequences[n][window][sample]``. When omitted, the
        notebook-level ``np_sequences`` array is used.

    Returns
    -------
    pandas.DataFrame
        One row per window with the columns ``Mean``, ``Standard_deviation``
        (sample standard deviation, i.e. ``ddof=1``), ``Skewness``, ``IQR``,
        ``Lower_quartile``, ``Middle_quartile`` and ``Upper_quartile``.
        An input without any windows yields an empty frame with those columns.
    """
    feature_columns = ['Mean', 'Standard_deviation', 'Skewness', 'IQR',
                       'Lower_quartile', 'Middle_quartile', 'Upper_quartile']

    if sequences is None:
        sequences = np_sequences
    subsequences = np.asarray(sequences[n], dtype=float)

    # nothing to summarise: keep the column layout so callers can still concat
    if subsequences.size == 0:
        return pd.DataFrame(columns=feature_columns)

    # all statistics are reduced along axis 1 (the samples inside each window)
    q1, q2, q3 = np.percentile(subsequences, [25, 50, 75], axis=1)

    statistic_feature_df = pd.DataFrame(
        {
            'Mean': subsequences.mean(axis=1),
            # ddof=1 matches statistics.stdev (sample standard deviation)
            'Standard_deviation': subsequences.std(axis=1, ddof=1),
            'Skewness': stats.skew(subsequences, axis=1),
            'IQR': q3 - q1,
            'Lower_quartile': q1,
            'Middle_quartile': q2,
            'Upper_quartile': q3,
        },
        columns=feature_columns,
    )

    return statistic_feature_df

In [39]:
# _one statistics dataframe per axis, in the same order as np_sequences
statistics_list = []
for axis_idx in range(np_sequences.shape[0]):
    statistics_list.append(subsequence_statistics(axis_idx))
    print(axis_idx)
print(statistics_list)

0
1
2
3
4
5
6
7
8
[         Mean  Standard_deviation  Skewness       IQR  Lower_quartile  \
0    2.721251            0.482206  1.951495  0.395800        2.432538   
1    1.606127            1.550904 -0.340363  2.185999        0.551343   
2   -0.813181            1.437666  0.305822  2.424655       -1.873890   
3   -2.021290            0.694737 -0.216391  0.911707       -2.455570   
4   -1.640554            0.838320 -0.375580  1.444237       -2.345215   
..        ...                 ...       ...       ...             ...   
921  0.555509            2.156295 -1.148715  2.977791       -0.911281   
922  3.654967            2.074996  0.209243  4.160235        1.903828   
923  4.877547            1.491866 -0.656662  2.701335        3.407043   
924  3.295223            1.445371  0.966358  2.073045        2.209420   
925  2.157693            0.369546 -0.310115  0.480330        1.923777   

     Middle_quartile  Upper_quartile  
0           2.580790        2.828337  
1           2.155470      

In [45]:
#assigning words for each cluster
def get_assigned_words(seq_clusters, cluster_words, axis):
    """Translate cluster ids into their cluster words.

    Every entry of ``seq_clusters`` equal to ``k`` is replaced by
    ``cluster_words[k]``. The translated array is also written into the
    notebook-level ``doc_df`` under the column ``axis``.

    Returns a dataframe with the single column ``cluster_word``.
    """
    # _start with the word for id 0, then substitute the remaining ids one at a time
    words = np.where(seq_clusters == 0, cluster_words[0], seq_clusters)
    for cluster_id, word in enumerate(cluster_words[1:], start=1):
        words = np.where(seq_clusters == cluster_id, word, words)

    doc_df[axis] = words

    return pd.DataFrame(data=words, columns=['cluster_word'])

In [54]:
#generating names for cluster count
def generate_cluster_names(sequence_names, cluster_cnts):
    """Build the word vocabulary for every sequence.

    Returns a dict mapping each name in ``sequence_names`` to the list
    ``['<name>_0', '<name>_1', ..., '<name>_<cluster_cnts - 1>']``.
    """
    return {
        seq: [seq + '_' + str(cluster_id) for cluster_id in range(cluster_cnts)]
        for seq in sequence_names
    }

In [55]:
# _number of clusters (and therefore of distinct words) per axis
cluster_cnts = 10
# _the vocabulary is sized from cluster_cnts so the two can never drift apart
words_dict = generate_cluster_names(sequence_names[2:], cluster_cnts)
sequence_cluster_cnts = dict.fromkeys(words_dict, cluster_cnts)
#print(sequence_cluster_cnts)

In [68]:
def clustering(statistic_df, axis, random_state=0):
    """Cluster the window statistics of one axis and name the clusters.

    Parameters
    ----------
    statistic_df : pandas.DataFrame
        One row of statistics per window.
    axis : str
        Key into ``words_dict``; also the ``doc_df`` column that
        ``get_assigned_words`` fills with the word of every window.
    random_state : int, optional
        Seed handed to KMeans so that the id-to-word mapping is reproducible
        across runs.

    Returns
    -------
    pandas.DataFrame
        One row per distinct cluster word: the ``cluster_word`` column followed
        by the centroid coordinates, named after the columns of
        ``statistic_df`` with a ``_c`` suffix.
    """
    # n_init is pinned because its default differs between scikit-learn versions
    model = KMeans(n_clusters=cluster_cnts, n_init=10, random_state=random_state).fit(statistic_df)
    cluster_ids = model.predict(statistic_df)

    cluster_words = words_dict[axis][:cluster_cnts]
    # get_assigned_words expects the ids as a single-column 2-D array
    assigned_clusterWord = get_assigned_words(cluster_ids.reshape(-1, 1), cluster_words, axis)

    # look up, for every window, the centroid of the cluster it belongs to
    centroid_columns = [str(column) + '_c' for column in statistic_df.columns]
    centroids_of_clusters = pd.DataFrame(model.cluster_centers_[cluster_ids],
                                         columns=centroid_columns)

    result = pd.concat([assigned_clusterWord, centroids_of_clusters], axis=1)
    # every window of a cluster repeats the same centroid row; keep one per word
    result = result.drop_duplicates()

    return result

In [57]:
def cluster_word_sort(axis_clusters, cluster_names):
    """Select the rows whose ``cluster_word`` equals ``cluster_names``.

    The first column of ``axis_clusters`` is dropped from the returned rows.
    """
    is_requested_word = axis_clusters['cluster_word'] == cluster_names
    matching_rows = axis_clusters[is_requested_word]
    return matching_rows.iloc[:, 1:]
    

In [69]:
# _cluster every axis, then collect the centroid rows word by word
clusters_centroid = []
centroid_statistic = []

for statistic_df, axis in zip(statistics_list, col_names[3:]):

    axis_clusters = clustering(statistic_df, axis)
    print(axis_clusters)
    clusters_centroid.append(axis_clusters)

    # rows are gathered in vocabulary order (word 0, word 1, ...) for this axis
    centroid_statistic.extend(
        cluster_word_sort(axis_clusters, word) for word in words_dict[axis]
    )

print(centroid_statistic)

    cluster_word    Mean_c  Standard_deviation_c  Skewness_c     IQR_c  \
0           X1_5  2.682556              1.981156   -0.349852  2.850634   
2           X1_3 -2.165044              1.767723   -0.149932  2.374776   
8           X1_0 -3.987070              1.771302    0.115824  2.519254   
16          X1_2 -0.761821              4.373389   -0.153411  7.216442   
17          X1_4 -7.176431              4.390530   -0.234849  5.064052   
18          X1_9 -7.097860              6.161658   -0.279000  8.508186   
19          X1_6 -6.603504              2.331694   -0.486846  2.550062   
20          X1_8 -3.550233              3.700389   -0.130602  5.473485   
480         X1_7 -9.229661              2.932422    0.082075  3.432842   
495         X1_1 -9.009303              5.506290   -0.036830  7.040201   

     Lower_quartile_c  Middle_quartile_c  Upper_quartile_c  
0            1.322429           3.032557          4.173063  
2           -3.305896          -2.144959         -0.931120  
8 

   cluster_word    Mean_c  Standard_deviation_c  Skewness_c     IQR_c  \
0          X3_8 -0.294275              0.685014   -0.879194  0.767398   
1          X3_7  0.018496              0.560431    0.025945  0.719264   
2          X3_9  0.584049              0.793140   -0.133684  1.046246   
9          X3_5  0.163751              0.946733    1.127250  0.979276   
13         X3_1 -0.546971              1.020110    0.189582  1.323281   
14         X3_2 -0.011516              1.002361   -0.170344  1.465496   
18         X3_6 -0.136484              1.704979   -0.201857  2.288191   
22         X3_0  0.094017              1.120278   -1.242878  1.042937   
57         X3_3  0.867005              1.458723    0.347513  1.951978   
93         X3_4  0.532506              2.329384   -0.144856  3.622551   

    Lower_quartile_c  Middle_quartile_c  Upper_quartile_c  
0          -0.622326          -0.187624          0.145071  
1          -0.349209           0.025460          0.370055  
2           0.06

In [70]:
# _write all centroid rows to one text file, without index and without header
embeddings_filepath = os.getcwd() + '/../data/sub_sequence_output/word_embeddings_from_clusters.txt'
all_centroid_rows = pd.concat(centroid_statistic)
all_centroid_rows.to_csv(embeddings_filepath, index=False, header=False)

In [72]:
# _combine individual words as documents
doc_df['final_sub_sequence'] = doc_df[col_names[3:]].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
doc_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,X3,Y3,Z3,final_sub_sequence
0,101,17,X1_5,Y1_1,Z1_6,X2_5,Y2_3,Z2_9,X3_8,Y3_4,Z3_5,X1_5 Y1_1 Z1_6 X2_5 Y2_3 Z2_9 X3_8 Y3_4 Z3_5
1,101,17,X1_5,Y1_6,Z1_6,X2_5,Y2_9,Z2_9,X3_7,Y3_1,Z3_0,X1_5 Y1_6 Z1_6 X2_5 Y2_9 Z2_9 X3_7 Y3_1 Z3_0
2,101,17,X1_3,Y1_6,Z1_3,X2_2,Y2_9,Z2_5,X3_9,Y3_1,Z3_2,X1_3 Y1_6 Z1_3 X2_2 Y2_9 Z2_5 X3_9 Y3_1 Z3_2
3,101,17,X1_3,Y1_1,Z1_5,X2_2,Y2_3,Z2_6,X3_9,Y3_1,Z3_2,X1_3 Y1_1 Z1_5 X2_2 Y2_3 Z2_6 X3_9 Y3_1 Z3_2
4,101,17,X1_3,Y1_1,Z1_5,X2_2,Y2_3,Z2_6,X3_8,Y3_4,Z3_0,X1_3 Y1_1 Z1_5 X2_2 Y2_3 Z2_6 X3_8 Y3_4 Z3_0
...,...,...,...,...,...,...,...,...,...,...,...,...
921,101,16,X1_5,Y1_1,Z1_3,X2_5,Y2_3,Z2_9,X3_2,Y3_6,Z3_4,X1_5 Y1_1 Z1_3 X2_5 Y2_3 Z2_9 X3_2 Y3_6 Z3_4
922,101,16,X1_5,Y1_6,Z1_3,X2_5,Y2_9,Z2_4,X3_1,Y3_2,Z3_9,X1_5 Y1_6 Z1_3 X2_5 Y2_9 Z2_4 X3_1 Y3_2 Z3_9
923,101,16,X1_5,Y1_6,Z1_0,X2_5,Y2_9,Z2_0,X3_1,Y3_2,Z3_7,X1_5 Y1_6 Z1_0 X2_5 Y2_9 Z2_0 X3_1 Y3_2 Z3_7
924,101,16,X1_5,Y1_1,Z1_0,X2_5,Y2_3,Z2_0,X3_7,Y3_1,Z3_1,X1_5 Y1_1 Z1_0 X2_5 Y2_3 Z2_0 X3_7 Y3_1 Z3_1


In [74]:
# _save the combined values to text files
for subject in doc_df['subject_id'].unique():
    for activity in doc_df['activityID'].unique():
        output_filepath = os.getcwd() + f'/../data/documents/Different sensors/activity_subseq_' + str(subject) +'_'+ str(activity) + '.txt'
        doc_df.loc[((doc_df['activityID'] == activity) & (doc_df['subject_id'] == subject))][['final_sub_sequence']].to_csv(output_filepath, sep='\t', index=False, header= False)