In [49]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats, spatial
import statistics

import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
# _initializing constant variables
# _column names applied to the preprocessed CSV: subject/activity columns first,
# _then the X/Y/Z columns
col_names = [
    'subject_id', 'activityID',
    'X1', 'Y1', 'Z1',
    'X2', 'Y2', 'Z2',
]
# _the input path is built from the current working directory
input_file_path = ''.join((os.getcwd(), '/../data/output_csv/processed_data_test.csv'))

In [20]:
# _loading preprocessed data to main dataframe
# _the file is read with the explicit column names, then the subject/activity
# _columns are cast to int
id_dtypes = {'subject_id': int, 'activityID': int}
main_df = pd.read_csv(input_file_path, names=col_names).astype(id_dtypes)
main_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,5,1.041216,-0.269796,0.023780,0.437464,0.531349,0.136528
1,1,5,1.041803,-0.280025,0.076293,0.468264,0.721069,0.097622
2,1,5,1.039086,-0.292663,0.147475,0.498257,0.520328,0.083556
3,1,5,1.054768,-0.292384,0.139906,0.479396,0.372625,0.022861
4,1,5,1.028376,-0.285826,0.119934,0.389894,0.414541,-0.025939
...,...,...,...,...,...,...,...,...
377211,2947,2,0.908386,-0.423054,-0.092933,0.148295,-0.015923,0.109040
377212,2947,2,0.898984,-0.392272,-0.063138,0.143136,-0.024389,0.006547
377213,2947,2,0.918862,-0.351680,-0.072539,0.095931,-0.021024,-0.051342
377214,2947,2,0.949475,-0.267526,-0.050975,0.090708,-0.041893,-0.078877


# Generating subsequences for each sequence of the data

In [21]:
# _initializing variables
# _window_length: number of rows per window
# _window_overlap: amount added to the window start between consecutive windows
window_length, window_overlap = 40, 10
# _total number of rows available for windowing
max_window_index = main_df.shape[0]
sequence_names = col_names
num_of_subsequences = len(sequence_names)
# _one independent accumulator list per column
sub_sequences = [list() for _ in range(num_of_subsequences)]

In [22]:
# _slide a window of window_length rows over main_df, advancing window_overlap rows
# _per step, and keep only the windows whose rows all share one activityID

# _start from empty accumulators so that re-running this cell does not append a
# _second copy of every window to sub_sequences
sub_sequences = [[] for _ in range(num_of_subsequences)]

# _convert each column to a plain list once; slicing a list per window yields the
# _same values as slicing the dataframe column and calling tolist() every iteration
column_values = [main_df[name].tolist() for name in sequence_names]

for window_index in range(0, max_window_index - window_length + 1, window_overlap):
    window_end = window_index + window_length
    activity_sequence = column_values[1][window_index:window_end]

    # NOTE(review): only activityID is checked; a window can still span two
    # subject_ids that share an activity, in which case it is labelled with the
    # first subject_id of the window -- confirm this is intended.
    if len(set(activity_sequence)) != 1:
        continue

    sub_sequences[0].append(column_values[0][window_index])
    sub_sequences[1].append(activity_sequence[0])
    for idx in range(2, num_of_subsequences):
        sub_sequences[idx].append(column_values[idx][window_index:window_end])

# _converting into numpy arrays
np_sequences = np.asarray(sub_sequences[2:])
print(np_sequences.shape)

(6, 37261, 40)


In [23]:
# _document frame: one row per kept window; only the subject/activity labels are
# _filled in here, the remaining columns are populated by later cells
doc_df = pd.DataFrame(columns=col_names).assign(
    subject_id=sub_sequences[0],
    activityID=sub_sequences[1],
)
doc_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,5,,,,,,
1,1,5,,,,,,
2,1,5,,,,,,
3,1,5,,,,,,
4,1,5,,,,,,
...,...,...,...,...,...,...,...,...
37256,2947,2,,,,,,
37257,2947,2,,,,,,
37258,2947,2,,,,,,
37259,2947,2,,,,,,


# Finding the statistics of the subsequences (mean, standard deviation, skewness, IQR)


In [24]:
def subsequence_statistics(n, sequences=None):
    """Compute summary statistics for every subsequence of one channel.

    Parameters
    ----------
    n : int
        Position of the channel to summarise (index into ``sequences``).
    sequences : array-like, optional
        Container where ``sequences[n]`` is a 2-D block holding one
        subsequence per row. Defaults to the notebook-level ``np_sequences``.

    Returns
    -------
    pandas.DataFrame
        One row per subsequence with the columns ``Mean``,
        ``Standard_deviation`` (sample standard deviation, ``ddof=1``),
        ``Skewness`` (``scipy.stats.skew`` with its default settings) and
        ``IQR`` (75th minus 25th percentile). An empty frame with these
        columns is returned when the channel holds no subsequences.

    Raises
    ------
    ValueError
        If ``sequences[n]`` is non-empty but not two-dimensional.
    """
    columns = ['Mean', 'Standard_deviation', 'Skewness', 'IQR']
    if sequences is None:
        sequences = np_sequences

    subsequences = np.asarray(sequences[n], dtype=float)
    if len(subsequences) == 0:
        return pd.DataFrame(columns=columns)
    if subsequences.ndim != 2:
        raise ValueError(
            'expected a 2-D block of subsequences, got shape %s' % (subsequences.shape,)
        )

    # _every statistic is reduced along axis 1 (within a subsequence) in one
    # _vectorised call instead of a Python loop over the rows
    upper_quartile, lower_quartile = np.percentile(subsequences, [75, 25], axis=1)

    return pd.DataFrame(
        {
            'Mean': subsequences.mean(axis=1),
            # ddof=1 -> sample standard deviation, matching statistics.stdev
            'Standard_deviation': subsequences.std(axis=1, ddof=1),
            'Skewness': stats.skew(subsequences, axis=1),
            'IQR': upper_quartile - lower_quartile,
        },
        columns=columns,
    )

In [None]:
# _compute the statistics frame of every channel in np_sequences,
# _printing the channel index after each one as a progress marker
statistics_list = []
for channel_idx in range(np_sequences.shape[0]):
    statistics_list.append(subsequence_statistics(channel_idx))
    print(channel_idx)

In [9]:
#assigning words for each cluster
def get_assigned_words(seq_clusters, cluster_words, axis):
    """Replace cluster ids with their words and store them in ``doc_df[axis]``.

    seq_clusters  : array of cluster ids, compared element-wise against
                    0 .. len(cluster_words) - 1
    cluster_words : sequence of words, indexed by cluster id
    axis          : name of the ``doc_df`` column that receives the words

    Returns a one-column DataFrame (``cluster_word``) holding the same words.
    """
    # _cluster 0 first: matching entries get its word, all others keep their id for now
    words_so_far = np.where(seq_clusters != 0, seq_clusters, cluster_words[0])

    # _remaining clusters: overwrite only the positions whose id equals the cluster number
    for cluster_id in range(1, len(cluster_words)):
        word = cluster_words[cluster_id]
        words_so_far = np.where(seq_clusters != cluster_id, words_so_far, word)

    # _side effect: the notebook-level doc_df receives the words as column `axis`
    doc_df[axis] = words_so_far

    return pd.DataFrame(data=words_so_far, columns=['cluster_word'])

In [10]:
#generating names for cluster count
def generate_cluster_names(sequence_names, cluster_cnt=100):
    """Build the word list for every sequence name.

    Each name maps to ``[<name>_0, <name>_1, ..., <name>_<cluster_cnt - 1>]``.

    sequence_names : iterable of string names
    cluster_cnt    : number of words generated per name (default 100)

    Returns a dict keyed by sequence name, in iteration order.
    """
    return {
        name: [name + '_' + str(cluster_id) for cluster_id in range(cluster_cnt)]
        for name in sequence_names
    }

In [45]:
# _number of words per channel and the matching word lists
cluster_cnts = 250
words_dict = generate_cluster_names(sequence_names[2:], cluster_cnts)
sequence_cluster_cnts = {name: cluster_cnts for name in words_dict}

# _load the comma-separated array from the embeddings file and cut it into
# _consecutive blocks of cluster_cnts rows, one block per channel column
embeddings_filepath = ''.join(
    (os.getcwd(), '/../data/sub_sequence_output/word_embeddings_from_clusters.txt')
)
embeddings_list = np.loadtxt(embeddings_filepath, delimiter=',')
channel_names = col_names[2:]
channel_embeddings = [
    embeddings_list[start:start + cluster_cnts]
    for start in range(0, cluster_cnts * len(channel_names), cluster_cnts)
]

# _channel name -> its block of rows
channel_embeddings_dict = dict(zip(channel_names, channel_embeddings))

In [48]:
# _sanity check: every channel must have exactly one embedding row per word
for channel_name in col_names[2:]:
    word_count = len(words_dict[channel_name])
    vector_count = len(channel_embeddings_dict[channel_name])
    assert word_count == vector_count

In [57]:
# _map every subsequence to the word of its nearest embedding row, channel by channel
predicted_clusters = []
for statistic_df, axis in zip(statistics_list, col_names[2:]):

    cluster_names = words_dict[axis]

    # _build the tree once per channel and answer all rows in a single batched
    # _query; this returns the same nearest-neighbour indices as querying row by
    # _row, without one Python-level call per subsequence
    tree = spatial.KDTree(channel_embeddings_dict[axis])
    _, nearest_indices = tree.query(statistic_df.values)

    predicted_clusters.append([cluster_names[i] for i in nearest_indices])

In [58]:
# _stack the per-channel word lists into a single array and show its dimensions
predicted_clusters = np.asarray(predicted_clusters)
np.shape(predicted_clusters)

(6, 37261)

In [59]:
# _write each row of predicted_clusters into the doc_df column of the same position
axis_columns = col_names[2:]
for row_number, axis_column in enumerate(axis_columns):
    doc_df[axis_column] = predicted_clusters[row_number]

In [60]:
# _combine individual words as documents
def _join_words(row):
    """Join the string form of one row's values with single spaces."""
    return ' '.join(row.values.astype(str))


# _one space-separated string per row, built from the channel columns only
doc_df['final_sub_sequence'] = doc_df[col_names[2:]].apply(_join_words, axis=1)
doc_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,final_sub_sequence
0,1,5,X1_25,Y1_84,Z1_134,X2_196,Y2_49,Z2_161,X1_25 Y1_84 Z1_134 X2_196 Y2_49 Z2_161
1,1,5,X1_5,Y1_156,Z1_50,X2_247,Y2_27,Z2_129,X1_5 Y1_156 Z1_50 X2_247 Y2_27 Z2_129
2,1,5,X1_245,Y1_171,Z1_195,X2_196,Y2_142,Z2_161,X1_245 Y1_171 Z1_195 X2_196 Y2_142 Z2_161
3,1,5,X1_216,Y1_91,Z1_238,X2_167,Y2_29,Z2_102,X1_216 Y1_91 Z1_238 X2_167 Y2_29 Z2_102
4,1,5,X1_121,Y1_171,Z1_9,X2_220,Y2_99,Z2_32,X1_121 Y1_171 Z1_9 X2_220 Y2_99 Z2_32
...,...,...,...,...,...,...,...,...,...
37256,2947,2,X1_14,Y1_56,Z1_146,X2_179,Y2_168,Z2_105,X1_14 Y1_56 Z1_146 X2_179 Y2_168 Z2_105
37257,2947,2,X1_62,Y1_237,Z1_135,X2_59,Y2_8,Z2_35,X1_62 Y1_237 Z1_135 X2_59 Y2_8 Z2_35
37258,2947,2,X1_124,Y1_55,Z1_217,X2_59,Y2_172,Z2_214,X1_124 Y1_55 Z1_217 X2_59 Y2_172 Z2_214
37259,2947,2,X1_162,Y1_161,Z1_174,X2_179,Y2_195,Z2_245,X1_162 Y1_161 Z1_174 X2_179 Y2_195 Z2_245


In [61]:
# _save the combined values to text files (one file per subject_id)
# _a single groupby pass replaces filtering the whole frame twice per subject;
# _sort=False keeps subjects in order of first appearance, as unique() did
for subject, subject_rows in doc_df.groupby('subject_id', sort=False):
    # _the file name carries the activityID of the subject's first row
    activity = subject_rows['activityID'].values[0]
    output_filepath = os.getcwd() + '/../data/documents/test/activity_subseq_' + str(subject) + '_' + str(activity) + '.txt'
    # _one document line per row, no index and no header
    subject_rows[['final_sub_sequence']].to_csv(output_filepath, sep='\t', index=False, header=False)