In [1]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats 
import statistics

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _notebook-wide constants: column layout of the input CSV and where that CSV lives
col_names = ['subject_id', 'activityID', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']
input_file_path = os.getcwd() + '/../data/output_csv/processed_data_train.csv'

In [3]:
# _read the preprocessed CSV using the explicit column names, then force both id columns to int
main_df = (
    pd.read_csv(input_file_path, names=col_names)
    .astype({'subject_id': int, 'activityID': int})
)
main_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,5,1.012817,-0.123217,0.102934,0.030191,0.066014,0.022859
1,1,5,1.022833,-0.126876,0.105687,0.043711,0.042699,0.010316
2,1,5,1.022028,-0.124004,0.102102,0.035688,0.074850,0.013250
3,1,5,1.017877,-0.124928,0.106553,0.040402,0.057320,0.017751
4,1,5,1.023680,-0.125767,0.102814,0.047097,0.052343,0.002553
...,...,...,...,...,...,...,...,...
941051,7352,2,0.991497,-0.486260,-0.205803,0.973228,1.083094,-0.226884
941052,7352,2,0.945067,-0.453405,-0.180733,1.004266,1.187832,-0.313591
941053,7352,2,0.898095,-0.397775,-0.156105,1.004855,1.156645,-0.362512
941054,7352,2,0.828372,-0.349247,-0.122798,1.015589,1.100750,-0.383989


# Generating subsequences for each sequence of the data

In [4]:
# _windowing setup
sequence_names = col_names
num_of_subsequences = len(sequence_names)
max_window_index = len(main_df.index)

# _every window spans `window_length` rows; `window_overlap` is the amount
# _added to the window start after each window
window_length, window_overlap = 40, 10

# _one independent accumulator list per entry of sequence_names
sub_sequences = [[] for _ in sequence_names]

In [5]:
# _slide a window of `window_length` rows over main_df and keep every window whose
# _activity label is constant; the window start advances by `window_overlap` rows.
# NOTE(review): only the activity column is checked for homogeneity, so a kept window
# may still span two subject_ids (it is filed under the subject of its first row) --
# confirm this is intended.

# _extract each column once as a numpy array instead of re-slicing pandas Series
# _for every window and every column
activity_values = main_df[sequence_names[1]].to_numpy()
subject_values = main_df[sequence_names[0]].to_numpy()
signal_values = [main_df[name].to_numpy() for name in sequence_names[2:]]

window_index = 0

while window_index <= (max_window_index - window_length):

    window_end = window_index + window_length
    activity_window = activity_values[window_index:window_end]

    # _a window is usable only when all of its rows share a single activity label
    if len(activity_window) > 0 and (activity_window == activity_window[0]).all():
        # _.item() yields plain Python scalars, like the .tolist() calls it replaces
        sub_sequences[1].append(activity_window[0].item())
        sub_sequences[0].append(subject_values[window_index].item())

        # _signal columns start at position 2 of sequence_names / sub_sequences
        for idx, values in enumerate(signal_values, start=2):
            sub_sequences[idx].append(values[window_index:window_end].tolist())

    window_index += window_overlap

# _converting into numpy arrays
np_sequences = np.asarray(sub_sequences[2:])
print(np_sequences.shape)

(6, 93051, 40)


In [6]:
# _document frame: ids are filled in now, the remaining col_names columns stay empty for the moment
doc_df = pd.DataFrame(columns=col_names).assign(
    subject_id=sub_sequences[0],
    activityID=sub_sequences[1],
)
doc_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2
0,1,5,,,,,,
1,1,5,,,,,,
2,1,5,,,,,,
3,1,5,,,,,,
4,1,5,,,,,,
...,...,...,...,...,...,...,...,...
93046,7352,2,,,,,,
93047,7352,2,,,,,,
93048,7352,2,,,,,,
93049,7352,2,,,,,,


# Finding the statistics of the subsequences (mean, standard deviation, skewness, IQR)


In [7]:
def subsequence_statistics(n):
    """Compute four descriptive statistics for every subsequence of one axis.

    Reads the module-level ``np_sequences`` array and treats each row of
    ``np_sequences[n]`` as one subsequence. All statistics are computed along
    axis 1 in a single vectorised pass rather than one Python-level call per
    subsequence.

    Parameters
    ----------
    n : int
        Index into the first dimension of ``np_sequences``.

    Returns
    -------
    pandas.DataFrame
        One row per subsequence with the columns ``Mean``,
        ``Standard_deviation`` (sample standard deviation, ``ddof=1``, the
        same estimator as ``statistics.stdev``), ``Skewness``
        (``scipy.stats.skew`` with its default settings) and ``IQR``
        (75th percentile minus 25th percentile).

    Raises
    ------
    statistics.StatisticsError
        If the subsequences hold fewer than two values each, for which the
        sample standard deviation is undefined.
    """
    column_names = ['Mean', 'Standard_deviation', 'Skewness', 'IQR']
    subsequences = np.asarray(np_sequences[n], dtype=float)

    # _no subsequences at all: return an empty frame with the expected columns
    if subsequences.size == 0:
        return pd.DataFrame(columns=column_names)

    # _a single subsequence may arrive as a flat vector; treat it as one row
    subsequences = np.atleast_2d(subsequences)

    if subsequences.shape[1] < 2:
        raise statistics.StatisticsError('stdev requires at least two data points')

    upper_quartile, lower_quartile = np.percentile(subsequences, [75, 25], axis=1)

    statistic_feature_df = pd.DataFrame({
        'Mean': subsequences.mean(axis=1),
        'Standard_deviation': subsequences.std(axis=1, ddof=1),
        'Skewness': stats.skew(subsequences, axis=1),
        'IQR': upper_quartile - lower_quartile,
    }, columns=column_names)

    return statistic_feature_df

In [None]:
# _one statistics frame per entry along the first dimension of np_sequences
statistics_list = []
for idx, _ in enumerate(np_sequences):
    statistics_list.append(subsequence_statistics(idx))
    print(idx)

In [9]:
#assigning words for each cluster
def get_assigned_words(seq_clusters, cluster_words, axis):
    """Translate integer cluster ids into their cluster words.

    Side effect: the resulting words are also written to the module-level
    ``doc_df`` under the column named ``axis``.

    Parameters
    ----------
    seq_clusters : numpy.ndarray
        Integer cluster id of every subsequence.
    cluster_words : sequence of str
        ``cluster_words[i]`` is the word assigned to cluster id ``i``.
    axis : str
        Name of the ``doc_df`` column that receives the words.

    Returns
    -------
    pandas.DataFrame
        A single column ``cluster_word`` holding the word of each subsequence.

    Raises
    ------
    ValueError
        If a cluster id has no corresponding entry in ``cluster_words``.
    """
    cluster_ids = np.asarray(seq_clusters)
    vocabulary = np.asarray(cluster_words)

    # _fail loudly instead of letting an unmapped id leak into the documents
    if cluster_ids.size and (cluster_ids.min() < 0 or cluster_ids.max() >= len(vocabulary)):
        raise ValueError(
            'cluster ids must lie in [0, %d); got range [%d, %d]'
            % (len(vocabulary), cluster_ids.min(), cluster_ids.max())
        )

    # _a single indexed lookup replaces one full np.where pass per cluster word
    assigned_words = vocabulary[cluster_ids]

    doc_df[axis] = assigned_words

    assigned_clusterWord = pd.DataFrame(data=assigned_words, columns=['cluster_word'])

    return assigned_clusterWord

In [10]:
#generating names for cluster count
def generate_cluster_names(sequence_names, cluster_cnt=100):
    """Build the cluster vocabulary of every sequence.

    Each name in ``sequence_names`` is mapped to the list
    ``['<name>_0', '<name>_1', ..., '<name>_<cluster_cnt - 1>']``.
    """
    return {
        name: ['_'.join((name, str(cluster_id))) for cluster_id in range(cluster_cnt)]
        for name in sequence_names
    }

In [11]:
# _number of clusters (and therefore words) generated per signal column
cluster_cnts = 250

# _vocabulary per signal column; the two id columns at the front of sequence_names are skipped
words_dict = generate_cluster_names(sequence_names[2:], cluster_cnts)

# _every column in the vocabulary gets the same cluster count
sequence_cluster_cnts = {name: cluster_cnts for name in words_dict}

In [12]:
def clustering(statistic_df, axis, random_state=0):
    """Cluster the statistics of one axis with KMeans and name each cluster.

    Uses the module-level ``cluster_cnts`` as the number of clusters and the
    module-level ``words_dict[axis]`` as the cluster vocabulary. Through
    ``get_assigned_words`` the word of every subsequence is also written to
    ``doc_df[axis]``.

    Parameters
    ----------
    statistic_df : pandas.DataFrame
        One row of statistics per subsequence. The centroid columns are
        labelled with four fixed names, so four feature columns are expected.
    axis : str
        Key into ``words_dict`` and name of the ``doc_df`` column to fill.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans``. KMeans initialisation is random, so an
        unseeded run produces different cluster words and centroids each
        time; a fixed default keeps reruns reproducible. Pass ``None`` to get
        the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The distinct ``cluster_word`` values that were assigned, each with its
        centroid in the columns ``Mean_c``, ``Standard_deviation_c``,
        ``Skewness_c`` and ``IQR_c``.
    """
    model = KMeans(n_clusters=cluster_cnts, random_state=random_state).fit(statistic_df)
    cluster_ids = pd.DataFrame(model.predict(statistic_df), columns=['cluster ID'])
    cluster_words = words_dict[axis][:cluster_cnts]
    seq_clusters = cluster_ids.to_numpy()
    assigned_clusterWord = get_assigned_words(seq_clusters, cluster_words, axis)

    # _look up the centroid of every subsequence's cluster (one row per subsequence)
    centroids_of_clusters = pd.DataFrame(model.cluster_centers_[cluster_ids['cluster ID']],
                     columns=['Mean_c','Standard_deviation_c','Skewness_c','IQR_c'])
    result = pd.concat([assigned_clusterWord, centroids_of_clusters], axis=1)
    # _collapse the repeated (word, centroid) rows to a single row each
    result = result.drop_duplicates()

    return result

In [13]:
def cluster_word_sort(axis_clusters,cluster_names):
    """Return the rows of ``axis_clusters`` whose ``cluster_word`` equals ``cluster_names``.

    The first column of the frame is dropped from the returned rows.
    """
    is_requested_word = axis_clusters['cluster_word'] == cluster_names
    matching_rows = axis_clusters[is_requested_word]
    return matching_rows.iloc[:, 1:]
    

In [None]:
clusters_centroid = []
centroid_statistic = []

for axis, statistic_df in zip(col_names[2:], statistics_list):

    # _cluster this axis and remember its (word, centroid) table
    axis_clusters = clustering(statistic_df, axis)
    clusters_centroid.append(axis_clusters)

    # _collect the centroid rows word by word, in the order the words are stored in words_dict
    centroid_statistic.extend(
        cluster_word_sort(axis_clusters, word) for word in words_dict[axis]
    )

In [15]:
# _write all collected centroid rows to one text file, without header or index
embeddings_filepath = os.getcwd() + '/../data/sub_sequence_output/word_embeddings_from_clusters.txt'
pd.concat(centroid_statistic).to_csv(
    embeddings_filepath,
    header=False,
    index=False,
)

In [16]:
# _join the per-column words of every row into one space-separated document string
doc_df['final_sub_sequence'] = doc_df.loc[:, col_names[2:]].apply(
    lambda words: ' '.join(words.values.astype(str)),
    axis=1,
)
doc_df

Unnamed: 0,subject_id,activityID,X1,Y1,Z1,X2,Y2,Z2,final_sub_sequence
0,1,5,X1_241,Y1_210,Z1_50,X2_6,Y2_12,Z2_44,X1_241 Y1_210 Z1_50 X2_6 Y2_12 Z2_44
1,1,5,X1_18,Y1_178,Z1_111,X2_28,Y2_12,Z2_98,X1_18 Y1_178 Z1_111 X2_28 Y2_12 Z2_98
2,1,5,X1_241,Y1_10,Z1_100,X2_202,Y2_210,Z2_102,X1_241 Y1_10 Z1_100 X2_202 Y2_210 Z2_102
3,1,5,X1_5,Y1_95,Z1_50,X2_67,Y2_79,Z2_102,X1_5 Y1_95 Z1_50 X2_67 Y2_79 Z2_102
4,1,5,X1_245,Y1_20,Z1_9,X2_12,Y2_138,Z2_45,X1_245 Y1_20 Z1_9 X2_12 Y2_138 Z2_45
...,...,...,...,...,...,...,...,...,...
93046,7352,2,X1_238,Y1_205,Z1_62,X2_115,Y2_135,Z2_113,X1_238 Y1_205 Z1_62 X2_115 Y2_135 Z2_113
93047,7352,2,X1_114,Y1_162,Z1_207,X2_166,Y2_162,Z2_181,X1_114 Y1_162 Z1_207 X2_166 Y2_162 Z2_181
93048,7352,2,X1_146,Y1_205,Z1_176,X2_147,Y2_148,Z2_84,X1_146 Y1_205 Z1_176 X2_147 Y2_148 Z2_84
93049,7352,2,X1_170,Y1_204,Z1_160,X2_2,Y2_219,Z2_110,X1_170 Y1_204 Z1_160 X2_2 Y2_219 Z2_110


In [17]:
# _save the combined values to text files: one file per subject_id, one document per line
# _a single groupby pass replaces building two full-frame boolean masks per subject;
# _sort=False keeps the subjects in order of first appearance, as .unique() did
for subject, subject_rows in doc_df.groupby('subject_id', sort=False):
    # _the activity in the file name is the one on the subject's first row
    activity = subject_rows['activityID'].values[0]
    output_filepath = os.getcwd() + f'/../data/documents/train/activity_subseq_' + str(subject) +'_'+ str(activity) + '.txt'
    subject_rows[['final_sub_sequence']].to_csv(output_filepath, sep='\t', index=False, header= False)