In [11]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats 


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _initializing constant variables
input_file_path = os.getcwd() + f'/../data/output_csv/processed_data.csv'
col_names = ['timestamp (s)', 'activityID', 'X1', 'Y1', 'Z1', 'X2', 'Y2', 'Z2']

In [3]:
# _loading preprocessed data to main dataframe
main_df = pd.read_csv(input_file_path,names=col_names)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,2.21530,8.27915,5.58753,-0.004750,0.037579,-0.011145
1,37.67,1,2.29196,7.67288,5.74467,-0.171710,0.025479,-0.009538
2,37.68,1,2.29090,7.14240,5.82342,-0.238241,0.011214,0.000831
3,37.69,1,2.21800,7.14365,5.89930,-0.192912,0.019053,0.013374
4,37.70,1,2.30106,7.25857,6.09259,-0.069961,-0.018328,0.004582
...,...,...,...,...,...,...,...,...
1564137,3409.07,7,-1.68428,-8.97338,3.43203,-0.231392,-0.391747,0.180935
1564138,3409.08,7,-1.72527,-9.04964,3.35469,-0.252115,-0.338597,0.180709
1564139,3409.09,7,-1.53312,-8.97455,3.43429,-0.176675,-0.311570,0.172539
1564140,3409.10,7,-1.50362,-9.01479,3.20395,-0.173602,-0.291495,0.170721


# Generating subsequences for each sequence of the data

In [4]:
# _initializing variables
window_length = 10
window_overlap = 5
max_window_index = len(main_df.index)
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)
sub_sequences = [[] for x in range(num_of_subsequences)]

In [5]:
window_index = 0

while window_index <= (max_window_index - window_length):

    activity_sequence = main_df[sequence_names[0]][window_index:window_index+window_length].tolist()

    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])
        
        for idx in range(1, num_of_subsequences):
            sub_sequences[idx].append(main_df[sequence_names[idx]][window_index:window_index+window_length].tolist())

    window_index += window_overlap

# _converting into numpy arrays
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)



(6, 312714, 10)


# Finding the statistics of the subsequences(mean,variance,skewness,IQR)


In [9]:
def subsequence_statistics(n):
    subsequences=np_sequences[n]
    Mean=[]
    Variance=[]
    Skewness=[]
    IQR=[]
    for i in range(0,len(subsequences)):
        mean=sum(subsequences[i])/len(subsequences[i])
        Mean.append(mean)
        variance = sum((x-mean)**2 for x in subsequences[i]) / len(subsequences[i])
        Variance.append(variance)
        skewness=stats.skew(subsequences[i])
        Skewness.append(skewness)
        q3, q1 = np.percentile(subsequences[i], [75 ,25])
        iqr = q3 - q1
        IQR.append(iqr)
    return Mean,Variance,Skewness,IQR
    

In [10]:
Mean,Variance,Skewness,IQR=subsequence_statistics(0)
data = list(zip(Mean,Variance,Skewness,IQR))
X1_statistic_df=pd.DataFrame(data,columns=['mean','variance','skewness','IQR'])

In [12]:
X1_statistic_df

Unnamed: 0,mean,variance,skewness,IQR
0,2.252053,0.008409,-0.336519,0.082810
1,2.275643,0.012021,-0.274613,0.133798
2,2.255907,0.007474,0.640422,0.057278
3,2.209783,0.006058,0.898379,0.096855
4,2.214889,0.008036,0.179849,0.106895
...,...,...,...,...
312709,-1.183654,0.036115,-0.337586,0.277469
312710,-1.225540,0.024752,0.742694,0.211905
312711,-1.427101,0.017851,-0.946551,0.152780
312712,-1.569999,0.012696,0.620479,0.112880


In [13]:
Mean1,Variance1,Skewness1,IQR1=subsequence_statistics(0)
data1 = list(zip(Mean1,Variance1,Skewness1,IQR1))
Y1_statistic_df=pd.DataFrame(data1,columns=['mean','variance','skewness','IQR'])

In [14]:
Y1_statistic_df

Unnamed: 0,mean,variance,skewness,IQR
0,2.252053,0.008409,-0.336519,0.082810
1,2.275643,0.012021,-0.274613,0.133798
2,2.255907,0.007474,0.640422,0.057278
3,2.209783,0.006058,0.898379,0.096855
4,2.214889,0.008036,0.179849,0.106895
...,...,...,...,...
312709,-1.183654,0.036115,-0.337586,0.277469
312710,-1.225540,0.024752,0.742694,0.211905
312711,-1.427101,0.017851,-0.946551,0.152780
312712,-1.569999,0.012696,0.620479,0.112880


In [15]:
Mean2,Variance2,Skewness2,IQR2=subsequence_statistics(0)
data2 = list(zip(Mean2,Variance2,Skewness2,IQR2))
Z1_statistic_df=pd.DataFrame(data2,columns=['mean','variance','skewness','IQR'])

In [16]:
Z1_statistic_df

Unnamed: 0,mean,variance,skewness,IQR
0,2.252053,0.008409,-0.336519,0.082810
1,2.275643,0.012021,-0.274613,0.133798
2,2.255907,0.007474,0.640422,0.057278
3,2.209783,0.006058,0.898379,0.096855
4,2.214889,0.008036,0.179849,0.106895
...,...,...,...,...
312709,-1.183654,0.036115,-0.337586,0.277469
312710,-1.225540,0.024752,0.742694,0.211905
312711,-1.427101,0.017851,-0.946551,0.152780
312712,-1.569999,0.012696,0.620479,0.112880


In [17]:
Mean3,Variance3,Skewness3,IQR3=subsequence_statistics(0)
data3 = list(zip(Mean3,Variance3,Skewness3,IQR3))
X2_statistic_df=pd.DataFrame(data3,columns=['mean','variance','skewness','IQR'])

In [18]:
X2_statistic_df

Unnamed: 0,mean,variance,skewness,IQR
0,2.252053,0.008409,-0.336519,0.082810
1,2.275643,0.012021,-0.274613,0.133798
2,2.255907,0.007474,0.640422,0.057278
3,2.209783,0.006058,0.898379,0.096855
4,2.214889,0.008036,0.179849,0.106895
...,...,...,...,...
312709,-1.183654,0.036115,-0.337586,0.277469
312710,-1.225540,0.024752,0.742694,0.211905
312711,-1.427101,0.017851,-0.946551,0.152780
312712,-1.569999,0.012696,0.620479,0.112880


In [19]:
Mean4,Variance4,Skewness4,IQR4=subsequence_statistics(0)
data4 = list(zip(Mean1,Variance1,Skewness1,IQR1))
Y2_statistic_df=pd.DataFrame(data1,columns=['mean','variance','skewness','IQR'])

In [20]:
Y2_statistic_df

Unnamed: 0,mean,variance,skewness,IQR
0,2.252053,0.008409,-0.336519,0.082810
1,2.275643,0.012021,-0.274613,0.133798
2,2.255907,0.007474,0.640422,0.057278
3,2.209783,0.006058,0.898379,0.096855
4,2.214889,0.008036,0.179849,0.106895
...,...,...,...,...
312709,-1.183654,0.036115,-0.337586,0.277469
312710,-1.225540,0.024752,0.742694,0.211905
312711,-1.427101,0.017851,-0.946551,0.152780
312712,-1.569999,0.012696,0.620479,0.112880


In [21]:
Mean5,Variance5,Skewness5,IQR5=subsequence_statistics(0)
data5 = list(zip(Mean5,Variance5,Skewness5,IQR5))
Z2_statistic_df=pd.DataFrame(data5,columns=['mean','variance','skewness','IQR'])

In [22]:
Z2_statistic_df

Unnamed: 0,mean,variance,skewness,IQR
0,2.252053,0.008409,-0.336519,0.082810
1,2.275643,0.012021,-0.274613,0.133798
2,2.255907,0.007474,0.640422,0.057278
3,2.209783,0.006058,0.898379,0.096855
4,2.214889,0.008036,0.179849,0.106895
...,...,...,...,...
312709,-1.183654,0.036115,-0.337586,0.277469
312710,-1.225540,0.024752,0.742694,0.211905
312711,-1.427101,0.017851,-0.946551,0.152780
312712,-1.569999,0.012696,0.620479,0.112880
